From d21042662089e6215cb7d0bc3df73263c2cff5e8 Mon Sep 17 00:00:00 2001 From: Augusto Herrmann Date: Wed, 21 Aug 2024 17:00:07 -0300 Subject: [PATCH 01/30] Reorder imports, standard library first --- tests/test_validate_yaml_schemas.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/test_validate_yaml_schemas.py b/tests/test_validate_yaml_schemas.py index 4cd91fc..16147e4 100644 --- a/tests/test_validate_yaml_schemas.py +++ b/tests/test_validate_yaml_schemas.py @@ -1,10 +1,11 @@ +import glob import json +from urllib.parse import urlparse +import yaml + import jsonschema import pytest -import glob -import yaml import requests -from urllib.parse import urlparse YAMLS_DIR = "../dags/ro_dou/dag_confs" @@ -32,6 +33,7 @@ def _is_valid_url(url): SCHEMA = get_schema(SCHEMA_FILEPATH) + @pytest.mark.parametrize( "data_file", [ From 5991071092d00eb6f3db626b601d3baf7c140a58 Mon Sep 17 00:00:00 2001 From: Augusto Herrmann Date: Wed, 21 Aug 2024 17:00:50 -0300 Subject: [PATCH 02/30] Remove unused import --- tests/parsers_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/parsers_test.py b/tests/parsers_test.py index 442f1ed..daa0681 100644 --- a/tests/parsers_test.py +++ b/tests/parsers_test.py @@ -5,7 +5,7 @@ import sys import inspect import textwrap -import yaml + import pytest currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) From 05b77f556c45ded060cff47bd3421d0af68fc9db Mon Sep 17 00:00:00 2001 From: Augusto Herrmann Date: Wed, 21 Aug 2024 17:01:47 -0300 Subject: [PATCH 03/30] Add module docstring --- tests/test_validate_yaml_schemas.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_validate_yaml_schemas.py b/tests/test_validate_yaml_schemas.py index 16147e4..2c1c08c 100644 --- a/tests/test_validate_yaml_schemas.py +++ b/tests/test_validate_yaml_schemas.py @@ -1,3 +1,6 @@ +"""Test validation of yaml files according to the defined schemas. 
+""" + import glob import json from urllib.parse import urlparse From f651f0be1fd551ec5d7a8b2cc908ea0e62222d1f Mon Sep 17 00:00:00 2001 From: Augusto Herrmann Date: Wed, 21 Aug 2024 18:09:24 -0300 Subject: [PATCH 04/30] Create Pydantic schemas, migrate the tests that validate yaml files Co-authored-by: Gustavo --- src/schemas.py | 136 ++++++++++++++++++++++++++++ tests/test_validate_yaml_schemas.py | 42 +++------ 2 files changed, 148 insertions(+), 30 deletions(-) create mode 100644 src/schemas.py diff --git a/src/schemas.py b/src/schemas.py new file mode 100644 index 0000000..6a88809 --- /dev/null +++ b/src/schemas.py @@ -0,0 +1,136 @@ +""" +This module defines the Pydantic models for validating the structure of +the YAML files used in the application. + +The main classes are: + +- `SearchTerms`: search terms in the YAML file. +- `Search`: search configuration in the YAML file. +- `Report`: report configuration in the YAML file. +- `DAG`: DAG defined in the YAML file. +- `Config`: overall configuration in the YAML file. + +These models are used to validate the YAML files using the Pydantic +library. 
+""" + +from typing import List, Optional, Union +from pydantic import AnyHttpUrl, BaseModel, EmailStr, Field + + +class DBSelect(BaseModel): + """Represents the structure of the 'from_db_select' field in the YAML file.""" + + sql: str = Field(description="SQL query to fetch the search terms") + conn_id: str = Field(description="Airflow connection ID to use for the SQL query") + + +class SearchTerms(BaseModel): + """Represents the search terms in the YAML file.""" + + from_airflow_variable: Optional[str] = Field( + default=None, + description="Variável do Airflow a ser usada como termos de pesquisa", + ) + from_db_select: Optional[DBSelect] = Field( + default=None, + description="Consulta SQL para buscar os termos de pesquisa em um " + "banco de dados", + ) + + +class SearchField(BaseModel): + """Represents the field for search in the YAML file.""" + + description: str + value: str + + +class Search(BaseModel): + """Represents the search configuration in the YAML file.""" + + header: Optional[str] = Field( + default=None, description="Cabeçalho da consulta de pesquisa" + ) + sources: Optional[List[str]] = Field( + default=["DOU"], + description="Lista de fontes de dados para pesquisar (Querido Diário [QD], " + "Diário Oficial da União [DOU], INLABS). 
Default: DOU.", + ) + territory_id: Optional[int] = Field( + default=None, + description="ID do território no Querido Diário para filtragem " + "baseada em localização", + ) + terms: Union[List[str], SearchTerms] = Field( + description="Lista de termos de pesquisa ou uma forma de buscá-los" + ) + department: Optional[List[str]] = Field( + default=None, description="Lista de departamentos para filtrar a pesquisa" + ) + + +class Report(BaseModel): + """Represents the report configuration in the YAML file.""" + + slack: Optional[dict] = Field( + default=None, description="Configuração do webhook do Slack para relatórios" + ) + discord: Optional[dict] = Field( + default=None, description="Configuração do webhook do Discord para relatórios" + ) + emails: Optional[List[EmailStr]] = Field( + default=None, description="Lista de endereços de e-mail para enviar o relatório" + ) + attach_csv: Optional[bool] = Field( + default=None, + description="Se deve anexar um arquivo CSV com os resultados da pesquisa", + ) + subject: Optional[str] = Field( + default=None, description="Assunto do relatório por e-mail" + ) + skip_null: Optional[bool] = Field( + default=None, + description="Se deve pular a notificação de resultados nulos/vazios", + ) + hide_filters: Optional[bool] = Field( + default=None, description="Se deve ocultar os filtros aplicados no relatório" + ) + header_text: Optional[str] = Field( + default=None, description="Texto a ser incluído no cabeçalho do relatório" + ) + footer_text: Optional[str] = Field( + default=None, description="Texto a ser incluído no rodapé do relatório" + ) + no_results_found_text: Optional[str] = Field( + default=None, description="Texto a ser exibido quando não há resultados" + ) + + +class DAG(BaseModel): + """Represents the DAG configuration in the YAML file.""" + + id: str = Field(description="Nome único da DAG") + description: str = Field(description="Descrição da DAG") + tags: Optional[List[str]] = Field( + default=[], description="Lista 
de tags para filtragem da DAG no Airflow" + ) + owner: Optional[List[str]] = Field( + default=[], description="Lista de owners para filtragem da DAG no Airflow" + ) + schedule: Optional[str] = Field(default=None, description="Expressão cron") + dataset: Optional[str] = Field(default=None, description="Nome do Dataset") + search: Union[List[Search], Search] = Field( + description="Seção para definição da busca no Diário" + ) + doc_md: Optional[str] = Field(default="", description="description") + report: Report = Field( + description="Aceita: `slack`, `discord`, `emails`, `attach_csv`, " + "`subject`, `skip_null`" + ) + + +class Config(BaseModel): + """Represents the overall configuration in the YAML file.""" + + dag: DAG = Field(description="Instanciação da DAG") diff --git a/tests/test_validate_yaml_schemas.py b/tests/test_validate_yaml_schemas.py index 2c1c08c..0c0d335 100644 --- a/tests/test_validate_yaml_schemas.py +++ b/tests/test_validate_yaml_schemas.py @@ -2,39 +2,18 @@ """ import glob -import json -from urllib.parse import urlparse -import yaml +import os +import sys -import jsonschema +from pydantic import ValidationError import pytest -import requests +import yaml +# add module path so we can import from other modules +sys.path.insert(0, os.path.abspath(os.path.dirname(__file__))) +from schemas import Config YAMLS_DIR = "../dags/ro_dou/dag_confs" -SCHEMA_FILEPATH = "../schemas/ro-dou.json" -# or -# SCHEMA_FILEPATH = "https://raw.githubusercontent.com/gestaogovbr/Ro-dou/main/schemas/ro-dou.json" - - -def get_schema(filepath): - def _is_valid_url(url): - try: - result = urlparse(url) - return all([result.scheme, result.netloc]) - except ValueError: - return False - - if _is_valid_url(filepath): - response = requests.get(filepath) - response.raise_for_status() - return json.loads(response.text) - else: - with open(filepath) as f: - return json.load(f) - - -SCHEMA = get_schema(SCHEMA_FILEPATH) @pytest.mark.parametrize( @@ -45,8 +24,11 @@ def 
_is_valid_url(url): + glob.glob(f"{YAMLS_DIR}/**/*.yaml", recursive=True) ], ) -def test_json_schema_validation(data_file): +def test_pydantic_validation(data_file): with open(data_file) as data_fp: data = yaml.safe_load(data_fp) - jsonschema.validate(instance=data, schema=SCHEMA) + try: + Config(**data) + except ValidationError as e: + pytest.fail(f"YAML file {data_file} is not valid:\n{e}") From dbda6e643cbcea4a73a282bf2deb8c78d7a83b5b Mon Sep 17 00:00:00 2001 From: Augusto Herrmann Date: Thu, 22 Aug 2024 14:58:48 -0300 Subject: [PATCH 05/30] Reorder and remove unused imports --- src/parsers.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/parsers.py b/src/parsers.py index 269f2d5..456e523 100644 --- a/src/parsers.py +++ b/src/parsers.py @@ -1,13 +1,12 @@ """Abstract and concrete classes to parse DAG configuration from a file.""" -import ast -import os -import textwrap from abc import ABC, abstractmethod +import ast from dataclasses import dataclass -from typing import List, Set, Tuple, Union - +import textwrap +from typing import List, Set, Tuple import yaml + from airflow import Dataset from airflow.models import Variable From b515ce36e040fd3dfa8040c2473af1e39395d50c Mon Sep 17 00:00:00 2001 From: Augusto Herrmann Date: Thu, 22 Aug 2024 14:59:10 -0300 Subject: [PATCH 06/30] Apply black formatting --- src/parsers.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/parsers.py b/src/parsers.py index 456e523..d6df589 100644 --- a/src/parsers.py +++ b/src/parsers.py @@ -10,6 +10,7 @@ from airflow import Dataset from airflow.models import Variable + @dataclass class SearchConfig: header: str @@ -57,6 +58,8 @@ class FileParser(ABC): @abstractmethod def parse(self): pass + + class YAMLParser(FileParser): """Parses YAML file and get the DAG parameters. 
From e73c6560475572b92e4c39bc42926546ad9b84b4 Mon Sep 17 00:00:00 2001 From: Augusto Herrmann Date: Thu, 22 Aug 2024 15:00:04 -0300 Subject: [PATCH 07/30] Define method directly (remove unnecessary indirection) --- src/parsers.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/parsers.py b/src/parsers.py index d6df589..3c5dff0 100644 --- a/src/parsers.py +++ b/src/parsers.py @@ -71,9 +71,6 @@ def __init__(self, filepath: str): self.filepath = filepath def parse(self) -> DAGConfig: - return self._parse_yaml() - - def _parse_yaml(self) -> DAGConfig: """Processes the config file in order to instantiate the DAG in Airflow. """ From 26c2a3a7ecebaf7b5b837832cfa3c86e5c718159 Mon Sep 17 00:00:00 2001 From: Augusto Herrmann Date: Thu, 22 Aug 2024 16:41:21 -0300 Subject: [PATCH 08/30] Rename Pydantic models to disambiguate --- src/schemas.py | 14 +++++++------- tests/test_validate_yaml_schemas.py | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/schemas.py b/src/schemas.py index 6a88809..faadafd 100644 --- a/src/schemas.py +++ b/src/schemas.py @@ -46,7 +46,7 @@ class SearchField(BaseModel): value: str -class Search(BaseModel): +class SearchConfig(BaseModel): """Represents the search configuration in the YAML file.""" header: Optional[str] = Field( @@ -70,7 +70,7 @@ class Search(BaseModel): ) -class Report(BaseModel): +class ReportConfig(BaseModel): """Represents the report configuration in the YAML file.""" slack: Optional[dict] = Field( @@ -107,7 +107,7 @@ class Report(BaseModel): ) -class DAG(BaseModel): +class DAGConfig(BaseModel): """Represents the DAG configuration in the YAML file.""" id: str = Field(description="Nome único da DAG") @@ -120,17 +120,17 @@ class DAG(BaseModel): ) schedule: Optional[str] = Field(default=None, description="Expressão cron") dataset: Optional[str] = Field(default=None, description="Nome do Dataset") - search: Union[List[Search], Search] = Field( + search: Union[List[SearchConfig], SearchConfig] = Field( 
description="Seção para definição da busca no Diário" ) doc_md: Optional[str] = Field(default="", description="description") - report: Report = Field( + report: ReportConfig = Field( description="Aceita: `slack`, `discord`, `emails`, `attach_csv`, " "`subject`, `skip_null`" ) -class Config(BaseModel): +class RoDouConfig(BaseModel): """Represents the overall configuration in the YAML file.""" - dag: DAG = Field(description="Instanciação da DAG") + dag: DAGConfig = Field(description="Instanciação da DAG") diff --git a/tests/test_validate_yaml_schemas.py b/tests/test_validate_yaml_schemas.py index 0c0d335..fedfadd 100644 --- a/tests/test_validate_yaml_schemas.py +++ b/tests/test_validate_yaml_schemas.py @@ -11,7 +11,7 @@ # add module path so we can import from other modules sys.path.insert(0, os.path.abspath(os.path.dirname(__file__))) -from schemas import Config +from schemas import RoDouConfig YAMLS_DIR = "../dags/ro_dou/dag_confs" @@ -29,6 +29,6 @@ def test_pydantic_validation(data_file): data = yaml.safe_load(data_fp) try: - Config(**data) + RoDouConfig(**data) except ValidationError as e: pytest.fail(f"YAML file {data_file} is not valid:\n{e}") From 12bdc44a29c62c1d8600b6ac365a0074c4cd4a43 Mon Sep 17 00:00:00 2001 From: Augusto Herrmann Date: Thu, 22 Aug 2024 16:44:47 -0300 Subject: [PATCH 09/30] Remove unnecessary class abstraction --- src/parsers.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/src/parsers.py b/src/parsers.py index 3c5dff0..1dbd853 100644 --- a/src/parsers.py +++ b/src/parsers.py @@ -1,6 +1,5 @@ """Abstract and concrete classes to parse DAG configuration from a file.""" -from abc import ABC, abstractmethod import ast from dataclasses import dataclass import textwrap @@ -10,6 +9,8 @@ from airflow import Dataset from airflow.models import Variable +from schemas import RoDouConfig + @dataclass class SearchConfig: @@ -52,15 +53,7 @@ class DAGConfig: no_results_found_text: str -class FileParser(ABC): - """Abstract 
class to build file parsers with DAG configuration.""" - - @abstractmethod - def parse(self): - pass - - -class YAMLParser(FileParser): +class YAMLParser: """Parses YAML file and get the DAG parameters. It guarantees that mandatory fields are in place and are properly From 862eebda8349d2f3cb3514b08d3f24d2371bd275 Mon Sep 17 00:00:00 2001 From: Augusto Herrmann Date: Thu, 22 Aug 2024 16:49:30 -0300 Subject: [PATCH 10/30] Use json values instead of Python in Airflow variables --- src/parsers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/parsers.py b/src/parsers.py index 1dbd853..345e2e0 100644 --- a/src/parsers.py +++ b/src/parsers.py @@ -1,7 +1,7 @@ """Abstract and concrete classes to parse DAG configuration from a file.""" -import ast from dataclasses import dataclass +import json import textwrap from typing import List, Set, Tuple import yaml @@ -161,7 +161,7 @@ def _get_terms_params(self, search) -> Tuple[List[str], str, str]: if "from_airflow_variable" in terms: var_value = Variable.get(terms.get("from_airflow_variable")) try: - terms = ast.literal_eval(var_value) + terms = json.loads(var_value) except (ValueError, SyntaxError): terms = var_value.splitlines() elif "from_db_select" in terms: From f19baf09120ebc18ac126f10ade7a3a6bd052a49 Mon Sep 17 00:00:00 2001 From: Augusto Herrmann Date: Thu, 22 Aug 2024 16:52:57 -0300 Subject: [PATCH 11/30] Factor out file read into its own method --- src/parsers.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/parsers.py b/src/parsers.py index 345e2e0..327a58c 100644 --- a/src/parsers.py +++ b/src/parsers.py @@ -63,13 +63,17 @@ class YAMLParser: def __init__(self, filepath: str): self.filepath = filepath + def read(self) -> dict: + """Reads the contents of the YAML file.""" + with open(self.filepath, "r", encoding="utf-8") as file: + dag_config_dict = yaml.safe_load(file) + return dag_config_dict + def parse(self) -> DAGConfig: """Processes the config file in 
order to instantiate the DAG in Airflow. """ - with open(self.filepath, "r") as file: - dag_config_dict = yaml.safe_load(file) - + dag_config_dict = self.read() dag = self._try_get(dag_config_dict, "dag") dag_id = self._try_get(dag, "id") description = self._try_get(dag, "description") From 0252124f26238cc19631c804cc6dd43406c47d23 Mon Sep 17 00:00:00 2001 From: Augusto Herrmann Date: Thu, 22 Aug 2024 18:34:04 -0300 Subject: [PATCH 12/30] Add missing parameter in example --- dag_confs/examples_and_tests/all_parameters_example.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/dag_confs/examples_and_tests/all_parameters_example.yaml b/dag_confs/examples_and_tests/all_parameters_example.yaml index eba3db9..7acba27 100644 --- a/dag_confs/examples_and_tests/all_parameters_example.yaml +++ b/dag_confs/examples_and_tests/all_parameters_example.yaml @@ -9,6 +9,7 @@ dag: - pessoa 2 schedule: 0 8 * * MON-FRI search: + header: Pesquisa no DOU terms: - dados abertos - governo aberto From 7d91d1ce1f765d62a2478eb769a1b0290937495e Mon Sep 17 00:00:00 2001 From: Augusto Herrmann Date: Thu, 22 Aug 2024 18:35:18 -0300 Subject: [PATCH 13/30] Refactor code to use Pydantic schema (partial) --- src/dou_dag_generator.py | 22 +++---- src/notification/discord_sender.py | 19 +++--- src/notification/email_sender.py | 48 ++++++++------- src/notification/notifier.py | 18 ++++-- src/notification/slack_sender.py | 30 ++++++---- src/parsers.py | 94 ++++++++++++++++-------------- src/schemas.py | 47 +++++++++++++++ 7 files changed, 175 insertions(+), 103 deletions(-) diff --git a/src/dou_dag_generator.py b/src/dou_dag_generator.py index cfafdb9..3d20a6c 100755 --- a/src/dou_dag_generator.py +++ b/src/dou_dag_generator.py @@ -60,7 +60,7 @@ def _merge_dict(dict1, dict2): def result_as_html(specs: DAGConfig) -> bool: """Só utiliza resultado HTML apenas para email""" - return specs.discord_webhook and specs.slack_webhook + return specs.report.discord["webhook"] and specs.report.slack["webhook"] 
class DouDigestDagGenerator: @@ -122,7 +122,7 @@ def prepare_doc_md(specs: DAGConfig, config_file: str) -> str: Returns: str: The DAG documentation in markdown format. """ - config = asdict(specs) + config = specs.model_dump() # options that won't show in the "DAG Docs" del config["description"] del config["doc_md"] @@ -170,7 +170,7 @@ def _get_safe_schedule(self, specs: DAGConfig, default_schedule: str) -> str: """ schedule = default_schedule - id_based_minute = self._hash_dag_id(specs.dag_id, 60) + id_based_minute = self._hash_dag_id(specs.id, 60) schedule_without_min = " ".join(schedule.split(" ")[1:]) schedule = f"{id_based_minute} {schedule_without_min}" @@ -232,7 +232,7 @@ def generate_dags(self): for filepath in files_list: dag_specs = self.parser(filepath).parse() - dag_id = dag_specs.dag_id + dag_id = dag_specs.id globals()[dag_id] = self.create_dag(dag_specs, filepath) def perform_searches( @@ -356,7 +356,7 @@ def create_dag(self, specs: DAGConfig, config_file: str) -> DAG: """ # Prepare the markdown documentation doc_md = ( - self.prepare_doc_md(specs, config_file) if specs.doc_md else specs.doc_md + self.prepare_doc_md(specs, config_file) if specs.doc_md else None ) # DAG parameters default_args = { @@ -372,14 +372,14 @@ def create_dag(self, specs: DAGConfig, config_file: str) -> DAG: schedule = self._update_schedule(specs) dag = DAG( - specs.dag_id, + specs.id, default_args=default_args, schedule=schedule, description=specs.description, doc_md=doc_md, catchup=False, params={"trigger_date": "2022-01-02T12:00"}, - tags=specs.dag_tags, + tags=specs.tags, ) with dag: @@ -427,9 +427,8 @@ def create_dag(self, specs: DAGConfig, config_file: str) -> DAG: ) if subsearch["sql"]: - ( - select_terms_from_db_task >> exec_search_task - ) # pylint: disable=pointless-statement + # pylint: disable=pointless-statement + select_terms_from_db_task >> exec_search_task has_matches_task = BranchPythonOperator( task_id="has_matches", @@ -443,7 +442,7 @@ def create_dag(self, 
specs: DAGConfig, config_file: str) -> DAG: ] ) + ") }}", - "skip_null": specs.skip_null, + "skip_null": specs.report.skip_null, }, ) @@ -465,6 +464,7 @@ def create_dag(self, specs: DAGConfig, config_file: str) -> DAG: }, ) + # pylint: disable=pointless-statement tg_exec_searchs >> has_matches_task has_matches_task >> [send_notification_task, skip_notification_task] diff --git a/src/notification/discord_sender.py b/src/notification/discord_sender.py index 5c99a76..d96b99d 100644 --- a/src/notification/discord_sender.py +++ b/src/notification/discord_sender.py @@ -1,17 +1,20 @@ -import requests import re + +import requests + from notification.isender import ISender +from schemas import ReportConfig class DiscordSender(ISender): highlight_tags = ("__", "__") - def __init__(self, specs) -> None: - self.webhook_url = specs.discord_webhook - self.hide_filters = specs.hide_filters - self.header_text = specs.header_text - self.footer_text = specs.footer_text - self.no_results_found_text = specs.no_results_found_text + def __init__(self, report_config: ReportConfig) -> None: + self.webhook_url = report_config.discord["webhook"] + self.hide_filters = report_config.hide_filters + self.header_text = report_config.header_text + self.footer_text = report_config.footer_text + self.no_results_found_text = report_config.no_results_found_text def send(self, search_report: list, report_date: str = None): """Parse the content, and send message to Discord""" @@ -68,4 +71,4 @@ def _remove_html_tags(self, text): # Define a regular expression pattern to match HTML tags clean = re.compile('<.*?>') # Substitute HTML tags with an empty string - return re.sub(clean, '', text) \ No newline at end of file + return re.sub(clean, '', text) diff --git a/src/notification/email_sender.py b/src/notification/email_sender.py index 81280c7..c03c2b4 100644 --- a/src/notification/email_sender.py +++ b/src/notification/email_sender.py @@ -1,7 +1,10 @@ +"""Module for sending emails. 
+""" + import os import sys -import textwrap from tempfile import NamedTemporaryFile +import textwrap import markdown import pandas as pd @@ -14,12 +17,17 @@ sys.path.insert(0, parent_dir) from notification.isender import ISender +from schemas import ReportConfig class EmailSender(ISender): + """Prepare and send e-mails with the reports.""" + highlight_tags = ("", "") - def __init__(self, specs) -> None: - self.specs = specs + + def __init__(self, report_config: ReportConfig) -> None: + self.report_config = report_config + self.search_report = "" self.watermark = """

Esta pesquisa foi realizada automaticamente pelo Ro-DOU @@ -29,7 +37,7 @@ def __init__(self, specs) -> None: def send(self, search_report: list, report_date: str): """Builds the email content, the CSV if applies, and send it""" self.search_report = search_report - full_subject = f"{self.specs.subject} - DOs de {report_date}" + full_subject = f"{self.report_config.subject} - DOs de {report_date}" skip_notification = True for search in self.search_report: @@ -37,20 +45,20 @@ def send(self, search_report: list, report_date: str): if items: skip_notification = False else: - content = self.specs.no_results_found_text + content = self.report_config.no_results_found_text if skip_notification: - if self.specs.skip_null: + if self.report_config.skip_null: return "skip_notification" else: content = self.generate_email_content() content += self.watermark - if self.specs.attach_csv and skip_notification is False: + if self.report_config.attach_csv and skip_notification is False: with self.get_csv_tempfile() as csv_file: send_email( - to=self.specs.emails, + to=self.report_config.emails, subject=full_subject, files=[csv_file.name], html_content=content, @@ -58,7 +66,7 @@ def send(self, search_report: list, report_date: str): ) else: send_email( - to=self.specs.emails, + to=self.report_config.emails, subject=full_subject, html_content=content, mime_charset="utf-8", @@ -73,18 +81,18 @@ def generate_email_content(self) -> str: parent_directory = os.path.dirname(current_directory) file_path = os.path.join(parent_directory, "report_style.css") - with open(file_path, "r") as f: + with open(file_path, "r", encoding="utf-8") as f: blocks = [f""] - if self.specs.header_text: - blocks.append(self.specs.header_text) + if self.report_config.header_text: + blocks.append(self.report_config.header_text) for search in self.search_report: if search["header"]: blocks.append(f"

{search['header']}

") - if not self.specs.hide_filters: + if not self.report_config.hide_filters: if search["department"]: blocks.append( """

Filtrando resultados somente para:

""" @@ -97,11 +105,9 @@ def generate_email_content(self) -> str: for group, results in search["result"].items(): if not results: - blocks.append( - f"

{self.specs.no_results_found_text}.

" - ) + blocks.append(f"

{self.report_config.no_results_found_text}.

") else: - if not self.specs.hide_filters: + if not self.report_config.hide_filters: if group != "single_group": blocks.append("\n") blocks.append(f"**Grupo: {group}**") @@ -109,12 +115,12 @@ def generate_email_content(self) -> str: for term, items in results.items(): blocks.append("\n") - if not self.specs.hide_filters: + if not self.report_config.hide_filters: blocks.append(f"* # Resultados para: {term}") for item in items: - if not self.specs.hide_filters: + if not self.report_config.hide_filters: sec_desc = item["section"] item_html = f"""

{sec_desc}

@@ -131,8 +137,8 @@ def generate_email_content(self) -> str: blocks.append(textwrap.dedent(item_html)) blocks.append("---") - if self.specs.footer_text: - blocks.append(self.specs.footer_text) + if self.report_config.footer_text: + blocks.append(self.report_config.footer_text) return markdown.markdown("\n".join(blocks)) diff --git a/src/notification/notifier.py b/src/notification/notifier.py index 84f8264..81feea3 100644 --- a/src/notification/notifier.py +++ b/src/notification/notifier.py @@ -26,15 +26,21 @@ class Notifier: def __init__(self, specs: DAGConfig) -> None: self.senders = [] - if specs.emails: - self.senders.append(EmailSender(specs)) - if specs.discord_webhook: - self.senders.append(DiscordSender(specs)) - if specs.slack_webhook: - self.senders.append(SlackSender(specs)) + if specs.report.emails: + self.senders.append(EmailSender(specs.report)) + if specs.report.discord["webhook"]: + self.senders.append(DiscordSender(specs.report)) + if specs.report.slack["webhook"]: + self.senders.append(SlackSender(specs.report)) def send_notification(self, search_report: str, report_date: str): + """Sends the notification to the specified email, Discord or Slack + + Args: + search_report (str): The report to be sent + report_date (str): The date of the report + """ # Convert to data structure after it's retrieved from xcom search_report = ast.literal_eval(search_report) diff --git a/src/notification/slack_sender.py b/src/notification/slack_sender.py index 18b51d6..110d191 100644 --- a/src/notification/slack_sender.py +++ b/src/notification/slack_sender.py @@ -1,20 +1,27 @@ +"""Send reports to Slack. +""" + from datetime import datetime +import re import requests -import re from notification.isender import ISender +from schemas import ReportConfig + class SlackSender(ISender): + """Prepare a report and send it to Slack. 
+ """ highlight_tags = ("*", "*") - def __init__(self, specs) -> None: - self.webhook_url = specs.slack_webhook + def __init__(self, report_config: ReportConfig) -> None: + self.webhook_url = report_config.slack["webhook"] self.blocks = [] - self.hide_filters = specs.hide_filters - self.header_text = specs.header_text - self.footer_text = specs.footer_text - self.no_results_found_text = specs.no_results_found_text + self.hide_filters = report_config.hide_filters + self.header_text = report_config.header_text + self.footer_text = report_config.footer_text + self.no_results_found_text = report_config.no_results_found_text def send(self, search_report: list, report_date: str = None): """Parse the content, and send message to Slack""" @@ -37,9 +44,7 @@ def send(self, search_report: list, report_date: str = None): for item in items: self._add_block(item) else: - self._add_text( - self.no_results_found_text - ) + self._add_text(self.no_results_found_text) if self.footer_text: footer_text = _remove_html_tags(self.footer_text) @@ -111,8 +116,9 @@ def _format_date(date_str: str) -> str: _from, _to = WEEKDAYS_EN_TO_PT[date.weekday()] return date.strftime("%a %d/%m").replace(_from, _to) + def _remove_html_tags(text): # Define a regular expression pattern to match HTML tags - clean = re.compile('<.*?>') + clean = re.compile("<.*?>") # Substitute HTML tags with an empty string - return re.sub(clean, '', text) \ No newline at end of file + return re.sub(clean, "", text) diff --git a/src/parsers.py b/src/parsers.py index 327a58c..b257aa0 100644 --- a/src/parsers.py +++ b/src/parsers.py @@ -1,56 +1,57 @@ """Abstract and concrete classes to parse DAG configuration from a file.""" -from dataclasses import dataclass +# from dataclasses import dataclass import json import textwrap -from typing import List, Set, Tuple +from typing import List, Tuple import yaml from airflow import Dataset from airflow.models import Variable -from schemas import RoDouConfig - - -@dataclass -class 
SearchConfig: - header: str - sources: List[str] - territory_id: int - dou_sections: List[str] - field: str - search_date: str - is_exact_search: bool - ignore_signature_match: bool - force_rematch: bool - full_text: bool - use_summary: bool - terms: List[str] - sql: str - conn_id: str - department: List[str] - - -@dataclass -class DAGConfig: - dag_id: str - search: List[SearchConfig] - emails: List[str] - subject: str - attach_csv: bool - discord_webhook: str - slack_webhook: str - schedule: str - dataset: str - description: str - skip_null: bool - doc_md: str - dag_tags: Set[str] - owner: str - hide_filters: bool - header_text: str - footer_text: str - no_results_found_text: str +from schemas import RoDouConfig, DAGConfig + + +# TODO: remove old dataclasses +# @dataclass +# class SearchConfig: +# header: str +# sources: List[str] +# territory_id: int +# dou_sections: List[str] +# field: str +# search_date: str +# is_exact_search: bool +# ignore_signature_match: bool +# force_rematch: bool +# full_text: bool +# use_summary: bool +# terms: List[str] +# sql: str +# conn_id: str +# department: List[str] + + +# @dataclass +# class DAGConfig: +# dag_id: str +# search: List[SearchConfig] +# emails: List[str] +# subject: str +# attach_csv: bool +# discord_webhook: str +# slack_webhook: str +# schedule: str +# dataset: str +# description: str +# skip_null: bool +# doc_md: str +# dag_tags: Set[str] +# owner: str +# hide_filters: bool +# header_text: str +# footer_text: str +# no_results_found_text: str class YAMLParser: @@ -73,7 +74,10 @@ def parse(self) -> DAGConfig: """Processes the config file in order to instantiate the DAG in Airflow. 
""" - dag_config_dict = self.read() + config = RoDouConfig(**self.read()) + return config.dag + + # TODO: remove old parser code dag = self._try_get(dag_config_dict, "dag") dag_id = self._try_get(dag, "id") description = self._try_get(dag, "description") diff --git a/src/schemas.py b/src/schemas.py index faadafd..00e073b 100644 --- a/src/schemas.py +++ b/src/schemas.py @@ -14,6 +14,7 @@ library. """ +import textwrap from typing import List, Optional, Union from pydantic import AnyHttpUrl, BaseModel, EmailStr, Field @@ -65,6 +66,52 @@ class SearchConfig(BaseModel): terms: Union[List[str], SearchTerms] = Field( description="Lista de termos de pesquisa ou uma forma de buscá-los" ) + field: Optional[str] = Field( + default="TUDO", + description="Campos dos quais os termos devem ser pesquisados. " + "Valores: TUDO, TITULO, CONTEUDO. Default: TUDO", + ) + is_exact_search: Optional[bool] = Field( + default=True, + description="Busca somente o termo exato. Valores: True ou False. " + "Default: True.", + ) + ignore_signature_match: Optional[bool] = Field( + default=False, + description="Busca somente o termo exato. Valores: True ou False. " + "Default: True.", + ) + date: Optional[str] = Field( + default="DIA", + description="Intervalo de data para busca. Valores: DIA, SEMANA, " + "MES, ANO. 
Default: DIA", + ) + dou_sections: Optional[List[str]] = Field( + default=["TODOS"], + description=textwrap.dedent( + """ + Seção do Diário Oficial a procurar: + + - SECAO_1 + - SECAO_2 + - SECAO_3 + - EDICAO_EXTRA + - EDICAO_EXTRA_1A + - EDICAO_EXTRA_1B + - EDICAO_EXTRA_1D + - EDICAO_EXTRA_2A + - EDICAO_EXTRA_2B + - EDICAO_EXTRA_2D + - EDICAO_EXTRA_3A + - EDICAO_EXTRA_3B + - EDICAO_EXTRA_3D + - EDICAO_SUPLEMENTAR + - TODOS + + Default: TODOS + """ + ), + ) department: Optional[List[str]] = Field( default=None, description="Lista de departamentos para filtrar a pesquisa" ) From 8b42676cbe67d07644a3d0f353ca523b913db3bc Mon Sep 17 00:00:00 2001 From: Augusto Herrmann Date: Fri, 23 Aug 2024 16:48:41 -0300 Subject: [PATCH 14/30] Rename class to fetch search terms and fix its docstring --- src/schemas.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/schemas.py b/src/schemas.py index 00e073b..d6d5fa1 100644 --- a/src/schemas.py +++ b/src/schemas.py @@ -26,8 +26,9 @@ class DBSelect(BaseModel): conn_id: str = Field(description="Airflow connection ID to use for the SQL query") -class SearchTerms(BaseModel): - """Represents the search terms in the YAML file.""" +class FetchTermsConfig(BaseModel): + """Represents configuration information for fetching search terms from + a data source.""" from_airflow_variable: Optional[str] = Field( default=None, @@ -63,7 +64,7 @@ class SearchConfig(BaseModel): description="ID do território no Querido Diário para filtragem " "baseada em localização", ) - terms: Union[List[str], SearchTerms] = Field( + terms: Union[List[str], FetchTermsConfig] = Field( description="Lista de termos de pesquisa ou uma forma de buscá-los" ) field: Optional[str] = Field( From 0fb524323ac2b0295bf30c88e914d2000f8ae2f7 Mon Sep 17 00:00:00 2001 From: Augusto Herrmann Date: Fri, 23 Aug 2024 17:39:16 -0300 Subject: [PATCH 15/30] Remove unused import --- src/dou_dag_generator.py | 1 - 1 file changed, 1 deletion(-) diff --git 
a/src/dou_dag_generator.py b/src/dou_dag_generator.py index 3d20a6c..36862c0 100755 --- a/src/dou_dag_generator.py +++ b/src/dou_dag_generator.py @@ -13,7 +13,6 @@ import os import sys import textwrap -from dataclasses import asdict from datetime import datetime, timedelta from typing import Dict, List, Optional, Union import json From e0143f78854e143ddb8de9671067d9e12aed6603 Mon Sep 17 00:00:00 2001 From: Augusto Herrmann Date: Fri, 23 Aug 2024 17:39:53 -0300 Subject: [PATCH 16/30] Apply black formatting --- src/dou_dag_generator.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/dou_dag_generator.py b/src/dou_dag_generator.py index 36862c0..733676d 100755 --- a/src/dou_dag_generator.py +++ b/src/dou_dag_generator.py @@ -202,8 +202,7 @@ def _update_schedule( if schedule is None: schedule = self._get_safe_schedule( - specs=specs, - default_schedule=self.DEFAULT_SCHEDULE + specs=specs, default_schedule=self.DEFAULT_SCHEDULE ) is_default_schedule = True else: From 2f342f39f56e9047ef90e3a4fa556f39c3f4bd97 Mon Sep 17 00:00:00 2001 From: Augusto Herrmann Date: Fri, 23 Aug 2024 17:40:20 -0300 Subject: [PATCH 17/30] Add missing fields in Pydantic schemas --- src/schemas.py | 54 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 18 deletions(-) diff --git a/src/schemas.py b/src/schemas.py index d6d5fa1..4f0e4d0 100644 --- a/src/schemas.py +++ b/src/schemas.py @@ -64,24 +64,6 @@ class SearchConfig(BaseModel): description="ID do território no Querido Diário para filtragem " "baseada em localização", ) - terms: Union[List[str], FetchTermsConfig] = Field( - description="Lista de termos de pesquisa ou uma forma de buscá-los" - ) - field: Optional[str] = Field( - default="TUDO", - description="Campos dos quais os termos devem ser pesquisados. " - "Valores: TUDO, TITULO, CONTEUDO. Default: TUDO", - ) - is_exact_search: Optional[bool] = Field( - default=True, - description="Busca somente o termo exato. 
Valores: True ou False. "
-        "Default: True.",
-    )
-    ignore_signature_match: Optional[bool] = Field(
-        default=False,
-        description="Busca somente o termo exato. Valores: True ou False. "
-        "Default: True.",
-    )
     date: Optional[str] = Field(
         default="DIA",
         description="Intervalo de data para busca. Valores: DIA, SEMANA, "
@@ -116,6 +98,42 @@ class SearchConfig(BaseModel):
     department: Optional[List[str]] = Field(
         default=None, description="Lista de departamentos para filtrar a pesquisa"
     )
+    terms: Union[List[str], FetchTermsConfig] = Field(
+        description="Lista de termos de pesquisa ou uma forma de buscá-los"
+    )
+    field: Optional[str] = Field(
+        default="TUDO",
+        description="Campos dos quais os termos devem ser pesquisados. "
+        "Valores: TUDO, TITULO, CONTEUDO. Default: TUDO",
+    )
+    is_exact_search: Optional[bool] = Field(
+        default=True,
+        description="Busca somente o termo exato. Valores: True ou False. "
+        "Default: True.",
+    )
+    ignore_signature_match: Optional[bool] = Field(
+        default=False,
+        description="Ignora resultados em que o termo aparece apenas na "
+        "assinatura. Valores: True ou False. Default: False.",
+    )
+    force_rematch: Optional[bool] = Field(
+        default=False,
+        description="Indica que a busca deve ser forçada, mesmo que já "
+        "tenha sido feita anteriormente. Valores: True ou False. "
+        "Default: False.",
+    )
+    full_text: Optional[bool] = Field(
+        default=False,
+        description="Define se no relatório será exibido o texto completo, "
+        "ao invés de um resumo. Valores: True ou False. Default: False. "
+        "(Funcionalidade disponível apenas no INLABS)",
+    )
+    use_summary: Optional[bool] = Field(
+        default=False,
+        description="Define se no relatório será exibido a ementa, se existir. "
+        "Valores: True ou False. Default: False. 
" + "(Funcionalidade disponível apenas no INLABS)", + ) class ReportConfig(BaseModel): From 555cf6aade9b145df6358088ce999d2aaa714be6 Mon Sep 17 00:00:00 2001 From: Augusto Herrmann Date: Fri, 23 Aug 2024 17:40:44 -0300 Subject: [PATCH 18/30] Use Pydantic schemas in DAG generation --- src/dou_dag_generator.py | 84 +++++++++++++++++++++--------------- src/notification/notifier.py | 4 +- 2 files changed, 51 insertions(+), 37 deletions(-) diff --git a/src/dou_dag_generator.py b/src/dou_dag_generator.py index 733676d..e2955c0 100755 --- a/src/dou_dag_generator.py +++ b/src/dou_dag_generator.py @@ -34,6 +34,7 @@ from utils.date import get_trigger_date, template_ano_mes_dia_trigger_local_time from notification.notifier import Notifier from parsers import DAGConfig, YAMLParser +from schemas import FetchTermsConfig from searchers import BaseSearcher, DOUSearcher, QDSearcher, INLABSSearcher @@ -59,7 +60,7 @@ def _merge_dict(dict1, dict2): def result_as_html(specs: DAGConfig) -> bool: """Só utiliza resultado HTML apenas para email""" - return specs.report.discord["webhook"] and specs.report.slack["webhook"] + return specs.report.discord and specs.report.slack class DouDigestDagGenerator: @@ -353,9 +354,7 @@ def create_dag(self, specs: DAGConfig, config_file: str) -> DAG: the term_list from a database """ # Prepare the markdown documentation - doc_md = ( - self.prepare_doc_md(specs, config_file) if specs.doc_md else None - ) + doc_md = self.prepare_doc_md(specs, config_file) if specs.doc_md else None # DAG parameters default_args = { "owner": specs.owner, @@ -383,48 +382,63 @@ def create_dag(self, specs: DAGConfig, config_file: str) -> DAG: with dag: with TaskGroup(group_id="exec_searchs") as tg_exec_searchs: - counter = 0 - for subsearch in specs.search: - counter += 1 - if subsearch["sql"]: + + # is it a single search or a list of searchers? 
+ if isinstance(specs.search, list): + searches = specs.search + else: + searches = [specs.search] + + for counter, subsearch in enumerate(searches, 1): + + # are terms to be fetched from a database? + terms_come_from_db: bool = isinstance( + subsearch.terms, FetchTermsConfig + ) and getattr(subsearch.terms, "from_db_select", None) + + # determine the terms list + term_list = [] + # is it a directly defined list of terms or is it a + # configuration for fetching terms from a data source? + if isinstance(subsearch.terms, list): + term_list = subsearch.terms + elif terms_come_from_db: select_terms_from_db_task = PythonOperator( task_id=f"select_terms_from_db_{counter}", python_callable=self.select_terms_from_db, op_kwargs={ - "sql": subsearch["sql"], - "conn_id": subsearch["conn_id"], + "sql": subsearch.terms.from_db_select.sql, + "conn_id": subsearch.terms.from_db_select.conn_id, }, ) - term_list = ( - "{{ ti.xcom_pull(task_ids='exec_searchs.select_terms_from_db_" - + str(counter) - + "') }}" - ) + term_list = ( + "{{ ti.xcom_pull(task_ids='exec_searchs.select_terms_from_db_" + + str(counter) + + "') }}" + ) exec_search_task = PythonOperator( task_id=f"exec_search_{counter}", python_callable=self.perform_searches, op_kwargs={ - "header": subsearch["header"], - "sources": subsearch["sources"], - "territory_id": subsearch["territory_id"], - "term_list": subsearch["terms"] or term_list, - "dou_sections": subsearch["dou_sections"], - "search_date": subsearch["search_date"], - "field": subsearch["field"], - "is_exact_search": subsearch["is_exact_search"], - "ignore_signature_match": subsearch[ - "ignore_signature_match" - ], - "force_rematch": subsearch["force_rematch"], - "full_text": subsearch["full_text"], - "use_summary": subsearch["use_summary"], - "department": subsearch["department"], + "header": subsearch.header, + "sources": subsearch.sources, + "territory_id": subsearch.territory_id, + "term_list": term_list, + "dou_sections": subsearch.dou_sections, + 
"search_date": subsearch.date, + "field": subsearch.field, + "is_exact_search": subsearch.is_exact_search, + "ignore_signature_match": subsearch.ignore_signature_match, + "force_rematch": subsearch.force_rematch, + "full_text": subsearch.full_text, + "use_summary": subsearch.use_summary, + "department": subsearch.department, "result_as_email": result_as_html(specs), }, ) - if subsearch["sql"]: + if terms_come_from_db: # pylint: disable=pointless-statement select_terms_from_db_task >> exec_search_task @@ -435,8 +449,8 @@ def create_dag(self, specs: DAGConfig, config_file: str) -> DAG: "search_result": "{{ ti.xcom_pull(task_ids=" + str( [ - f"exec_searchs.exec_search_{count + 1}" - for count in range(counter) + f"exec_searchs.exec_search_{count}" + for count in range(1, len(searches) + 1) ] ) + ") }}", @@ -453,8 +467,8 @@ def create_dag(self, specs: DAGConfig, config_file: str) -> DAG: "search_report": "{{ ti.xcom_pull(task_ids=" + str( [ - f"exec_searchs.exec_search_{count + 1}" - for count in range(counter) + f"exec_searchs.exec_search_{count}" + for count in range(1, len(searches) + 1) ] ) + ") }}", diff --git a/src/notification/notifier.py b/src/notification/notifier.py index 81feea3..604801d 100644 --- a/src/notification/notifier.py +++ b/src/notification/notifier.py @@ -28,9 +28,9 @@ def __init__(self, specs: DAGConfig) -> None: self.senders = [] if specs.report.emails: self.senders.append(EmailSender(specs.report)) - if specs.report.discord["webhook"]: + if specs.report.discord: self.senders.append(DiscordSender(specs.report)) - if specs.report.slack["webhook"]: + if specs.report.slack: self.senders.append(SlackSender(specs.report)) From a77d9ecaf3dda241583df412ec50a4e87c1a9778 Mon Sep 17 00:00:00 2001 From: Augusto Herrmann Date: Fri, 23 Aug 2024 18:16:44 -0300 Subject: [PATCH 19/30] Fix field defaults --- src/schemas.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/schemas.py b/src/schemas.py index 4f0e4d0..65aaf0a 
100644 --- a/src/schemas.py +++ b/src/schemas.py @@ -149,18 +149,22 @@ class ReportConfig(BaseModel): default=None, description="Lista de endereços de e-mail para enviar o relatório" ) attach_csv: Optional[bool] = Field( - default=None, - description="Se deve anexar um arquivo CSV com os resultados da pesquisa", + default=False, + description="Se deve anexar um arquivo CSV com os resultados da pesquisa." + "Default: False.", ) subject: Optional[str] = Field( default=None, description="Assunto do relatório por e-mail" ) skip_null: Optional[bool] = Field( - default=None, - description="Se deve pular a notificação de resultados nulos/vazios", + default=True, + description="Se deve pular a notificação de resultados nulos/vazios. " + "Default: True.", ) hide_filters: Optional[bool] = Field( - default=None, description="Se deve ocultar os filtros aplicados no relatório" + default=False, + description="Se deve ocultar os filtros aplicados no relatório." + "Default: False.", ) header_text: Optional[str] = Field( default=None, description="Texto a ser incluído no cabeçalho do relatório" @@ -169,7 +173,8 @@ class ReportConfig(BaseModel): default=None, description="Texto a ser incluído no rodapé do relatório" ) no_results_found_text: Optional[str] = Field( - default=None, description="Texto a ser exibido quando não há resultados" + default="Nenhum dos termos pesquisados foi encontrado nesta consulta", + description="Texto a ser exibido quando não há resultados", ) From 6d43f7c7ec20b7aff066888393da7bb2e78ebfd0 Mon Sep 17 00:00:00 2001 From: Augusto Herrmann Date: Fri, 23 Aug 2024 18:17:25 -0300 Subject: [PATCH 20/30] Add field validator to search parameter, so that it's always a list --- src/schemas.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/schemas.py b/src/schemas.py index 65aaf0a..14bca74 100644 --- a/src/schemas.py +++ b/src/schemas.py @@ -17,6 +17,7 @@ import textwrap from typing import List, Optional, Union from pydantic import AnyHttpUrl, 
BaseModel, EmailStr, Field +from pydantic import field_validator class DBSelect(BaseModel): @@ -200,6 +201,19 @@ class DAGConfig(BaseModel): "`subject`, `skip_null`" ) + @field_validator("search") + @staticmethod + def cast_to_list( + search_param: Union[List[SearchConfig], SearchConfig] + ) -> List[SearchConfig]: + """Cast the value of "search" parameter to always be a list. + If the yaml configuration file does not use a list, convert to + a list with a single search. + """ + if not isinstance(search_param, list): + return [search_param] + return search_param + class RoDouConfig(BaseModel): """Represents the overall configuration in the YAML file.""" From 05deb93fa13219ddef59de35012cae530f58b89d Mon Sep 17 00:00:00 2001 From: Augusto Herrmann Date: Fri, 23 Aug 2024 18:17:47 -0300 Subject: [PATCH 21/30] Fix webhook in Discord mock send test --- tests/discord_sender_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/discord_sender_test.py b/tests/discord_sender_test.py index ebd3b55..645107d 100644 --- a/tests/discord_sender_test.py +++ b/tests/discord_sender_test.py @@ -12,7 +12,7 @@ def mocked_specs(): Specs = namedtuple( "Specs", [ - "discord_webhook", + "discord", "hide_filters", "header_text", "footer_text", @@ -20,7 +20,7 @@ def mocked_specs(): ], ) return Specs( - WEBHOOK, + {"webhook": WEBHOOK}, False, None, None, From 09b7a7804253711c81e9a2f87b864fa295671ef5 Mon Sep 17 00:00:00 2001 From: Augusto Herrmann Date: Fri, 23 Aug 2024 18:18:19 -0300 Subject: [PATCH 22/30] Fix expected data structure in test parameters (partial) --- tests/parsers_test.py | 82 ++++++++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 40 deletions(-) diff --git a/tests/parsers_test.py b/tests/parsers_test.py index daa0681..8698e1f 100644 --- a/tests/parsers_test.py +++ b/tests/parsers_test.py @@ -14,14 +14,14 @@ from dou_dag_generator import DouDigestDagGenerator, YAMLParser, DAGConfig - @pytest.mark.parametrize( "filepath, 
result_tuple", [ ( "basic_example.yaml", { - "dag_id": "basic_example", + "id": "basic_example", + "description": "DAG de teste", "search": [ { "terms": [ @@ -39,34 +39,37 @@ "field": "TUDO", "is_exact_search": True, "ignore_signature_match": False, - "force_rematch": None, - "full_text": None, - "use_summary": None, + "force_rematch": False, + "full_text": False, + "use_summary": False, "department": None, } ], - "emails": ["destination@economia.gov.br"], - "subject": "Teste do Ro-dou", - "attach_csv": False, - "discord_webhook": None, - "slack_webhook": None, - "schedule": None, - "dataset": None, - "description": "DAG de teste", - "skip_null": True, - "doc_md": None, - "dag_tags": {"dou", "generated_dag"}, - "owner": "", - "hide_filters": False, - "header_text": None, - "footer_text": None, - "no_results_found_text": "Nenhum dos termos pesquisados foi encontrado nesta consulta", + "report": { + "emails": ["destination@economia.gov.br"], + "subject": "Teste do Ro-dou", + "attach_csv": False, + "discord_webhook": None, + "slack_webhook": None, + "schedule": None, + "dataset": None, + "description": "DAG de teste", + "skip_null": True, + "doc_md": None, + "dag_tags": {"dou", "generated_dag"}, + "owner": [], + "hide_filters": False, + "header_text": None, + "footer_text": None, + "no_results_found_text": "Nenhum dos termos pesquisados " + "foi encontrado nesta consulta", + }, }, ), ( "all_parameters_example.yaml", { - "dag_id": "all_parameters_example", + "id": "all_parameters_example", "search": [ { "terms": [ @@ -111,7 +114,7 @@ ( "terms_from_db_example.yaml", { - "dag_id": "terms_from_db_example", + "id": "terms_from_db_example", "search": [ { "terms": [], @@ -149,7 +152,7 @@ "skip_null": True, "doc_md": None, "dag_tags": {"dou", "generated_dag"}, - "owner": "", + "owner": [], "hide_filters": False, "header_text": None, "footer_text": None, @@ -159,7 +162,7 @@ ( "basic_example_skip_null.yaml", { - "dag_id": "basic_example_skip_null", + "id": 
"basic_example_skip_null", "search": [ { "terms": ["cimentodaaroeira"], @@ -190,7 +193,7 @@ "skip_null": False, "doc_md": None, "dag_tags": {"dou", "generated_dag"}, - "owner": "", + "owner": [], "hide_filters": False, "header_text": None, "footer_text": None, @@ -200,7 +203,7 @@ ( "markdown_docs_example.yaml", { - "dag_id": "markdown_docs_example", + "id": "markdown_docs_example", "search": [ { "terms": [ @@ -243,7 +246,7 @@ * ou colocar [links](graph)!""" ).strip(), "dag_tags": {"dou", "generated_dag"}, - "owner": "", + "owner": [], "hide_filters": False, "header_text": None, "footer_text": None, @@ -253,7 +256,7 @@ ( "department_example.yaml", { - "dag_id": "department_example", + "id": "department_example", "search": [ { "terms": ["dados abertos"], @@ -287,7 +290,7 @@ "skip_null": True, "doc_md": None, "dag_tags": {"dou", "generated_dag"}, - "owner": "", + "owner": [], "hide_filters": False, "header_text": None, "footer_text": None, @@ -297,7 +300,7 @@ ( "inlabs_example.yaml", { - "dag_id": "inlabs_example", + "id": "inlabs_example", "search": [ { "terms": ["tecnologia", "informação"], @@ -338,7 +341,7 @@ ( "inlabs_advanced_search_example.yaml", { - "dag_id": "inlabs_advanced_search_example", + "id": "inlabs_advanced_search_example", "search": [ { "terms": [ @@ -382,7 +385,7 @@ ( "multiple_searchs_example.yaml", { - "dag_id": "multiple_searchs_example", + "id": "multiple_searchs_example", "search": [ { "terms": [ @@ -438,7 +441,7 @@ "skip_null": False, "doc_md": None, "dag_tags": {"dou", "generated_dag", "inlabs"}, - "owner": "", + "owner": [], "hide_filters": False, "header_text": None, "footer_text": None, @@ -448,7 +451,7 @@ ( "hide_filters_example.yaml", { - "dag_id": "hide_filters_example", + "id": "hide_filters_example", "search": [ { "terms": ["tecnologia", "informação"], @@ -482,7 +485,7 @@ "skip_null": True, "doc_md": None, "dag_tags": {"dou", "inlabs", "generated_dag"}, - "owner": "", + "owner": [], "hide_filters": True, "header_text": None, 
"footer_text": None, @@ -492,7 +495,7 @@ ( "header_and_footer_example.yaml", { - "dag_id": "header_and_footer_example", + "id": "header_and_footer_example", "search": [ { "terms": ["tecnologia", "informação"], @@ -523,7 +526,7 @@ "skip_null": True, "doc_md": None, "dag_tags": {"dou", "generated_dag"}, - "owner": "", + "owner": [], "hide_filters": False, "header_text": "

Greetings

", "footer_text": "

Best Regards

", @@ -532,11 +535,10 @@ ), ], ) - def test_parse(filepath, result_tuple): filepath = os.path.join( DouDigestDagGenerator().YAMLS_DIR, "examples_and_tests", filepath ) parsed = YAMLParser(filepath=filepath).parse() - assert parsed == DAGConfig(**result_tuple) + assert parsed.model_dump() == DAGConfig(**result_tuple).model_dump() From 1c7b265597273e681059b3f72f7a5be7e38cc30c Mon Sep 17 00:00:00 2001 From: Augusto Herrmann Date: Mon, 26 Aug 2024 16:57:54 -0300 Subject: [PATCH 23/30] Fix indentation --- src/notification/email_sender.py | 77 +++++++++++++++++--------------- src/notification/slack_sender.py | 2 +- 2 files changed, 43 insertions(+), 36 deletions(-) diff --git a/src/notification/email_sender.py b/src/notification/email_sender.py index 49ae352..fd177e8 100644 --- a/src/notification/email_sender.py +++ b/src/notification/email_sender.py @@ -104,42 +104,45 @@ def generate_email_content(self) -> str: for group, search_results in search["result"].items(): if not results: - blocks.append( - f"

{self.report_config.no_results_found_text}.

" - ) + blocks.append(f"

{self.report_config.no_results_found_text}.

") else: - if not self.report_config.hide_filters: - if group != "single_group": - blocks.append("\n") - blocks.append(f"**Grupo: {group}**") - blocks.append("\n\n") - - for term, term_results in results.items(): + if not self.report_config.hide_filters: + if group != "single_group": blocks.append("\n") - if not self.report_config.hide_filters: - blocks.append(f"* # Resultados para: {term}") - - for department, results in term_results.items(): - - if not self.report_config.hide_filters and department != 'single_department': - blocks.append(f"**{department}**") - - for result in results: - if not self.report_config.hide_filters: - sec_desc = result["section"] - item_html = f""" -

{sec_desc}

- ### [{result['title']}]({result['href']}) -

{result['abstract']}

-

{result['date']}

""" - blocks.append( - textwrap.indent(textwrap.dedent(item_html), " " * 4) + blocks.append(f"**Grupo: {group}**") + blocks.append("\n\n") + + for term, term_results in results.items(): + blocks.append("\n") + if not self.report_config.hide_filters: + blocks.append(f"* # Resultados para: {term}") + + for department, results in term_results.items(): + + if ( + not self.report_config.hide_filters + and department != "single_department" + ): + blocks.append(f"**{department}**") + + for result in results: + if not self.report_config.hide_filters: + sec_desc = result["section"] + item_html = f""" +

{sec_desc}

+ ### [{result['title']}]({result['href']}) +

{result['abstract']}

+

{result['date']}

""" + blocks.append( + textwrap.indent( + textwrap.dedent(item_html), " " * 4 ) - else: - item_html = f""" - ### [{result['title']}]({result['href']}) -

{result['abstract']}



""" - blocks.append(textwrap.dedent(item_html)) + ) + else: + item_html = f""" + ### [{result['title']}]({result['href']}) +

{result['abstract']}



""" + blocks.append(textwrap.dedent(item_html)) blocks.append("---") if self.report_config.footer_text: @@ -196,11 +199,15 @@ def convert_report_dict_to_tuple_list(self) -> list: for term, departments in results.items(): for department, dpt_matches in departments.items(): for match in dpt_matches: - tuple_list.append(repack_match(header, group, term, department, match)) + tuple_list.append( + repack_match(header, group, term, department, match) + ) return tuple_list -def repack_match(header: str, group: str, search_term: str, department: str, match: dict) -> tuple: +def repack_match( + header: str, group: str, search_term: str, department: str, match: dict +) -> tuple: return ( header, group, diff --git a/src/notification/slack_sender.py b/src/notification/slack_sender.py index cd3dd15..8b636d5 100644 --- a/src/notification/slack_sender.py +++ b/src/notification/slack_sender.py @@ -49,7 +49,7 @@ def send(self, search_report: list, report_date: str = None): for department, results in term_results.items(): if not self.hide_filters and department != 'single_department': - self._add_header(f"{department}") + self._add_header(f"{department}") for result in results: self._add_block(result) From 2cf73f937035449a760828dcc984b4d7f8f30865 Mon Sep 17 00:00:00 2001 From: Augusto Herrmann Date: Mon, 26 Aug 2024 16:58:29 -0300 Subject: [PATCH 24/30] Fix default for doc_md --- src/schemas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/schemas.py b/src/schemas.py index 14bca74..7787b8d 100644 --- a/src/schemas.py +++ b/src/schemas.py @@ -195,7 +195,7 @@ class DAGConfig(BaseModel): search: Union[List[SearchConfig], SearchConfig] = Field( description="Seção para definição da busca no Diário" ) - doc_md: Optional[str] = Field(default="", description="description") + doc_md: Optional[str] = Field(default=None, description="description") report: ReportConfig = Field( description="Aceita: `slack`, `discord`, `emails`, `attach_csv`, " "`subject`, `skip_null`" 
From 9171c1ad0cdaaabab918acc7cd4d0f490ff4c6db Mon Sep 17 00:00:00 2001 From: Augusto Herrmann Date: Mon, 26 Aug 2024 16:58:49 -0300 Subject: [PATCH 25/30] Change tags data type to set --- src/schemas.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/schemas.py b/src/schemas.py index 7787b8d..5189717 100644 --- a/src/schemas.py +++ b/src/schemas.py @@ -15,7 +15,7 @@ """ import textwrap -from typing import List, Optional, Union +from typing import List, Optional, Set, Union from pydantic import AnyHttpUrl, BaseModel, EmailStr, Field from pydantic import field_validator @@ -184,8 +184,8 @@ class DAGConfig(BaseModel): id: str = Field(description="Nome único da DAG") description: str = Field(description="Descrição da DAG") - tags: Optional[List[str]] = Field( - default=[], description="Lista de tags para filtragem da DAG no Airflow" + tags: Optional[Set[str]] = Field( + default=[], description="Conjunto de tags para filtragem da DAG no Airflow" ) owner: Optional[List[str]] = Field( default=[], description="Lista de owners para filtragem da DAG no Airflow" @@ -214,6 +214,14 @@ def cast_to_list( return [search_param] return search_param + @field_validator("tags") + @staticmethod + def add_default_tags(tags_param: Optional[Set[str]]) -> Optional[Set[str]]: + """Add default tags to the list of tags.""" + if tags_param is not None: + tags_param.update({"dou", "generated_dag"}) + return tags_param + class RoDouConfig(BaseModel): """Represents the overall configuration in the YAML file.""" From 9cd7c73e21c991a2d0db00575e333c0b159e359e Mon Sep 17 00:00:00 2001 From: Augusto Herrmann Date: Mon, 26 Aug 2024 16:59:47 -0300 Subject: [PATCH 26/30] Fix expected data structure in test parameters (partial) --- tests/parsers_test.py | 73 +++++++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 34 deletions(-) diff --git a/tests/parsers_test.py b/tests/parsers_test.py index 8698e1f..db2125c 100644 --- a/tests/parsers_test.py +++ 
b/tests/parsers_test.py @@ -70,6 +70,12 @@ "all_parameters_example.yaml", { "id": "all_parameters_example", + "schedule": "0 8 * * MON-FRI", + "dataset": None, + "description": "DAG exemplo utilizando todos os demais parâmetros.", + "doc_md": None, + "tags": {"dou", "generated_dag", "projeto_a", "departamento_x"}, + "owner": ["pessoa 1", "pessoa 2"], "search": [ { "terms": [ @@ -77,13 +83,13 @@ "governo aberto", "lei de acesso à informação", ], - "header": None, + "header": "Pesquisa no DOU", "sources": ["DOU"], "sql": None, "conn_id": None, "territory_id": None, "dou_sections": ["SECAO_1", "EDICAO_SUPLEMENTAR"], - "search_date": "MES", + "date": "MES", "field": "TUDO", "is_exact_search": True, "ignore_signature_match": True, @@ -93,28 +99,31 @@ "department": None, } ], - "emails": ["dest1@economia.gov.br", "dest2@economia.gov.br"], - "subject": "Assunto do Email", - "attach_csv": True, - "discord_webhook": None, - "slack_webhook": None, - "schedule": "0 8 * * MON-FRI", - "dataset": None, - "description": "DAG exemplo utilizando todos os demais parâmetros.", - "skip_null": True, - "doc_md": None, - "dag_tags": {"dou", "generated_dag", "projeto_a", "departamento_x"}, - "owner": "pessoa 1, pessoa 2", - "hide_filters": False, - "header_text": None, - "footer_text": None, - "no_results_found_text": "Nenhum dos termos pesquisados foi encontrado nesta consulta", + "report": { + "skip_null": True, + "emails": ["dest1@economia.gov.br", "dest2@economia.gov.br"], + "subject": "Assunto do Email", + "attach_csv": True, + "discord_webhook": None, + "slack_webhook": None, + "hide_filters": False, + "header_text": None, + "footer_text": None, + "no_results_found_text": "Nenhum dos termos pesquisados foi " + "encontrado nesta consulta", + } }, ), ( "terms_from_db_example.yaml", { "id": "terms_from_db_example", + "description": "DAG de teste", + "doc_md": None, + "dag_tags": {"dou", "generated_dag"}, + "owner": [], + "schedule": None, + "dataset": None, "search": [ { "terms": [], 
@@ -141,22 +150,18 @@ "department": None, } ], - "emails": ["destination@economia.gov.br"], - "subject": "[String] com caracteres especiais deve estar entre aspas", - "attach_csv": True, - "discord_webhook": None, - "slack_webhook": None, - "schedule": None, - "dataset": None, - "description": "DAG de teste", - "skip_null": True, - "doc_md": None, - "dag_tags": {"dou", "generated_dag"}, - "owner": [], - "hide_filters": False, - "header_text": None, - "footer_text": None, - "no_results_found_text": "Nenhum dos termos pesquisados foi encontrado nesta consulta", + "report": { + "emails": ["destination@economia.gov.br"], + "subject": "[String] com caracteres especiais deve estar entre aspas", + "attach_csv": True, + "discord_webhook": None, + "slack_webhook": None, + "skip_null": True, + "hide_filters": False, + "header_text": None, + "footer_text": None, + "no_results_found_text": "Nenhum dos termos pesquisados foi encontrado nesta consulta", + }, }, ), ( From 6b9851e2ce85f9d89ab38ff50190a6de6dd62a06 Mon Sep 17 00:00:00 2001 From: Augusto Herrmann Date: Mon, 26 Aug 2024 17:26:35 -0300 Subject: [PATCH 27/30] Fix default for dag tags --- src/schemas.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/schemas.py b/src/schemas.py index 5189717..196bdb5 100644 --- a/src/schemas.py +++ b/src/schemas.py @@ -185,7 +185,8 @@ class DAGConfig(BaseModel): id: str = Field(description="Nome único da DAG") description: str = Field(description="Descrição da DAG") tags: Optional[Set[str]] = Field( - default=[], description="Conjunto de tags para filtragem da DAG no Airflow" + default={"dou", "generated_dag"}, + description="Conjunto de tags para filtragem da DAG no Airflow", ) owner: Optional[List[str]] = Field( default=[], description="Lista de owners para filtragem da DAG no Airflow" @@ -216,10 +217,9 @@ def cast_to_list( @field_validator("tags") @staticmethod - def add_default_tags(tags_param: Optional[Set[str]]) -> Optional[Set[str]]: + def 
add_default_tags(tags_param: Optional[Set[str]]) -> Set[str]: """Add default tags to the list of tags.""" - if tags_param is not None: - tags_param.update({"dou", "generated_dag"}) + tags_param.update({"dou", "generated_dag"}) return tags_param From a178305eca621d17660af5645bea096135bff761 Mon Sep 17 00:00:00 2001 From: Augusto Herrmann Date: Mon, 26 Aug 2024 17:26:48 -0300 Subject: [PATCH 28/30] Fix expected data structure in test parameters (partial) --- tests/parsers_test.py | 368 ++++++++++++++++++++++-------------------- 1 file changed, 192 insertions(+), 176 deletions(-) diff --git a/tests/parsers_test.py b/tests/parsers_test.py index db2125c..20c65e3 100644 --- a/tests/parsers_test.py +++ b/tests/parsers_test.py @@ -56,7 +56,7 @@ "description": "DAG de teste", "skip_null": True, "doc_md": None, - "dag_tags": {"dou", "generated_dag"}, + "tags": {"dou", "generated_dag"}, "owner": [], "hide_filters": False, "header_text": None, @@ -111,7 +111,7 @@ "footer_text": None, "no_results_found_text": "Nenhum dos termos pesquisados foi " "encontrado nesta consulta", - } + }, }, ), ( @@ -120,7 +120,7 @@ "id": "terms_from_db_example", "description": "DAG de teste", "doc_md": None, - "dag_tags": {"dou", "generated_dag"}, + "tags": {"dou", "generated_dag"}, "owner": [], "schedule": None, "dataset": None, @@ -140,13 +140,13 @@ "conn_id": "example_database_conn", "territory_id": None, "dou_sections": ["TODOS"], - "search_date": "MES", + "date": "MES", "field": "TUDO", "is_exact_search": True, "ignore_signature_match": False, - "force_rematch": None, - "full_text": None, - "use_summary": None, + "force_rematch": False, + "full_text": False, + "use_summary": False, "department": None, } ], @@ -168,6 +168,12 @@ "basic_example_skip_null.yaml", { "id": "basic_example_skip_null", + "schedule": None, + "dataset": None, + "description": "DAG de teste", + "doc_md": None, + "tags": {"dou", "generated_dag"}, + "owner": [], "search": [ { "terms": ["cimentodaaroeira"], @@ -177,38 +183,48 
@@ "conn_id": None, "territory_id": None, "dou_sections": ["TODOS"], - "search_date": "DIA", + "date": "DIA", "field": "TUDO", "is_exact_search": True, "ignore_signature_match": False, - "force_rematch": None, - "full_text": None, - "use_summary": None, + "force_rematch": False, + "full_text": False, + "use_summary": False, "department": None, } ], - "emails": ["destination@economia.gov.br"], - "subject": "Teste do Ro-dou", - "attach_csv": False, - "discord_webhook": None, - "slack_webhook": None, - "schedule": None, - "dataset": None, - "description": "DAG de teste", - "skip_null": False, - "doc_md": None, - "dag_tags": {"dou", "generated_dag"}, - "owner": [], - "hide_filters": False, - "header_text": None, - "footer_text": None, - "no_results_found_text": "Nenhum dos termos pesquisados foi encontrado nesta consulta", + "report": { + "emails": ["destination@economia.gov.br"], + "subject": "Teste do Ro-dou", + "attach_csv": False, + "discord_webhook": None, + "slack_webhook": None, + "skip_null": False, + "hide_filters": False, + "header_text": None, + "footer_text": None, + "no_results_found_text": "Nenhum dos termos pesquisados foi encontrado nesta consulta", + }, }, ), ( "markdown_docs_example.yaml", { "id": "markdown_docs_example", + "schedule": None, + "dataset": None, + "description": "DAG com documentação em markdown", + "doc_md": textwrap.dedent( + """ + ## Ola! + Esta é uma DAG de exemplo com documentação em markdown. Esta descrição é opcional e pode ser definida no parâmetro `doc_md`. 
+ + * Ah, aqui você também pode usar *markdown* para + * escrever listas, por exemplo, + * ou colocar [links](graph)!""" + ).strip(), + "tags": {"dou", "generated_dag"}, + "owner": [], "search": [ { "terms": [ @@ -222,46 +238,40 @@ "conn_id": None, "territory_id": None, "dou_sections": ["TODOS"], - "search_date": "DIA", + "date": "DIA", "field": "TUDO", "is_exact_search": True, "ignore_signature_match": False, - "force_rematch": None, - "full_text": None, - "use_summary": None, + "force_rematch": False, + "full_text": False, + "use_summary": False, "department": None, } ], - "emails": ["destination@economia.gov.br"], - "subject": "Teste do Ro-dou", - "attach_csv": False, - "discord_webhook": None, - "slack_webhook": None, - "schedule": None, - "dataset": None, - "description": "DAG com documentação em markdown", - "skip_null": True, - "doc_md": textwrap.dedent( - """ - ## Ola! - Esta é uma DAG de exemplo com documentação em markdown. Esta descrição é opcional e pode ser definida no parâmetro `doc_md`. 
- - * Ah, aqui você também pode usar *markdown* para - * escrever listas, por exemplo, - * ou colocar [links](graph)!""" - ).strip(), - "dag_tags": {"dou", "generated_dag"}, - "owner": [], - "hide_filters": False, - "header_text": None, - "footer_text": None, - "no_results_found_text": "Nenhum dos termos pesquisados foi encontrado nesta consulta", + "report": { + "emails": ["destination@economia.gov.br"], + "subject": "Teste do Ro-dou", + "attach_csv": False, + "skip_null": True, + "discord_webhook": None, + "slack_webhook": None, + "hide_filters": False, + "header_text": None, + "footer_text": None, + "no_results_found_text": "Nenhum dos termos pesquisados foi encontrado nesta consulta", + }, }, ), ( "department_example.yaml", { "id": "department_example", + "schedule": None, + "dataset": None, + "description": "DAG de teste (filtro por departamento)", + "doc_md": None, + "tags": {"dou", "generated_dag"}, + "owner": [], "search": [ { "terms": ["dados abertos"], @@ -271,41 +281,43 @@ "conn_id": None, "territory_id": None, "dou_sections": ["TODOS"], - "search_date": "DIA", + "date": "DIA", "field": "TUDO", "is_exact_search": True, "ignore_signature_match": False, - "force_rematch": None, - "full_text": None, - "use_summary": None, + "force_rematch": False, + "full_text": False, + "use_summary": False, "department": [ "Ministério da Gestão e da Inovação em Serviços Públicos", "Ministério da Defesa", ], } ], - "emails": ["destination@economia.gov.br"], - "subject": "Teste do Ro-dou", - "attach_csv": False, - "discord_webhook": None, - "slack_webhook": None, - "schedule": None, - "dataset": None, - "description": "DAG de teste (filtro por departamento)", - "skip_null": True, - "doc_md": None, - "dag_tags": {"dou", "generated_dag"}, - "owner": [], - "hide_filters": False, - "header_text": None, - "footer_text": None, - "no_results_found_text": "Nenhum dos termos pesquisados foi encontrado nesta consulta", + "report": { + "emails": ["destination@economia.gov.br"], + 
"subject": "Teste do Ro-dou", + "attach_csv": False, + "skip_null": True, + "discord_webhook": None, + "slack_webhook": None, + "hide_filters": False, + "header_text": None, + "footer_text": None, + "no_results_found_text": "Nenhum dos termos pesquisados foi encontrado nesta consulta", + }, }, ), ( "inlabs_example.yaml", { "id": "inlabs_example", + "schedule": "0 8 * * MON-FRI", + "dataset": "inlabs", + "description": "DAG de teste", + "doc_md": None, + "tags": {"dou", "generated_dag", "inlabs"}, + "owner": ["cdata"], "search": [ { "terms": ["tecnologia", "informação"], @@ -315,38 +327,41 @@ "conn_id": None, "territory_id": None, "dou_sections": ["TODOS"], - "search_date": "DIA", + "date": "DIA", "field": "TUDO", "is_exact_search": True, "ignore_signature_match": False, - "force_rematch": None, - "full_text": None, + "force_rematch": False, + "full_text": False, "use_summary": True, "department": None, } ], - "emails": ["destination@economia.gov.br"], - "subject": "Teste do Ro-dou", - "attach_csv": True, - "discord_webhook": None, - "slack_webhook": None, - "schedule": "0 8 * * MON-FRI", - "dataset": "inlabs", - "description": "DAG de teste", - "skip_null": True, - "doc_md": None, - "dag_tags": {"dou", "generated_dag", "inlabs"}, - "owner": "cdata", - "hide_filters": False, - "header_text": None, - "footer_text": None, - "no_results_found_text": "Nenhum dos termos pesquisados foi encontrado nesta consulta", + "report": { + "emails": ["destination@economia.gov.br"], + "subject": "Teste do Ro-dou", + "attach_csv": True, + "skip_null": True, + "discord_webhook": None, + "slack_webhook": None, + "hide_filters": False, + "header_text": None, + "footer_text": None, + "no_results_found_text": "Nenhum dos termos pesquisados foi encontrado nesta consulta", + }, }, ), ( "inlabs_advanced_search_example.yaml", { "id": "inlabs_advanced_search_example", + "schedule": None, + "dataset": "inlabs", + "description": "DAG de teste", + "skip_null": True, + "doc_md": None, + "tags": 
{"dou", "generated_dag", "inlabs"}, + "owner": ["cdata"], "search": [ { "terms": [ @@ -359,38 +374,39 @@ "conn_id": None, "territory_id": None, "dou_sections": ["TODOS"], - "search_date": "DIA", + "date": "DIA", "field": "TUDO", "is_exact_search": True, "ignore_signature_match": False, - "force_rematch": None, - "full_text": None, - "use_summary": None, + "force_rematch": False, + "full_text": False, + "use_summary": False, "department": None, } ], - "emails": ["destination@economia.gov.br"], - "subject": "Teste do Ro-dou", - "attach_csv": True, - "discord_webhook": None, - "slack_webhook": None, - "schedule": None, - "dataset": "inlabs", - "description": "DAG de teste", - "skip_null": True, - "doc_md": None, - "dag_tags": {"dou", "generated_dag", "inlabs"}, - "owner": "cdata", - "hide_filters": False, - "header_text": None, - "footer_text": None, - "no_results_found_text": "Nenhum dos termos pesquisados foi encontrado nesta consulta", + "report": { + "emails": ["destination@economia.gov.br"], + "subject": "Teste do Ro-dou", + "attach_csv": True, + "discord_webhook": None, + "slack_webhook": None, + "hide_filters": False, + "header_text": None, + "footer_text": None, + "no_results_found_text": "Nenhum dos termos pesquisados foi encontrado nesta consulta", + }, }, ), ( "multiple_searchs_example.yaml", { "id": "multiple_searchs_example", + "schedule": "0 8 * * MON-FRI", + "dataset": None, + "description": "DAG de teste com múltiplas buscas", + "doc_md": None, + "tags": {"dou", "generated_dag", "inlabs"}, + "owner": [], "search": [ { "terms": [ @@ -404,13 +420,13 @@ "conn_id": None, "territory_id": None, "dou_sections": ["TODOS"], - "search_date": "DIA", + "date": "DIA", "field": "TUDO", "is_exact_search": True, "ignore_signature_match": True, "force_rematch": True, - "full_text": None, - "use_summary": None, + "full_text": False, + "use_summary": False, "department": None, }, { @@ -425,38 +441,40 @@ "conn_id": None, "territory_id": None, "dou_sections": ["TODOS"], - 
"search_date": "DIA", + "date": "DIA", "field": "TUDO", "is_exact_search": True, "ignore_signature_match": True, "force_rematch": True, - "full_text": None, - "use_summary": None, + "full_text": False, + "use_summary": False, "department": None, }, ], - "emails": ["destination@economia.gov.br"], - "subject": "Teste do Ro-dou", - "attach_csv": False, - "discord_webhook": None, - "slack_webhook": None, - "schedule": "0 8 * * MON-FRI", - "dataset": None, - "description": "DAG de teste com múltiplas buscas", - "skip_null": False, - "doc_md": None, - "dag_tags": {"dou", "generated_dag", "inlabs"}, - "owner": [], - "hide_filters": False, - "header_text": None, - "footer_text": None, - "no_results_found_text": "Nenhum dos termos pesquisados foi encontrado nesta consulta", + "report": { + "emails": ["destination@economia.gov.br"], + "subject": "Teste do Ro-dou", + "attach_csv": False, + "skip_null": False, + "discord_webhook": None, + "slack_webhook": None, + "hide_filters": False, + "header_text": None, + "footer_text": None, + "no_results_found_text": "Nenhum dos termos pesquisados foi encontrado nesta consulta", + }, }, ), ( "hide_filters_example.yaml", { "id": "hide_filters_example", + "schedule": "0 8 * * MON-FRI", + "dataset": None, + "description": "DAG de teste", + "doc_md": None, + "tags": {"dou", "inlabs", "generated_dag"}, + "owner": [], "search": [ { "terms": ["tecnologia", "informação"], @@ -466,41 +484,43 @@ "conn_id": None, "territory_id": None, "dou_sections": ["TODOS"], - "search_date": "DIA", + "date": "DIA", "field": "TUDO", "is_exact_search": True, "ignore_signature_match": False, - "force_rematch": None, - "full_text": None, - "use_summary": None, + "force_rematch": False, + "full_text": False, + "use_summary": False, "department": [ "Ministério da Gestão e da Inovação em Serviços Públicos", "Ministério da Defesa", ], } ], - "emails": ["destination@economia.gov.br"], - "subject": "Teste do Ro-dou", - "attach_csv": True, - "discord_webhook": None, - 
"slack_webhook": None, - "schedule": "0 8 * * MON-FRI", - "dataset": None, - "description": "DAG de teste", - "skip_null": True, - "doc_md": None, - "dag_tags": {"dou", "inlabs", "generated_dag"}, - "owner": [], - "hide_filters": True, - "header_text": None, - "footer_text": None, - "no_results_found_text": "Nenhum dos termos pesquisados foi encontrado nesta consulta", + "report": { + "emails": ["destination@economia.gov.br"], + "subject": "Teste do Ro-dou", + "attach_csv": True, + "skip_null": True, + "discord_webhook": None, + "slack_webhook": None, + "hide_filters": True, + "header_text": None, + "footer_text": None, + "no_results_found_text": "Nenhum dos termos pesquisados foi encontrado nesta consulta", + }, }, ), ( "header_and_footer_example.yaml", { "id": "header_and_footer_example", + "schedule": "0 8 * * MON-FRI", + "dataset": None, + "description": "DAG de teste", + "doc_md": None, + "tags": {"dou", "generated_dag"}, + "owner": [], "search": [ { "terms": ["tecnologia", "informação"], @@ -510,32 +530,28 @@ "conn_id": None, "territory_id": None, "dou_sections": ["TODOS"], - "search_date": "DIA", + "date": "DIA", "field": "TUDO", "is_exact_search": True, "ignore_signature_match": False, - "force_rematch": None, - "full_text": None, - "use_summary": None, + "force_rematch": False, + "full_text": False, + "use_summary": False, "department": None, } ], - "emails": ["destination@economia.gov.br"], - "subject": "Teste do Ro-dou", - "attach_csv": False, - "discord_webhook": None, - "slack_webhook": None, - "schedule": "0 8 * * MON-FRI", - "dataset": None, - "description": "DAG de teste", - "skip_null": True, - "doc_md": None, - "dag_tags": {"dou", "generated_dag"}, - "owner": [], - "hide_filters": False, - "header_text": "

Greetings

", - "footer_text": "

Best Regards

", - "no_results_found_text": "No results found", + "report": { + "emails": ["destination@economia.gov.br"], + "subject": "Teste do Ro-dou", + "attach_csv": False, + "skip_null": True, + "discord_webhook": None, + "slack_webhook": None, + "hide_filters": False, + "header_text": "

Greetings

", + "footer_text": "

Best Regards

", + "no_results_found_text": "No results found", + }, }, ), ], From 0e003194601a7c375aedc3220653d15630146e21 Mon Sep 17 00:00:00 2001 From: Eduardo Lauer Date: Tue, 27 Aug 2024 11:44:24 -0300 Subject: [PATCH 29/30] fix fixture terms_from_db_example --- tests/parsers_test.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/tests/parsers_test.py b/tests/parsers_test.py index 20c65e3..ef2bb4e 100644 --- a/tests/parsers_test.py +++ b/tests/parsers_test.py @@ -126,18 +126,22 @@ "dataset": None, "search": [ { - "terms": [], + "terms": { + "from_airflow_variable": None, + "from_db_select": { + "sql": ( + "SELECT 'cloroquina' as TERMO, 'Ações inefetivas' as GRUPO " + "UNION SELECT 'ivermectina' as TERMO, 'Ações inefetivas' as GRUPO " + "UNION SELECT 'vacina contra covid' as TERMO, 'Ações efetivas' as GRUPO " + "UNION SELECT 'higienização das mãos' as TERMO, 'Ações efetivas' as GRUPO " + "UNION SELECT 'uso de máscara' as TERMO, 'Ações efetivas' as GRUPO " + "UNION SELECT 'distanciamento social' as TERMO, 'Ações efetivas' as GRUPO\n" + ), + "conn_id": "example_database_conn", + } + }, "header": None, "sources": ["DOU"], - "sql": ( - "SELECT 'cloroquina' as TERMO, 'Ações inefetivas' as GRUPO " - "UNION SELECT 'ivermectina' as TERMO, 'Ações inefetivas' as GRUPO " - "UNION SELECT 'vacina contra covid' as TERMO, 'Ações efetivas' as GRUPO " - "UNION SELECT 'higienização das mãos' as TERMO, 'Ações efetivas' as GRUPO " - "UNION SELECT 'uso de máscara' as TERMO, 'Ações efetivas' as GRUPO " - "UNION SELECT 'distanciamento social' as TERMO, 'Ações efetivas' as GRUPO\n" - ), - "conn_id": "example_database_conn", "territory_id": None, "dou_sections": ["TODOS"], "date": "MES", From 71f0b9c6eea308a6c0aa33fcb235e5ec399cf037 Mon Sep 17 00:00:00 2001 From: Augusto Herrmann Date: Tue, 27 Aug 2024 12:09:31 -0300 Subject: [PATCH 30/30] Remove old dataclasses --- src/parsers.py | 42 ------------------------------------------ 1 file changed, 42 
deletions(-) diff --git a/src/parsers.py b/src/parsers.py index b257aa0..f895c23 100644 --- a/src/parsers.py +++ b/src/parsers.py @@ -12,48 +12,6 @@ from schemas import RoDouConfig, DAGConfig -# TODO: remove old dataclasses -# @dataclass -# class SearchConfig: -# header: str -# sources: List[str] -# territory_id: int -# dou_sections: List[str] -# field: str -# search_date: str -# is_exact_search: bool -# ignore_signature_match: bool -# force_rematch: bool -# full_text: bool -# use_summary: bool -# terms: List[str] -# sql: str -# conn_id: str -# department: List[str] - - -# @dataclass -# class DAGConfig: -# dag_id: str -# search: List[SearchConfig] -# emails: List[str] -# subject: str -# attach_csv: bool -# discord_webhook: str -# slack_webhook: str -# schedule: str -# dataset: str -# description: str -# skip_null: bool -# doc_md: str -# dag_tags: Set[str] -# owner: str -# hide_filters: bool -# header_text: str -# footer_text: str -# no_results_found_text: str - - class YAMLParser: """Parses YAML file and get the DAG parameters.