diff --git a/dag_confs/examples_and_tests/all_parameters_example.yaml b/dag_confs/examples_and_tests/all_parameters_example.yaml
index eba3db9..7acba27 100644
--- a/dag_confs/examples_and_tests/all_parameters_example.yaml
+++ b/dag_confs/examples_and_tests/all_parameters_example.yaml
@@ -9,6 +9,7 @@ dag:
- pessoa 2
schedule: 0 8 * * MON-FRI
search:
+ header: Pesquisa no DOU
terms:
- dados abertos
- governo aberto
diff --git a/src/dou_dag_generator.py b/src/dou_dag_generator.py
index 321ca17..80ac49b 100755
--- a/src/dou_dag_generator.py
+++ b/src/dou_dag_generator.py
@@ -13,7 +13,6 @@
import os
import sys
import textwrap
-from dataclasses import asdict
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Union
from functools import reduce
@@ -35,6 +34,7 @@
from utils.date import get_trigger_date, template_ano_mes_dia_trigger_local_time
from notification.notifier import Notifier
from parsers import DAGConfig, YAMLParser
+from schemas import FetchTermsConfig
from searchers import BaseSearcher, DOUSearcher, QDSearcher, INLABSSearcher
@@ -91,7 +91,7 @@ def merge_two(dict1, dict2):
def result_as_html(specs: DAGConfig) -> bool:
"""Só utiliza resultado HTML apenas para email"""
- return specs.discord_webhook and specs.slack_webhook
+ return specs.report.discord and specs.report.slack
class DouDigestDagGenerator:
@@ -153,7 +153,7 @@ def prepare_doc_md(specs: DAGConfig, config_file: str) -> str:
Returns:
str: The DAG documentation in markdown format.
"""
- config = asdict(specs)
+ config = specs.model_dump()
# options that won't show in the "DAG Docs"
del config["description"]
del config["doc_md"]
@@ -201,7 +201,7 @@ def _get_safe_schedule(self, specs: DAGConfig, default_schedule: str) -> str:
"""
schedule = default_schedule
- id_based_minute = self._hash_dag_id(specs.dag_id, 60)
+ id_based_minute = self._hash_dag_id(specs.id, 60)
schedule_without_min = " ".join(schedule.split(" ")[1:])
schedule = f"{id_based_minute} {schedule_without_min}"
@@ -262,7 +262,7 @@ def generate_dags(self):
for filepath in files_list:
dag_specs = self.parser(filepath).parse()
- dag_id = dag_specs.dag_id
+ dag_id = dag_specs.id
globals()[dag_id] = self.create_dag(dag_specs, filepath)
def perform_searches(
@@ -385,9 +385,7 @@ def create_dag(self, specs: DAGConfig, config_file: str) -> DAG:
the term_list from a database
"""
# Prepare the markdown documentation
- doc_md = (
- self.prepare_doc_md(specs, config_file) if specs.doc_md else specs.doc_md
- )
+ doc_md = self.prepare_doc_md(specs, config_file) if specs.doc_md else None
# DAG parameters
default_args = {
"owner": specs.owner,
@@ -401,64 +399,78 @@ def create_dag(self, specs: DAGConfig, config_file: str) -> DAG:
schedule = self._update_schedule(specs)
dag = DAG(
- specs.dag_id,
+ specs.id,
default_args=default_args,
schedule=schedule,
description=specs.description,
doc_md=doc_md,
catchup=False,
params={"trigger_date": "2022-01-02T12:00"},
- tags=specs.dag_tags,
+ tags=specs.tags,
)
with dag:
with TaskGroup(group_id="exec_searchs") as tg_exec_searchs:
- counter = 0
- for subsearch in specs.search:
- counter += 1
- if subsearch["sql"]:
+
+ # is it a single search or a list of searchers?
+ if isinstance(specs.search, list):
+ searches = specs.search
+ else:
+ searches = [specs.search]
+
+ for counter, subsearch in enumerate(searches, 1):
+
+ # are terms to be fetched from a database?
+ terms_come_from_db: bool = isinstance(
+ subsearch.terms, FetchTermsConfig
+                    ) and subsearch.terms.from_db_select is not None
+
+ # determine the terms list
+ term_list = []
+ # is it a directly defined list of terms or is it a
+ # configuration for fetching terms from a data source?
+ if isinstance(subsearch.terms, list):
+ term_list = subsearch.terms
+ elif terms_come_from_db:
select_terms_from_db_task = PythonOperator(
task_id=f"select_terms_from_db_{counter}",
python_callable=self.select_terms_from_db,
op_kwargs={
- "sql": subsearch["sql"],
- "conn_id": subsearch["conn_id"],
+ "sql": subsearch.terms.from_db_select.sql,
+ "conn_id": subsearch.terms.from_db_select.conn_id,
},
)
- term_list = (
- "{{ ti.xcom_pull(task_ids='exec_searchs.select_terms_from_db_"
- + str(counter)
- + "') }}"
- )
+ term_list = (
+ "{{ ti.xcom_pull(task_ids='exec_searchs.select_terms_from_db_"
+ + str(counter)
+ + "') }}"
+ )
exec_search_task = PythonOperator(
task_id=f"exec_search_{counter}",
python_callable=self.perform_searches,
op_kwargs={
- "header": subsearch["header"],
- "sources": subsearch["sources"],
- "territory_id": subsearch["territory_id"],
- "term_list": subsearch["terms"] or term_list,
- "dou_sections": subsearch["dou_sections"],
- "search_date": subsearch["search_date"],
- "field": subsearch["field"],
- "is_exact_search": subsearch["is_exact_search"],
- "ignore_signature_match": subsearch[
- "ignore_signature_match"
- ],
- "force_rematch": subsearch["force_rematch"],
- "full_text": subsearch["full_text"],
- "use_summary": subsearch["use_summary"],
- "department": subsearch["department"],
+ "header": subsearch.header,
+ "sources": subsearch.sources,
+ "territory_id": subsearch.territory_id,
+ "term_list": term_list,
+ "dou_sections": subsearch.dou_sections,
+ "search_date": subsearch.date,
+ "field": subsearch.field,
+ "is_exact_search": subsearch.is_exact_search,
+ "ignore_signature_match": subsearch.ignore_signature_match,
+ "force_rematch": subsearch.force_rematch,
+ "full_text": subsearch.full_text,
+ "use_summary": subsearch.use_summary,
+ "department": subsearch.department,
"result_as_email": result_as_html(specs),
},
)
- if subsearch["sql"]:
- (
- select_terms_from_db_task >> exec_search_task
- ) # pylint: disable=pointless-statement
+ if terms_come_from_db:
+ # pylint: disable=pointless-statement
+ select_terms_from_db_task >> exec_search_task
has_matches_task = BranchPythonOperator(
task_id="has_matches",
@@ -467,12 +479,12 @@ def create_dag(self, specs: DAGConfig, config_file: str) -> DAG:
"search_result": "{{ ti.xcom_pull(task_ids="
+ str(
[
- f"exec_searchs.exec_search_{count + 1}"
- for count in range(counter)
+ f"exec_searchs.exec_search_{count}"
+ for count in range(1, len(searches) + 1)
]
)
+ ") }}",
- "skip_null": specs.skip_null,
+ "skip_null": specs.report.skip_null,
},
)
@@ -485,8 +497,8 @@ def create_dag(self, specs: DAGConfig, config_file: str) -> DAG:
"search_report": "{{ ti.xcom_pull(task_ids="
+ str(
[
- f"exec_searchs.exec_search_{count + 1}"
- for count in range(counter)
+ f"exec_searchs.exec_search_{count}"
+ for count in range(1, len(searches) + 1)
]
)
+ ") }}",
@@ -494,6 +506,7 @@ def create_dag(self, specs: DAGConfig, config_file: str) -> DAG:
},
)
+ # pylint: disable=pointless-statement
tg_exec_searchs >> has_matches_task
has_matches_task >> [send_notification_task, skip_notification_task]
diff --git a/src/notification/discord_sender.py b/src/notification/discord_sender.py
index ca8046e..c1a2072 100644
--- a/src/notification/discord_sender.py
+++ b/src/notification/discord_sender.py
@@ -1,17 +1,20 @@
-import requests
import re
+
+import requests
+
from notification.isender import ISender
+from schemas import ReportConfig
class DiscordSender(ISender):
highlight_tags = ("__", "__")
- def __init__(self, specs) -> None:
- self.webhook_url = specs.discord_webhook
- self.hide_filters = specs.hide_filters
- self.header_text = specs.header_text
- self.footer_text = specs.footer_text
- self.no_results_found_text = specs.no_results_found_text
+ def __init__(self, report_config: ReportConfig) -> None:
+ self.webhook_url = report_config.discord["webhook"]
+ self.hide_filters = report_config.hide_filters
+ self.header_text = report_config.header_text
+ self.footer_text = report_config.footer_text
+ self.no_results_found_text = report_config.no_results_found_text
def send(self, search_report: list, report_date: str = None):
"""Parse the content, and send message to Discord"""
@@ -73,4 +76,4 @@ def _remove_html_tags(self, text):
# Define a regular expression pattern to match HTML tags
clean = re.compile('<.*?>')
# Substitute HTML tags with an empty string
- return re.sub(clean, '', text)
\ No newline at end of file
+ return re.sub(clean, '', text)
diff --git a/src/notification/email_sender.py b/src/notification/email_sender.py
index 021d5f1..fd177e8 100644
--- a/src/notification/email_sender.py
+++ b/src/notification/email_sender.py
@@ -1,7 +1,10 @@
+"""Module for sending emails.
+"""
+
import os
import sys
-import textwrap
from tempfile import NamedTemporaryFile
+import textwrap
import markdown
import pandas as pd
@@ -14,12 +17,17 @@
sys.path.insert(0, parent_dir)
from notification.isender import ISender
+from schemas import ReportConfig
class EmailSender(ISender):
+ """Prepare and send e-mails with the reports."""
+
highlight_tags = ("", "")
- def __init__(self, specs) -> None:
- self.specs = specs
+
+ def __init__(self, report_config: ReportConfig) -> None:
+ self.report_config = report_config
+ self.search_report = ""
self.watermark = """
Esta pesquisa foi realizada automaticamente pelo
Ro-DOU
@@ -29,27 +37,27 @@ def __init__(self, specs) -> None:
def send(self, search_report: list, report_date: str):
"""Builds the email content, the CSV if applies, and send it"""
self.search_report = search_report
- full_subject = f"{self.specs.subject} - DOs de {report_date}"
+ full_subject = f"{self.report_config.subject} - DOs de {report_date}"
skip_notification = True
for search in self.search_report:
items = ["contains" for k, v in search["result"].items() if v]
if items:
skip_notification = False
else:
- content = self.specs.no_results_found_text
+ content = self.report_config.no_results_found_text
if skip_notification:
- if self.specs.skip_null:
+ if self.report_config.skip_null:
return "skip_notification"
else:
content = self.generate_email_content()
content += self.watermark
- if self.specs.attach_csv and skip_notification is False:
+ if self.report_config.attach_csv and skip_notification is False:
with self.get_csv_tempfile() as csv_file:
send_email(
- to=self.specs.emails,
+ to=self.report_config.emails,
subject=full_subject,
files=[csv_file.name],
html_content=content,
@@ -57,7 +65,7 @@ def send(self, search_report: list, report_date: str):
)
else:
send_email(
- to=self.specs.emails,
+ to=self.report_config.emails,
subject=full_subject,
html_content=content,
mime_charset="utf-8",
@@ -72,18 +80,18 @@ def generate_email_content(self) -> str:
parent_directory = os.path.dirname(current_directory)
file_path = os.path.join(parent_directory, "report_style.css")
- with open(file_path, "r") as f:
+ with open(file_path, "r", encoding="utf-8") as f:
blocks = [f""]
- if self.specs.header_text:
- blocks.append(self.specs.header_text)
+ if self.report_config.header_text:
+ blocks.append(self.report_config.header_text)
for search in self.search_report:
if search["header"]:
blocks.append(f"{search['header']}
")
- if not self.specs.hide_filters:
+ if not self.report_config.hide_filters:
if search["department"]:
blocks.append(
"""
Filtrando resultados somente para:
"""
@@ -95,47 +103,50 @@ def generate_email_content(self) -> str:
for group, search_results in search["result"].items():
- if not search_results:
- blocks.append(
- f"{self.specs.no_results_found_text}.
"
- )
+            if not search_results:
+ blocks.append(f"{self.report_config.no_results_found_text}.
")
else:
- if not self.specs.hide_filters:
- if group != "single_group":
- blocks.append("\n")
- blocks.append(f"**Grupo: {group}**")
- blocks.append("\n\n")
-
- for term, term_results in search_results.items():
+ if not self.report_config.hide_filters:
+ if group != "single_group":
blocks.append("\n")
- if not self.specs.hide_filters:
- blocks.append(f"* # Resultados para: {term}")
-
- for department, results in term_results.items():
-
- if not self.specs.hide_filters and department != 'single_department':
- blocks.append(f"**{department}**")
-
- for result in results:
- if not self.specs.hide_filters:
- sec_desc = result["section"]
- item_html = f"""
- {sec_desc}
- ### [{result['title']}]({result['href']})
- {result['abstract']}
- {result['date']}
"""
- blocks.append(
- textwrap.indent(textwrap.dedent(item_html), " " * 4)
+ blocks.append(f"**Grupo: {group}**")
+ blocks.append("\n\n")
+
+                for term, term_results in search_results.items():
+ blocks.append("\n")
+ if not self.report_config.hide_filters:
+ blocks.append(f"* # Resultados para: {term}")
+
+ for department, results in term_results.items():
+
+ if (
+ not self.report_config.hide_filters
+ and department != "single_department"
+ ):
+ blocks.append(f"**{department}**")
+
+ for result in results:
+ if not self.report_config.hide_filters:
+ sec_desc = result["section"]
+ item_html = f"""
+ {sec_desc}
+ ### [{result['title']}]({result['href']})
+ {result['abstract']}
+ {result['date']}
"""
+ blocks.append(
+ textwrap.indent(
+ textwrap.dedent(item_html), " " * 4
)
- else:
- item_html = f"""
- ### [{result['title']}]({result['href']})
- {result['abstract']}
"""
- blocks.append(textwrap.dedent(item_html))
+ )
+ else:
+ item_html = f"""
+ ### [{result['title']}]({result['href']})
+ {result['abstract']}
"""
+ blocks.append(textwrap.dedent(item_html))
blocks.append("---")
- if self.specs.footer_text:
- blocks.append(self.specs.footer_text)
+ if self.report_config.footer_text:
+ blocks.append(self.report_config.footer_text)
return markdown.markdown("\n".join(blocks))
@@ -188,11 +199,15 @@ def convert_report_dict_to_tuple_list(self) -> list:
for term, departments in results.items():
for department, dpt_matches in departments.items():
for match in dpt_matches:
- tuple_list.append(repack_match(header, group, term, department, match))
+ tuple_list.append(
+ repack_match(header, group, term, department, match)
+ )
return tuple_list
-def repack_match(header: str, group: str, search_term: str, department: str, match: dict) -> tuple:
+def repack_match(
+ header: str, group: str, search_term: str, department: str, match: dict
+) -> tuple:
return (
header,
group,
diff --git a/src/notification/notifier.py b/src/notification/notifier.py
index 84f8264..604801d 100644
--- a/src/notification/notifier.py
+++ b/src/notification/notifier.py
@@ -26,15 +26,21 @@ class Notifier:
def __init__(self, specs: DAGConfig) -> None:
self.senders = []
- if specs.emails:
- self.senders.append(EmailSender(specs))
- if specs.discord_webhook:
- self.senders.append(DiscordSender(specs))
- if specs.slack_webhook:
- self.senders.append(SlackSender(specs))
+ if specs.report.emails:
+ self.senders.append(EmailSender(specs.report))
+ if specs.report.discord:
+ self.senders.append(DiscordSender(specs.report))
+ if specs.report.slack:
+ self.senders.append(SlackSender(specs.report))
def send_notification(self, search_report: str, report_date: str):
+ """Sends the notification to the specified email, Discord or Slack
+
+ Args:
+ search_report (str): The report to be sent
+ report_date (str): The date of the report
+ """
# Convert to data structure after it's retrieved from xcom
search_report = ast.literal_eval(search_report)
diff --git a/src/notification/slack_sender.py b/src/notification/slack_sender.py
index 3158f49..8b636d5 100644
--- a/src/notification/slack_sender.py
+++ b/src/notification/slack_sender.py
@@ -1,20 +1,27 @@
+"""Send reports to Slack.
+"""
+
from datetime import datetime
+import re
import requests
-import re
from notification.isender import ISender
+from schemas import ReportConfig
+
class SlackSender(ISender):
+ """Prepare a report and send it to Slack.
+ """
highlight_tags = ("*", "*")
- def __init__(self, specs) -> None:
- self.webhook_url = specs.slack_webhook
+ def __init__(self, report_config: ReportConfig) -> None:
+ self.webhook_url = report_config.slack["webhook"]
self.blocks = []
- self.hide_filters = specs.hide_filters
- self.header_text = specs.header_text
- self.footer_text = specs.footer_text
- self.no_results_found_text = specs.no_results_found_text
+ self.hide_filters = report_config.hide_filters
+ self.header_text = report_config.header_text
+ self.footer_text = report_config.footer_text
+ self.no_results_found_text = report_config.no_results_found_text
def send(self, search_report: list, report_date: str = None):
"""Parse the content, and send message to Slack"""
@@ -42,7 +49,7 @@ def send(self, search_report: list, report_date: str = None):
for department, results in term_results.items():
if not self.hide_filters and department != 'single_department':
- self._add_header(f"{department}")
+ self._add_header(f"{department}")
for result in results:
self._add_block(result)
@@ -117,8 +124,9 @@ def _format_date(date_str: str) -> str:
_from, _to = WEEKDAYS_EN_TO_PT[date.weekday()]
return date.strftime("%a %d/%m").replace(_from, _to)
+
def _remove_html_tags(text):
# Define a regular expression pattern to match HTML tags
- clean = re.compile('<.*?>')
+ clean = re.compile("<.*?>")
# Substitute HTML tags with an empty string
- return re.sub(clean, '', text)
\ No newline at end of file
+ return re.sub(clean, "", text)
diff --git a/src/parsers.py b/src/parsers.py
index 269f2d5..f895c23 100644
--- a/src/parsers.py
+++ b/src/parsers.py
@@ -1,64 +1,18 @@
"""Abstract and concrete classes to parse DAG configuration from a file."""
-import ast
-import os
+# from dataclasses import dataclass
+import json
import textwrap
-from abc import ABC, abstractmethod
-from dataclasses import dataclass
-from typing import List, Set, Tuple, Union
-
+from typing import List, Tuple
import yaml
+
from airflow import Dataset
from airflow.models import Variable
-@dataclass
-class SearchConfig:
- header: str
- sources: List[str]
- territory_id: int
- dou_sections: List[str]
- field: str
- search_date: str
- is_exact_search: bool
- ignore_signature_match: bool
- force_rematch: bool
- full_text: bool
- use_summary: bool
- terms: List[str]
- sql: str
- conn_id: str
- department: List[str]
-
-
-@dataclass
-class DAGConfig:
- dag_id: str
- search: List[SearchConfig]
- emails: List[str]
- subject: str
- attach_csv: bool
- discord_webhook: str
- slack_webhook: str
- schedule: str
- dataset: str
- description: str
- skip_null: bool
- doc_md: str
- dag_tags: Set[str]
- owner: str
- hide_filters: bool
- header_text: str
- footer_text: str
- no_results_found_text: str
-
-
-class FileParser(ABC):
- """Abstract class to build file parsers with DAG configuration."""
-
- @abstractmethod
- def parse(self):
- pass
-class YAMLParser(FileParser):
+from schemas import RoDouConfig, DAGConfig
+
+
+class YAMLParser:
"""Parses YAML file and get the DAG parameters.
It guarantees that mandatory fields are in place and are properly
@@ -68,16 +22,20 @@ class YAMLParser(FileParser):
def __init__(self, filepath: str):
self.filepath = filepath
- def parse(self) -> DAGConfig:
- return self._parse_yaml()
+ def read(self) -> dict:
+ """Reads the contents of the YAML file."""
+ with open(self.filepath, "r", encoding="utf-8") as file:
+ dag_config_dict = yaml.safe_load(file)
+ return dag_config_dict
- def _parse_yaml(self) -> DAGConfig:
+ def parse(self) -> DAGConfig:
"""Processes the config file in order to instantiate the DAG in
Airflow.
"""
- with open(self.filepath, "r") as file:
- dag_config_dict = yaml.safe_load(file)
+ config = RoDouConfig(**self.read())
+ return config.dag
+ # TODO: remove old parser code
dag = self._try_get(dag_config_dict, "dag")
dag_id = self._try_get(dag, "id")
description = self._try_get(dag, "description")
@@ -169,7 +127,7 @@ def _get_terms_params(self, search) -> Tuple[List[str], str, str]:
if "from_airflow_variable" in terms:
var_value = Variable.get(terms.get("from_airflow_variable"))
try:
- terms = ast.literal_eval(var_value)
+ terms = json.loads(var_value)
except (ValueError, SyntaxError):
terms = var_value.splitlines()
elif "from_db_select" in terms:
diff --git a/src/schemas.py b/src/schemas.py
new file mode 100644
index 0000000..196bdb5
--- /dev/null
+++ b/src/schemas.py
@@ -0,0 +1,229 @@
+"""
+This module defines the Pydantic models for validating the structure of
+the YAML files used in the application.
+
+The main classes are:
+
+- `SearchTerms`: search terms in the YAML file.
+- `Search`: search configuration in the YAML file.
+- `Report`: report configuration in the YAML file.
+- `DAG`: DAG defined in the YAML file.
+- `Config`: overall configuration in the YAML file.
+
+These models are used to validate the YAML files using the Pydantic
+library.
+"""
+
+import textwrap
+from typing import List, Optional, Set, Union
+from pydantic import AnyHttpUrl, BaseModel, EmailStr, Field
+from pydantic import field_validator
+
+
+class DBSelect(BaseModel):
+ """Represents the structure of the 'from_db_select' field in the YAML file."""
+
+ sql: str = Field(description="SQL query to fetch the search terms")
+ conn_id: str = Field(description="Airflow connection ID to use for the SQL query")
+
+
+class FetchTermsConfig(BaseModel):
+ """Represents configuration information for fetching search terms from
+ a data source."""
+
+ from_airflow_variable: Optional[str] = Field(
+ default=None,
+ description="Variável do Airflow a ser usada como termos de pesquisa",
+ )
+ from_db_select: Optional[DBSelect] = Field(
+ default=None,
+ description="Consulta SQL para buscar os termos de pesquisa em um "
+ "banco de dados",
+ )
+
+
+class SearchField(BaseModel):
+ """Represents the field for search in the YAML file."""
+
+ description: str
+ value: str
+
+
+class SearchConfig(BaseModel):
+ """Represents the search configuration in the YAML file."""
+
+ header: Optional[str] = Field(
+ default=None, description="Cabeçalho da consulta de pesquisa"
+ )
+ sources: Optional[List[str]] = Field(
+ default=["DOU"],
+ description="Lista de fontes de dados para pesquisar (Querido Diário [QD], "
+ "Diário Oficial da União [DOU], INLABS). Default: DOU.",
+ )
+ territory_id: Optional[int] = Field(
+ default=None,
+ description="ID do território no Querido Diário para filtragem "
+ "baseada em localização",
+ )
+ date: Optional[str] = Field(
+ default="DIA",
+ description="Intervalo de data para busca. Valores: DIA, SEMANA, "
+ "MES, ANO. Default: DIA",
+ )
+ dou_sections: Optional[List[str]] = Field(
+ default=["TODOS"],
+ description=textwrap.dedent(
+ """
+ Seção do Diário Oficial a procurar:
+
+ - SECAO_1
+ - SECAO_2
+ - SECAO_3
+ - EDICAO_EXTRA
+ - EDICAO_EXTRA_1A
+ - EDICAO_EXTRA_1B
+ - EDICAO_EXTRA_1D
+ - EDICAO_EXTRA_2A
+ - EDICAO_EXTRA_2B
+ - EDICAO_EXTRA_2D
+ - EDICAO_EXTRA_3A
+ - EDICAO_EXTRA_3B
+ - EDICAO_EXTRA_3D
+ - EDICAO_SUPLEMENTAR
+ - TODOS
+
+ Default: TODOS
+ """
+ ),
+ )
+ department: Optional[List[str]] = Field(
+ default=None, description="Lista de departamentos para filtrar a pesquisa"
+ )
+ terms: Union[List[str], FetchTermsConfig] = Field(
+ description="Lista de termos de pesquisa ou uma forma de buscá-los"
+ )
+ field: Optional[str] = Field(
+ default="TUDO",
+ description="Campos dos quais os termos devem ser pesquisados. "
+ "Valores: TUDO, TITULO, CONTEUDO. Default: TUDO",
+ )
+ is_exact_search: Optional[bool] = Field(
+ default=True,
+ description="Busca somente o termo exato. Valores: True ou False. "
+ "Default: True.",
+ )
+ ignore_signature_match: Optional[bool] = Field(
+ default=False,
+        description="Ignora resultados que fazem parte de assinaturas. "
+        "Valores: True ou False. Default: False.",
+ )
+ force_rematch: Optional[bool] = Field(
+ default=False,
+ description="Indica que a busca deve ser forçada, mesmo que já "
+ "tenha sido feita anteriormente. Valores: True ou False. "
+ "Default: False.",
+ )
+ full_text: Optional[bool] = Field(
+ default=False,
+ description="Define se no relatório será exibido o texto completo, "
+ "ao invés de um resumo. Valores: True ou False. Default: False. "
+ "(Funcionalidade disponível apenas no INLABS)",
+ )
+ use_summary: Optional[bool] = Field(
+ default=False,
+ description="Define se no relatório será exibido a ementa, se existir. "
+ "Valores: True ou False. Default: False. "
+ "(Funcionalidade disponível apenas no INLABS)",
+ )
+
+
+class ReportConfig(BaseModel):
+ """Represents the report configuration in the YAML file."""
+
+ slack: Optional[dict] = Field(
+ default=None, description="Configuração do webhook do Slack para relatórios"
+ )
+ discord: Optional[dict] = Field(
+ default=None, description="Configuração do webhook do Discord para relatórios"
+ )
+ emails: Optional[List[EmailStr]] = Field(
+ default=None, description="Lista de endereços de e-mail para enviar o relatório"
+ )
+ attach_csv: Optional[bool] = Field(
+ default=False,
+ description="Se deve anexar um arquivo CSV com os resultados da pesquisa."
+ "Default: False.",
+ )
+ subject: Optional[str] = Field(
+ default=None, description="Assunto do relatório por e-mail"
+ )
+ skip_null: Optional[bool] = Field(
+ default=True,
+ description="Se deve pular a notificação de resultados nulos/vazios. "
+ "Default: True.",
+ )
+ hide_filters: Optional[bool] = Field(
+ default=False,
+ description="Se deve ocultar os filtros aplicados no relatório."
+ "Default: False.",
+ )
+ header_text: Optional[str] = Field(
+ default=None, description="Texto a ser incluído no cabeçalho do relatório"
+ )
+ footer_text: Optional[str] = Field(
+ default=None, description="Texto a ser incluído no rodapé do relatório"
+ )
+ no_results_found_text: Optional[str] = Field(
+ default="Nenhum dos termos pesquisados foi encontrado nesta consulta",
+ description="Texto a ser exibido quando não há resultados",
+ )
+
+
+class DAGConfig(BaseModel):
+ """Represents the DAG configuration in the YAML file."""
+
+ id: str = Field(description="Nome único da DAG")
+ description: str = Field(description="Descrição da DAG")
+ tags: Optional[Set[str]] = Field(
+ default={"dou", "generated_dag"},
+ description="Conjunto de tags para filtragem da DAG no Airflow",
+ )
+ owner: Optional[List[str]] = Field(
+ default=[], description="Lista de owners para filtragem da DAG no Airflow"
+ )
+ schedule: Optional[str] = Field(default=None, description="Expressão cron")
+ dataset: Optional[str] = Field(default=None, description="Nome do Dataset")
+ search: Union[List[SearchConfig], SearchConfig] = Field(
+ description="Seção para definição da busca no Diário"
+ )
+    doc_md: Optional[str] = Field(
+        default=None, description="Documentação da DAG em Markdown"
+    )
+ report: ReportConfig = Field(
+ description="Aceita: `slack`, `discord`, `emails`, `attach_csv`, "
+ "`subject`, `skip_null`"
+ )
+
+ @field_validator("search")
+ @staticmethod
+ def cast_to_list(
+ search_param: Union[List[SearchConfig], SearchConfig]
+ ) -> List[SearchConfig]:
+ """Cast the value of "search" parameter to always be a list.
+ If the yaml configuration file does not use a list, convert to
+ a list with a single search.
+ """
+ if not isinstance(search_param, list):
+ return [search_param]
+ return search_param
+
+ @field_validator("tags")
+ @staticmethod
+ def add_default_tags(tags_param: Optional[Set[str]]) -> Set[str]:
+ """Add default tags to the list of tags."""
+ tags_param.update({"dou", "generated_dag"})
+ return tags_param
+
+
+class RoDouConfig(BaseModel):
+ """Represents the overall configuration in the YAML file."""
+
+ dag: DAGConfig = Field(description="Instanciação da DAG")
diff --git a/tests/discord_sender_test.py b/tests/discord_sender_test.py
index 071c3ac..1d3c18a 100644
--- a/tests/discord_sender_test.py
+++ b/tests/discord_sender_test.py
@@ -12,7 +12,7 @@ def mocked_specs():
Specs = namedtuple(
"Specs",
[
- "discord_webhook",
+ "discord",
"hide_filters",
"header_text",
"footer_text",
@@ -20,7 +20,7 @@ def mocked_specs():
],
)
return Specs(
- WEBHOOK,
+ {"webhook": WEBHOOK},
False,
None,
None,
diff --git a/tests/parsers_test.py b/tests/parsers_test.py
index 442f1ed..ef2bb4e 100644
--- a/tests/parsers_test.py
+++ b/tests/parsers_test.py
@@ -5,7 +5,7 @@
import sys
import inspect
import textwrap
-import yaml
+
import pytest
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
@@ -14,14 +14,14 @@
from dou_dag_generator import DouDigestDagGenerator, YAMLParser, DAGConfig
-
@pytest.mark.parametrize(
"filepath, result_tuple",
[
(
"basic_example.yaml",
{
- "dag_id": "basic_example",
+ "id": "basic_example",
+ "description": "DAG de teste",
"search": [
{
"terms": [
@@ -39,34 +39,43 @@
"field": "TUDO",
"is_exact_search": True,
"ignore_signature_match": False,
- "force_rematch": None,
- "full_text": None,
- "use_summary": None,
+ "force_rematch": False,
+ "full_text": False,
+ "use_summary": False,
"department": None,
}
],
- "emails": ["destination@economia.gov.br"],
- "subject": "Teste do Ro-dou",
- "attach_csv": False,
- "discord_webhook": None,
- "slack_webhook": None,
- "schedule": None,
- "dataset": None,
- "description": "DAG de teste",
- "skip_null": True,
- "doc_md": None,
- "dag_tags": {"dou", "generated_dag"},
- "owner": "",
- "hide_filters": False,
- "header_text": None,
- "footer_text": None,
- "no_results_found_text": "Nenhum dos termos pesquisados foi encontrado nesta consulta",
+            "report": {
+                "emails": ["destination@economia.gov.br"],
+                "subject": "Teste do Ro-dou",
+                "attach_csv": False,
+                "discord": None,
+                "slack": None,
+                "skip_null": True,
+                "hide_filters": False,
+                "header_text": None,
+                "footer_text": None,
+                "no_results_found_text": "Nenhum dos termos pesquisados "
+                "foi encontrado nesta consulta",
+            },
+            # DAG-level fields (not part of the report configuration):
+            "schedule": None,
+            "dataset": None,
+            "doc_md": None,
+            "tags": {"dou", "generated_dag"},
+            "owner": [],
},
),
(
"all_parameters_example.yaml",
{
- "dag_id": "all_parameters_example",
+ "id": "all_parameters_example",
+ "schedule": "0 8 * * MON-FRI",
+ "dataset": None,
+ "description": "DAG exemplo utilizando todos os demais parâmetros.",
+ "doc_md": None,
+ "tags": {"dou", "generated_dag", "projeto_a", "departamento_x"},
+ "owner": ["pessoa 1", "pessoa 2"],
"search": [
{
"terms": [
@@ -74,13 +83,13 @@
"governo aberto",
"lei de acesso à informação",
],
- "header": None,
+ "header": "Pesquisa no DOU",
"sources": ["DOU"],
"sql": None,
"conn_id": None,
"territory_id": None,
"dou_sections": ["SECAO_1", "EDICAO_SUPLEMENTAR"],
- "search_date": "MES",
+ "date": "MES",
"field": "TUDO",
"is_exact_search": True,
"ignore_signature_match": True,
@@ -90,76 +99,85 @@
"department": None,
}
],
- "emails": ["dest1@economia.gov.br", "dest2@economia.gov.br"],
- "subject": "Assunto do Email",
- "attach_csv": True,
- "discord_webhook": None,
- "slack_webhook": None,
- "schedule": "0 8 * * MON-FRI",
- "dataset": None,
- "description": "DAG exemplo utilizando todos os demais parâmetros.",
- "skip_null": True,
- "doc_md": None,
- "dag_tags": {"dou", "generated_dag", "projeto_a", "departamento_x"},
- "owner": "pessoa 1, pessoa 2",
- "hide_filters": False,
- "header_text": None,
- "footer_text": None,
- "no_results_found_text": "Nenhum dos termos pesquisados foi encontrado nesta consulta",
+ "report": {
+ "skip_null": True,
+ "emails": ["dest1@economia.gov.br", "dest2@economia.gov.br"],
+ "subject": "Assunto do Email",
+ "attach_csv": True,
+ "discord_webhook": None,
+ "slack_webhook": None,
+ "hide_filters": False,
+ "header_text": None,
+ "footer_text": None,
+ "no_results_found_text": "Nenhum dos termos pesquisados foi "
+ "encontrado nesta consulta",
+ },
},
),
(
"terms_from_db_example.yaml",
{
- "dag_id": "terms_from_db_example",
+ "id": "terms_from_db_example",
+ "description": "DAG de teste",
+ "doc_md": None,
+ "tags": {"dou", "generated_dag"},
+ "owner": [],
+ "schedule": None,
+ "dataset": None,
"search": [
{
- "terms": [],
+ "terms": {
+ "from_airflow_variable": None,
+ "from_db_select": {
+ "sql": (
+ "SELECT 'cloroquina' as TERMO, 'Ações inefetivas' as GRUPO "
+ "UNION SELECT 'ivermectina' as TERMO, 'Ações inefetivas' as GRUPO "
+ "UNION SELECT 'vacina contra covid' as TERMO, 'Ações efetivas' as GRUPO "
+ "UNION SELECT 'higienização das mãos' as TERMO, 'Ações efetivas' as GRUPO "
+ "UNION SELECT 'uso de máscara' as TERMO, 'Ações efetivas' as GRUPO "
+ "UNION SELECT 'distanciamento social' as TERMO, 'Ações efetivas' as GRUPO\n"
+ ),
+ "conn_id": "example_database_conn",
+ }
+ },
"header": None,
"sources": ["DOU"],
- "sql": (
- "SELECT 'cloroquina' as TERMO, 'Ações inefetivas' as GRUPO "
- "UNION SELECT 'ivermectina' as TERMO, 'Ações inefetivas' as GRUPO "
- "UNION SELECT 'vacina contra covid' as TERMO, 'Ações efetivas' as GRUPO "
- "UNION SELECT 'higienização das mãos' as TERMO, 'Ações efetivas' as GRUPO "
- "UNION SELECT 'uso de máscara' as TERMO, 'Ações efetivas' as GRUPO "
- "UNION SELECT 'distanciamento social' as TERMO, 'Ações efetivas' as GRUPO\n"
- ),
- "conn_id": "example_database_conn",
"territory_id": None,
"dou_sections": ["TODOS"],
- "search_date": "MES",
+ "date": "MES",
"field": "TUDO",
"is_exact_search": True,
"ignore_signature_match": False,
- "force_rematch": None,
- "full_text": None,
- "use_summary": None,
+ "force_rematch": False,
+ "full_text": False,
+ "use_summary": False,
"department": None,
}
],
- "emails": ["destination@economia.gov.br"],
- "subject": "[String] com caracteres especiais deve estar entre aspas",
- "attach_csv": True,
- "discord_webhook": None,
- "slack_webhook": None,
- "schedule": None,
- "dataset": None,
- "description": "DAG de teste",
- "skip_null": True,
- "doc_md": None,
- "dag_tags": {"dou", "generated_dag"},
- "owner": "",
- "hide_filters": False,
- "header_text": None,
- "footer_text": None,
- "no_results_found_text": "Nenhum dos termos pesquisados foi encontrado nesta consulta",
+ "report": {
+ "emails": ["destination@economia.gov.br"],
+ "subject": "[String] com caracteres especiais deve estar entre aspas",
+ "attach_csv": True,
+ "discord_webhook": None,
+ "slack_webhook": None,
+ "skip_null": True,
+ "hide_filters": False,
+ "header_text": None,
+ "footer_text": None,
+ "no_results_found_text": "Nenhum dos termos pesquisados foi encontrado nesta consulta",
+ },
},
),
(
"basic_example_skip_null.yaml",
{
- "dag_id": "basic_example_skip_null",
+ "id": "basic_example_skip_null",
+ "schedule": None,
+ "dataset": None,
+ "description": "DAG de teste",
+ "doc_md": None,
+ "tags": {"dou", "generated_dag"},
+ "owner": [],
"search": [
{
"terms": ["cimentodaaroeira"],
@@ -169,38 +187,48 @@
"conn_id": None,
"territory_id": None,
"dou_sections": ["TODOS"],
- "search_date": "DIA",
+ "date": "DIA",
"field": "TUDO",
"is_exact_search": True,
"ignore_signature_match": False,
- "force_rematch": None,
- "full_text": None,
- "use_summary": None,
+ "force_rematch": False,
+ "full_text": False,
+ "use_summary": False,
"department": None,
}
],
- "emails": ["destination@economia.gov.br"],
- "subject": "Teste do Ro-dou",
- "attach_csv": False,
- "discord_webhook": None,
- "slack_webhook": None,
- "schedule": None,
- "dataset": None,
- "description": "DAG de teste",
- "skip_null": False,
- "doc_md": None,
- "dag_tags": {"dou", "generated_dag"},
- "owner": "",
- "hide_filters": False,
- "header_text": None,
- "footer_text": None,
- "no_results_found_text": "Nenhum dos termos pesquisados foi encontrado nesta consulta",
+ "report": {
+ "emails": ["destination@economia.gov.br"],
+ "subject": "Teste do Ro-dou",
+ "attach_csv": False,
+ "discord_webhook": None,
+ "slack_webhook": None,
+ "skip_null": False,
+ "hide_filters": False,
+ "header_text": None,
+ "footer_text": None,
+ "no_results_found_text": "Nenhum dos termos pesquisados foi encontrado nesta consulta",
+ },
},
),
(
"markdown_docs_example.yaml",
{
- "dag_id": "markdown_docs_example",
+ "id": "markdown_docs_example",
+ "schedule": None,
+ "dataset": None,
+ "description": "DAG com documentação em markdown",
+ "doc_md": textwrap.dedent(
+ """
+ ## Ola!
+ Esta é uma DAG de exemplo com documentação em markdown. Esta descrição é opcional e pode ser definida no parâmetro `doc_md`.
+
+ * Ah, aqui você também pode usar *markdown* para
+ * escrever listas, por exemplo,
+ * ou colocar [links](graph)!"""
+ ).strip(),
+ "tags": {"dou", "generated_dag"},
+ "owner": [],
"search": [
{
"terms": [
@@ -214,46 +242,40 @@
"conn_id": None,
"territory_id": None,
"dou_sections": ["TODOS"],
- "search_date": "DIA",
+ "date": "DIA",
"field": "TUDO",
"is_exact_search": True,
"ignore_signature_match": False,
- "force_rematch": None,
- "full_text": None,
- "use_summary": None,
+ "force_rematch": False,
+ "full_text": False,
+ "use_summary": False,
"department": None,
}
],
- "emails": ["destination@economia.gov.br"],
- "subject": "Teste do Ro-dou",
- "attach_csv": False,
- "discord_webhook": None,
- "slack_webhook": None,
- "schedule": None,
- "dataset": None,
- "description": "DAG com documentação em markdown",
- "skip_null": True,
- "doc_md": textwrap.dedent(
- """
- ## Ola!
- Esta é uma DAG de exemplo com documentação em markdown. Esta descrição é opcional e pode ser definida no parâmetro `doc_md`.
-
- * Ah, aqui você também pode usar *markdown* para
- * escrever listas, por exemplo,
- * ou colocar [links](graph)!"""
- ).strip(),
- "dag_tags": {"dou", "generated_dag"},
- "owner": "",
- "hide_filters": False,
- "header_text": None,
- "footer_text": None,
- "no_results_found_text": "Nenhum dos termos pesquisados foi encontrado nesta consulta",
+ "report": {
+ "emails": ["destination@economia.gov.br"],
+ "subject": "Teste do Ro-dou",
+ "attach_csv": False,
+ "skip_null": True,
+ "discord_webhook": None,
+ "slack_webhook": None,
+ "hide_filters": False,
+ "header_text": None,
+ "footer_text": None,
+ "no_results_found_text": "Nenhum dos termos pesquisados foi encontrado nesta consulta",
+ },
},
),
(
"department_example.yaml",
{
- "dag_id": "department_example",
+ "id": "department_example",
+ "schedule": None,
+ "dataset": None,
+ "description": "DAG de teste (filtro por departamento)",
+ "doc_md": None,
+ "tags": {"dou", "generated_dag"},
+ "owner": [],
"search": [
{
"terms": ["dados abertos"],
@@ -263,41 +285,43 @@
"conn_id": None,
"territory_id": None,
"dou_sections": ["TODOS"],
- "search_date": "DIA",
+ "date": "DIA",
"field": "TUDO",
"is_exact_search": True,
"ignore_signature_match": False,
- "force_rematch": None,
- "full_text": None,
- "use_summary": None,
+ "force_rematch": False,
+ "full_text": False,
+ "use_summary": False,
"department": [
"Ministério da Gestão e da Inovação em Serviços Públicos",
"Ministério da Defesa",
],
}
],
- "emails": ["destination@economia.gov.br"],
- "subject": "Teste do Ro-dou",
- "attach_csv": False,
- "discord_webhook": None,
- "slack_webhook": None,
- "schedule": None,
- "dataset": None,
- "description": "DAG de teste (filtro por departamento)",
- "skip_null": True,
- "doc_md": None,
- "dag_tags": {"dou", "generated_dag"},
- "owner": "",
- "hide_filters": False,
- "header_text": None,
- "footer_text": None,
- "no_results_found_text": "Nenhum dos termos pesquisados foi encontrado nesta consulta",
+ "report": {
+ "emails": ["destination@economia.gov.br"],
+ "subject": "Teste do Ro-dou",
+ "attach_csv": False,
+ "skip_null": True,
+ "discord_webhook": None,
+ "slack_webhook": None,
+ "hide_filters": False,
+ "header_text": None,
+ "footer_text": None,
+ "no_results_found_text": "Nenhum dos termos pesquisados foi encontrado nesta consulta",
+ },
},
),
(
"inlabs_example.yaml",
{
- "dag_id": "inlabs_example",
+ "id": "inlabs_example",
+ "schedule": "0 8 * * MON-FRI",
+ "dataset": "inlabs",
+ "description": "DAG de teste",
+ "doc_md": None,
+ "tags": {"dou", "generated_dag", "inlabs"},
+ "owner": ["cdata"],
"search": [
{
"terms": ["tecnologia", "informação"],
@@ -307,38 +331,41 @@
"conn_id": None,
"territory_id": None,
"dou_sections": ["TODOS"],
- "search_date": "DIA",
+ "date": "DIA",
"field": "TUDO",
"is_exact_search": True,
"ignore_signature_match": False,
- "force_rematch": None,
- "full_text": None,
+ "force_rematch": False,
+ "full_text": False,
"use_summary": True,
"department": None,
}
],
- "emails": ["destination@economia.gov.br"],
- "subject": "Teste do Ro-dou",
- "attach_csv": True,
- "discord_webhook": None,
- "slack_webhook": None,
- "schedule": "0 8 * * MON-FRI",
- "dataset": "inlabs",
- "description": "DAG de teste",
- "skip_null": True,
- "doc_md": None,
- "dag_tags": {"dou", "generated_dag", "inlabs"},
- "owner": "cdata",
- "hide_filters": False,
- "header_text": None,
- "footer_text": None,
- "no_results_found_text": "Nenhum dos termos pesquisados foi encontrado nesta consulta",
+ "report": {
+ "emails": ["destination@economia.gov.br"],
+ "subject": "Teste do Ro-dou",
+ "attach_csv": True,
+ "skip_null": True,
+ "discord_webhook": None,
+ "slack_webhook": None,
+ "hide_filters": False,
+ "header_text": None,
+ "footer_text": None,
+ "no_results_found_text": "Nenhum dos termos pesquisados foi encontrado nesta consulta",
+ },
},
),
(
"inlabs_advanced_search_example.yaml",
{
- "dag_id": "inlabs_advanced_search_example",
+ "id": "inlabs_advanced_search_example",
+ "schedule": None,
+ "dataset": "inlabs",
+ "description": "DAG de teste",
+ "skip_null": True,
+ "doc_md": None,
+ "tags": {"dou", "generated_dag", "inlabs"},
+ "owner": ["cdata"],
"search": [
{
"terms": [
@@ -351,38 +378,39 @@
"conn_id": None,
"territory_id": None,
"dou_sections": ["TODOS"],
- "search_date": "DIA",
+ "date": "DIA",
"field": "TUDO",
"is_exact_search": True,
"ignore_signature_match": False,
- "force_rematch": None,
- "full_text": None,
- "use_summary": None,
+ "force_rematch": False,
+ "full_text": False,
+ "use_summary": False,
"department": None,
}
],
- "emails": ["destination@economia.gov.br"],
- "subject": "Teste do Ro-dou",
- "attach_csv": True,
- "discord_webhook": None,
- "slack_webhook": None,
- "schedule": None,
- "dataset": "inlabs",
- "description": "DAG de teste",
- "skip_null": True,
- "doc_md": None,
- "dag_tags": {"dou", "generated_dag", "inlabs"},
- "owner": "cdata",
- "hide_filters": False,
- "header_text": None,
- "footer_text": None,
- "no_results_found_text": "Nenhum dos termos pesquisados foi encontrado nesta consulta",
+ "report": {
+ "emails": ["destination@economia.gov.br"],
+ "subject": "Teste do Ro-dou",
+ "attach_csv": True,
+ "discord_webhook": None,
+ "slack_webhook": None,
+ "hide_filters": False,
+ "header_text": None,
+ "footer_text": None,
+ "no_results_found_text": "Nenhum dos termos pesquisados foi encontrado nesta consulta",
+ },
},
),
(
"multiple_searchs_example.yaml",
{
- "dag_id": "multiple_searchs_example",
+ "id": "multiple_searchs_example",
+ "schedule": "0 8 * * MON-FRI",
+ "dataset": None,
+ "description": "DAG de teste com múltiplas buscas",
+ "doc_md": None,
+ "tags": {"dou", "generated_dag", "inlabs"},
+ "owner": [],
"search": [
{
"terms": [
@@ -396,13 +424,13 @@
"conn_id": None,
"territory_id": None,
"dou_sections": ["TODOS"],
- "search_date": "DIA",
+ "date": "DIA",
"field": "TUDO",
"is_exact_search": True,
"ignore_signature_match": True,
"force_rematch": True,
- "full_text": None,
- "use_summary": None,
+ "full_text": False,
+ "use_summary": False,
"department": None,
},
{
@@ -417,38 +445,40 @@
"conn_id": None,
"territory_id": None,
"dou_sections": ["TODOS"],
- "search_date": "DIA",
+ "date": "DIA",
"field": "TUDO",
"is_exact_search": True,
"ignore_signature_match": True,
"force_rematch": True,
- "full_text": None,
- "use_summary": None,
+ "full_text": False,
+ "use_summary": False,
"department": None,
},
],
- "emails": ["destination@economia.gov.br"],
- "subject": "Teste do Ro-dou",
- "attach_csv": False,
- "discord_webhook": None,
- "slack_webhook": None,
- "schedule": "0 8 * * MON-FRI",
- "dataset": None,
- "description": "DAG de teste com múltiplas buscas",
- "skip_null": False,
- "doc_md": None,
- "dag_tags": {"dou", "generated_dag", "inlabs"},
- "owner": "",
- "hide_filters": False,
- "header_text": None,
- "footer_text": None,
- "no_results_found_text": "Nenhum dos termos pesquisados foi encontrado nesta consulta",
+ "report": {
+ "emails": ["destination@economia.gov.br"],
+ "subject": "Teste do Ro-dou",
+ "attach_csv": False,
+ "skip_null": False,
+ "discord_webhook": None,
+ "slack_webhook": None,
+ "hide_filters": False,
+ "header_text": None,
+ "footer_text": None,
+ "no_results_found_text": "Nenhum dos termos pesquisados foi encontrado nesta consulta",
+ },
},
),
(
"hide_filters_example.yaml",
{
- "dag_id": "hide_filters_example",
+ "id": "hide_filters_example",
+ "schedule": "0 8 * * MON-FRI",
+ "dataset": None,
+ "description": "DAG de teste",
+ "doc_md": None,
+ "tags": {"dou", "inlabs", "generated_dag"},
+ "owner": [],
"search": [
{
"terms": ["tecnologia", "informação"],
@@ -458,41 +488,43 @@
"conn_id": None,
"territory_id": None,
"dou_sections": ["TODOS"],
- "search_date": "DIA",
+ "date": "DIA",
"field": "TUDO",
"is_exact_search": True,
"ignore_signature_match": False,
- "force_rematch": None,
- "full_text": None,
- "use_summary": None,
+ "force_rematch": False,
+ "full_text": False,
+ "use_summary": False,
"department": [
"Ministério da Gestão e da Inovação em Serviços Públicos",
"Ministério da Defesa",
],
}
],
- "emails": ["destination@economia.gov.br"],
- "subject": "Teste do Ro-dou",
- "attach_csv": True,
- "discord_webhook": None,
- "slack_webhook": None,
- "schedule": "0 8 * * MON-FRI",
- "dataset": None,
- "description": "DAG de teste",
- "skip_null": True,
- "doc_md": None,
- "dag_tags": {"dou", "inlabs", "generated_dag"},
- "owner": "",
- "hide_filters": True,
- "header_text": None,
- "footer_text": None,
- "no_results_found_text": "Nenhum dos termos pesquisados foi encontrado nesta consulta",
+ "report": {
+ "emails": ["destination@economia.gov.br"],
+ "subject": "Teste do Ro-dou",
+ "attach_csv": True,
+ "skip_null": True,
+ "discord_webhook": None,
+ "slack_webhook": None,
+ "hide_filters": True,
+ "header_text": None,
+ "footer_text": None,
+ "no_results_found_text": "Nenhum dos termos pesquisados foi encontrado nesta consulta",
+ },
},
),
(
"header_and_footer_example.yaml",
{
- "dag_id": "header_and_footer_example",
+ "id": "header_and_footer_example",
+ "schedule": "0 8 * * MON-FRI",
+ "dataset": None,
+ "description": "DAG de teste",
+ "doc_md": None,
+ "tags": {"dou", "generated_dag"},
+ "owner": [],
"search": [
{
"terms": ["tecnologia", "informação"],
@@ -502,41 +534,36 @@
"conn_id": None,
"territory_id": None,
"dou_sections": ["TODOS"],
- "search_date": "DIA",
+ "date": "DIA",
"field": "TUDO",
"is_exact_search": True,
"ignore_signature_match": False,
- "force_rematch": None,
- "full_text": None,
- "use_summary": None,
+ "force_rematch": False,
+ "full_text": False,
+ "use_summary": False,
"department": None,
}
],
- "emails": ["destination@economia.gov.br"],
- "subject": "Teste do Ro-dou",
- "attach_csv": False,
- "discord_webhook": None,
- "slack_webhook": None,
- "schedule": "0 8 * * MON-FRI",
- "dataset": None,
- "description": "DAG de teste",
- "skip_null": True,
- "doc_md": None,
- "dag_tags": {"dou", "generated_dag"},
- "owner": "",
- "hide_filters": False,
-            "header_text": "Greetings\n",
-            "footer_text": "Best Regards\n",
- "no_results_found_text": "No results found",
+ "report": {
+ "emails": ["destination@economia.gov.br"],
+ "subject": "Teste do Ro-dou",
+ "attach_csv": False,
+ "skip_null": True,
+ "discord_webhook": None,
+ "slack_webhook": None,
+ "hide_filters": False,
+                "header_text": "Greetings\n",
+                "footer_text": "Best Regards\n",
+ "no_results_found_text": "No results found",
+ },
},
),
],
)
-
def test_parse(filepath, result_tuple):
filepath = os.path.join(
DouDigestDagGenerator().YAMLS_DIR, "examples_and_tests", filepath
)
parsed = YAMLParser(filepath=filepath).parse()
- assert parsed == DAGConfig(**result_tuple)
+ assert parsed.model_dump() == DAGConfig(**result_tuple).model_dump()
diff --git a/tests/test_validate_yaml_schemas.py b/tests/test_validate_yaml_schemas.py
index 4cd91fc..fedfadd 100644
--- a/tests/test_validate_yaml_schemas.py
+++ b/tests/test_validate_yaml_schemas.py
@@ -1,36 +1,20 @@
-import json
-import jsonschema
-import pytest
+"""Test validation of yaml files according to the defined schemas.
+"""
+
import glob
+import os
+import sys
+
+from pydantic import ValidationError
+import pytest
import yaml
-import requests
-from urllib.parse import urlparse
+# add module path so we can import from other modules
+sys.path.insert(0, os.path.abspath(os.path.dirname(__file__)))
+from schemas import RoDouConfig
YAMLS_DIR = "../dags/ro_dou/dag_confs"
-SCHEMA_FILEPATH = "../schemas/ro-dou.json"
-# or
-# SCHEMA_FILEPATH = "https://raw.githubusercontent.com/gestaogovbr/Ro-dou/main/schemas/ro-dou.json"
-
-
-def get_schema(filepath):
- def _is_valid_url(url):
- try:
- result = urlparse(url)
- return all([result.scheme, result.netloc])
- except ValueError:
- return False
-
- if _is_valid_url(filepath):
- response = requests.get(filepath)
- response.raise_for_status()
- return json.loads(response.text)
- else:
- with open(filepath) as f:
- return json.load(f)
-
-SCHEMA = get_schema(SCHEMA_FILEPATH)
@pytest.mark.parametrize(
"data_file",
@@ -40,8 +24,11 @@ def _is_valid_url(url):
+ glob.glob(f"{YAMLS_DIR}/**/*.yaml", recursive=True)
],
)
-def test_json_schema_validation(data_file):
+def test_pydantic_validation(data_file):
with open(data_file) as data_fp:
data = yaml.safe_load(data_fp)
- jsonschema.validate(instance=data, schema=SCHEMA)
+ try:
+ RoDouConfig(**data)
+ except ValidationError as e:
+ pytest.fail(f"YAML file {data_file} is not valid:\n{e}")