Commit

update inlabs_hook
vitorbellini committed Apr 2, 2024
1 parent 373d19c commit d73b733
Showing 1 changed file with 40 additions and 16 deletions.
src/hooks/inlabs_hook.py: 56 changes (40 additions & 16 deletions)
@@ -4,13 +4,14 @@
import os
import logging
import re
from datetime import datetime
from datetime import datetime, timedelta
import time
import json
from collections.abc import Iterator
import unicodedata
import math
import requests
from requests import Response
import html2text

from airflow.hooks.base import BaseHook
@@ -48,7 +49,6 @@ def search_text(self, search_terms: dict, ignore_signature_match: bool) -> dict:
dict: A dictionary of processed search results.
"""

headers = {"Content-Type": "application/json"}
response = []
text_terms = search_terms["texto"]

@@ -64,16 +64,20 @@ def search_text(self, search_terms: dict, ignore_signature_match: bool) -> dict:
retries = 0
while retries < self.MAX_RETRIES:
try:
r = requests.post(
self.DOU_API_URL,
headers=headers,
data=json.dumps(search_terms),
timeout=45,
r = self._request_api_data(search_terms)
r.raise_for_status()
response.extend(r.json())

r = self._request_api_data(
self._adapt_search_terms_to_extra(search_terms)
)
r.raise_for_status()
response.extend(r.json())
break
except (requests.exceptions.ReadTimeout, requests.exceptions.HTTPError) as e:
except (
requests.exceptions.ReadTimeout,
requests.exceptions.HTTPError,
) as e:
retries += 1
logging.info(
"Timeout occurred, retrying %s of %s...",
@@ -95,19 +99,38 @@ def search_text(self, search_terms: dict, ignore_signature_match: bool) -> dict:
@staticmethod
def _iterate_in_chunks(lst: list, chunk_size: int) -> Iterator[list]:
"""Splits a list into chunks of a specified size.
Args:
lst (list): The list to be chunked.
chunk_size (int): The size of each chunk.
Yields:
Iterator[list]: A generator yielding chunks of
the original list.
"""

for i in range(0, len(lst), chunk_size):
yield lst[i : i + chunk_size]
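As a quick reference, the generator above splits a list into fixed-size chunks, with the last chunk holding the remainder; a standalone copy with made-up sample terms:

from collections.abc import Iterator

def _iterate_in_chunks(lst: list, chunk_size: int) -> Iterator[list]:
    # Yield consecutive slices of at most chunk_size elements.
    for i in range(0, len(lst), chunk_size):
        yield lst[i : i + chunk_size]

print(list(_iterate_in_chunks(["a", "b", "c", "d", "e"], 2)))
# [['a', 'b'], ['c', 'd'], ['e']]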

def _request_api_data(self, payload: dict) -> Response:
headers = {"Content-Type": "application/json"}
post_result = requests.post(
self.DOU_API_URL,
headers=headers,
data=json.dumps(payload),
timeout=45,
)

print(post_result.json())

return post_result

@staticmethod
def _adapt_search_terms_to_extra(payload: dict) -> dict:
payload["pub_date"] = [
(datetime.strptime(date, "%Y-%m-%d") - timedelta(days=1)).strftime(
"%Y-%m-%d"
)
for date in payload["pub_date"]
]
payload["pub_name"] = [
s if s.endswith("E") else s + "E" for s in payload["pub_name"]
]

return payload
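The adaptation above shifts every requested publication date back one day and appends an "E" suffix to each pub_name, so the adapted payload appears to target the previous day's extra ("E") edition. A standalone copy with a made-up payload shows the effect:

from datetime import datetime, timedelta

def adapt_to_extra(payload: dict) -> dict:
    # Standalone copy of _adapt_search_terms_to_extra as it appears in the diff.
    payload["pub_date"] = [
        (datetime.strptime(d, "%Y-%m-%d") - timedelta(days=1)).strftime("%Y-%m-%d")
        for d in payload["pub_date"]
    ]
    payload["pub_name"] = [s if s.endswith("E") else s + "E" for s in payload["pub_name"]]
    return payload

print(adapt_to_extra({"pub_date": ["2024-04-02"], "pub_name": ["DO1"]}))
# {'pub_date': ['2024-04-01'], 'pub_name': ['DO1E']}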

class TextDictHandler:
"""Handles the transformation and organization of text search
results from the DOU API.
@@ -149,6 +172,7 @@ def transform_search_results(
"hierarchyList": None,
}

#XXX check here
matches = self._find_matches(item["abstract"], text_terms)

if matches and item["title"]:
