Skip to content

Commit

Permalink
Merge pull request #30 from rafaelpezzuto/improve-dl-geo-robot
Browse files Browse the repository at this point in the history
Melhora coleta de insumos counter_robots e geolocation mmdb
  • Loading branch information
rafaelpezzuto authored Jun 3, 2022
2 parents 0632376 + 1d65488 commit 8292463
Show file tree
Hide file tree
Showing 4 changed files with 70 additions and 64 deletions.
4 changes: 2 additions & 2 deletions app/lib/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@ def open_bz2(file_path, mode):
return bz2.open(file_path, mode)


def extract_gzip(file_path):
def extract_gzip(file_path, path_output):
with gzip.open(file_path, 'rb') as fin:
with open(file_path.replace('.gz', ''), 'wb') as fout:
with open(path_output, 'wb') as fout:
shutil.copyfileobj(fin, fout)


Expand Down
63 changes: 31 additions & 32 deletions app/proc/download_geomap.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,45 +4,44 @@
import requests
import os

from app.lib.file import (
check_dir,
extract_gzip,
)
from app.lib.file import extract_gzip


DEFAULT_URL = 'https://download.db-ip.com/free/dbip-city-lite-{0}-{1}.mmdb.gz'
MMDB_DEFAULT_URL_FORMAT = 'https://download.db-ip.com/free/dbip-city-lite-{0}-{1}.mmdb.gz'

LOGGING_LEVEL = os.environ.get(
'GEOIP_LOGGING_LEVEL',
'INFO'
)

OUTPUT_FILENAME = os.environ.get(
'GEOIP_OUTPUT_FILENAME',
'data/map.mmdb.gz'
)

class FileMMDBWasNotDownloadError(Exception):
...


def _download(url, output, chunk_size=128):
def download_mmdb(url, path_output, chunk_size=128):
r = requests.get(url, stream=True)
with open(output, 'wb') as fd:

try:
r.raise_for_status()
except requests.exceptions.HTTPError:
raise FileMMDBWasNotDownloadError('Arquivo de geolocalizações não foi coletado')

with open(path_output,'wb') as fd:
for chunk in r.iter_content(chunk_size=chunk_size):
fd.write(chunk)
return output

return True

def download_mmdb_from_date(year, month, output):

def _generate_mmdb_url_from_date(default_mmdb_route, year, month):
if year == '' or month == '':
today = datetime.date.today()

year = today.year
month = today.month

return _download(DEFAULT_URL.format(year, month), output)


def download_mmdb_from_url(url, output):
return _download(url, output)
return default_mmdb_route.format(year, month)


def main():
Expand All @@ -66,10 +65,9 @@ def main():
)

parser.add_argument(
'-o',
'--output',
default=OUTPUT_FILENAME,
help='Arquivo do mapa de geolocalizações'
'--path_output',
required=True,
help='Caminho do arquivo de mapa de geolocalizações'
)

params = parser.parse_args()
Expand All @@ -80,17 +78,18 @@ def main():
datefmt='%d/%b/%Y %H:%M:%S'
)

check_dir(params.output)
output = ''

if params.url:
logging.info('Coletando dados...')
output = download_mmdb_from_url(params.url, params.output)
mmdb_url = params.url

elif params.year and params.month:
logging.info('Coletando dados a partir de data e ano: (%s, %s)' % (params.year, params.month))
output = download_mmdb_from_date(params.year, params.month, params.output)
mmdb_url = _generate_mmdb_url_from_date(MMDB_DEFAULT_URL_FORMAT, params.year, params.month)

try:
logging.info('Coletando arquivo MMDB de %s' % mmdb_url)
download_mmdb(mmdb_url, params.path_output)
except FileMMDBWasNotDownloadError:
logging.warning('Arquivo MMDB não está disponível em %s' % mmdb_url)
exit(1)

if output:
logging.info('Extraindo dados...')
extract_gzip(output)
logging.info('Extraindo dados de %s' % params.path_output)
extract_gzip(params.path_output, params.path_output.replace('mmdb.gz', 'mmdb'))
65 changes: 36 additions & 29 deletions app/proc/download_robots.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import requests

from time import sleep
from app.lib.file import check_dir


LOGGING_LEVEL = os.environ.get(
Expand All @@ -18,16 +17,20 @@
5
))

OUTPUT_FILENAME = os.environ.get(
'COUNTER_ROBOTS_OUTPUT_FILENAME',
'data/counter-robots.txt'
)

COUNTER_ROBOTS_URL = os.environ.get(
'COUNTER_ROBOTS_URL',
'https://raw.githubusercontent.com/atmire/COUNTER-Robots/master/COUNTER_Robots_list.json'
)

SLEEP_TIME = int(os.environ.get(
'COUNTER_ROBOTS_URL_SLEEP_TIME',
30
))


class FileRobotsWasNotDownloadError(Exception):
...


def _extract_patterns(robots_json):
for i in robots_json:
Expand Down Expand Up @@ -66,20 +69,23 @@ def get_robots(url):
},
]
"""
logging.info('Coletando dados...')
try:
for t in range(MAX_RETRIES):
logging.debug(f'Tentativa {t + 1}')
response = requests.get(url)

if response.status_code != 200:
logging.warning('Não foi possível obter a lista de robôs')
else:
return response.json()

sleep(30)
except Exception as e:
logging.error(e)
for t in range(1, MAX_RETRIES + 1):
response = requests.get(url)

try:
response.raise_for_status()
except requests.exceptions.HTTPError:
logging.warning(
'Não foi possível coletar dados de %s. Aguardando %d segundos para tentativa %d de %d' % (
url,
SLEEP_TIME,
t,
MAX_RETRIES
)
)
sleep(SLEEP_TIME)
else:
return response.json()


def save(data, output):
Expand All @@ -98,7 +104,6 @@ def save(data, output):
with open(output, 'w') as fout:
robots_patterns = _extract_patterns(data)
fout.writelines(robots_patterns)
logging.info('Lista de robôs obtida com sucesso: %s' % output)
except Exception as e:
logging.error(e)

Expand All @@ -110,13 +115,12 @@ def main():
'-u',
'--url',
default=COUNTER_ROBOTS_URL,
help='URL da lista de robots',
help='URL da lista de robôs',
)

parser.add_argument(
'-o',
'--output',
default=OUTPUT_FILENAME,
'--path_output',
required=True,
help='Arquivo de saída',
)

Expand All @@ -128,8 +132,11 @@ def main():
datefmt='%d/%b/%Y %H:%M:%S'
)

check_dir(params.output)

data = get_robots(params.url)
try:
data = get_robots(params.url)
except FileRobotsWasNotDownloadError:
logging.error('Não foi possível obter a lista de robôs de %s' % params.url)
exit(1)

save(data, params.output)
logging.info('Gravando lista de robôs em %s' % params.path_output)
save(data, params.path_output)
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

setup(
name='scielo-usage-counter',
version='0.4.6.3',
version='0.4.7',
description='The SciELO Usage Counter Tool',
author='SciELO',
author_email='[email protected]',
Expand Down

0 comments on commit 8292463

Please sign in to comment.