Skip to content

Commit

Permalink
Merge pull request #32 from pitangainnovare/v0.6.0
Browse files Browse the repository at this point in the history
Refatora código e adiciona testes
  • Loading branch information
pitangainnovare authored Jan 10, 2025
2 parents deea648 + b30c974 commit 8634c0d
Show file tree
Hide file tree
Showing 31 changed files with 466 additions and 115 deletions.
10 changes: 0 additions & 10 deletions app/controller.py

This file was deleted.

10 changes: 0 additions & 10 deletions app/lib/robot.py

This file was deleted.

8 changes: 0 additions & 8 deletions app/utils.py

This file was deleted.

2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@ geoip2==4.4.0
mysqlclient==2.0.3
requests==2.26.0
reverse_geocoder==1.4
-e git+https://github.com/scieloorg/scielo_log_validator.git@0.3.0#egg=scielo_log_validator
-e git+https://github.com/scieloorg/scielo_log_validator.git@0.4.0#egg=scielo_log_validator
sqlalchemy==1.4.26
wget==3.2
File renamed without changes.
File renamed without changes.
11 changes: 6 additions & 5 deletions app/lib/db.py → scielo_usage_counter/database/db.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@

import app.declararive as models
import app.values as values
import datetime

from sqlalchemy import and_, create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.orm.exc import MultipleResultsFound, NoResultFound

import datetime

from scielo_usage_counter import values

import scielo_usage_counter.database.declararive as models


def create_tables(str_connection):
engine = create_engine(str_connection)
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
123 changes: 91 additions & 32 deletions app/lib/logparser.py → scielo_usage_counter/log.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,15 @@
import datetime
import ipaddress
import re
import logging
import time
import urllib.parse

from app.values import (
EXTENSIONS_DOWNLOAD,
PATTERN_NCSA_EXTENDED_LOG_FORMAT,
PATTERN_NCSA_EXTENDED_LOG_FORMAT_DOMAIN,
EXTENSIONS_STATIC,
)
from app.lib.file import open_logfile
from app.lib.geo import GeoIp
from app.lib.robot import robot_reader
from app.lib.exceptions import DeviceDetectionError
from device_detector import DeviceDetector

from . import exceptions, geo, values
from .utils import file_utils, resource_utils


class Stats:
def __init__(self):
Expand All @@ -33,6 +27,8 @@ def __init__(self):
self.__total_imported_lines = 0
self.__lines_parsed = 0
self.__total_time = 0.0
self.__output = None


@property
def ignored_lines_static_resources(self):
Expand Down Expand Up @@ -154,7 +150,8 @@ def output(self):
def output(self, path):
    """Open *path* for writing and keep the handle as the stats output.

    On failure the error is logged and the stats are dumped to the log
    instead, so a bad path never aborts a parsing run.
    """
    try:
        self.__output = open(path, 'w')
    except Exception as e:
        # Deliberately broad: any failure falls back to logging the stats.
        logging.error(f"Failed to open file: {e}")
        logging.info(self.dump_to_str())

def increment(self, measure):
Expand Down Expand Up @@ -205,10 +202,20 @@ def dump_to_str(self, sep='\t'):
logging(sep.join(i))

def save(self, sep='\t'):
    """Write the collected stats key/value pairs to the output file.

    Args:
        sep: column separator used between key and value (default: tab).

    Requires ``self.output`` to have been set beforehand; otherwise an
    explanatory error is logged and nothing is written. The output handle
    is always closed, even if writing fails.
    """
    if self.output is None:
        logging.error('You should define an output path before trying to save.\n\tTip: lp.output = <YOUR PATH GOES HERE>\n\t lp.stats.output = <YOUR SUMMARY PATH GOES HERE>')
        return

    stats_kv = self.get_stats()

    try:
        for i in stats_kv:
            self.output.write(sep.join([str(x) for x in i]) + '\n')
    except Exception as e:
        logging.error(f"Failed to write stats to file: {e}")
    finally:
        if self.output:
            self.output.close()


class Hit:
Expand Down Expand Up @@ -297,11 +304,18 @@ def action(self, value):


class LogParser:
def __init__(self, mmdb_path=None, robots_path=None, mmdb_data=None, robots_list=None):
    """Build a log parser with geolocation and robot-detection resources.

    Args:
        mmdb_path: filesystem path to a MaxMind GeoIP database.
        robots_path: filesystem path to a robots/crawler pattern list.
        mmdb_data: pre-loaded MMDB payload, used instead of ``mmdb_path``.
        robots_list: pre-loaded robot pattern list, used instead of
            ``robots_path``.

    Either the ``*_data``/``*_list`` form or the ``*_path`` form may be
    given; resolution is delegated to ``resource_utils``.
    """
    self.__geoip = geo.GeoIp()
    self.__geoip.map = resource_utils.load_mmdb(
        mmdb_data=mmdb_data,
        mmdb_path=mmdb_path,
    )
    self.__robots = resource_utils.load_robots(
        robots_list=robots_list,
        robots_path=robots_path,
    )
    self.__stats = Stats()
    self.__output = None

@property
def output(self):
Expand All @@ -317,7 +331,7 @@ def logfile(self):

@logfile.setter
def logfile(self, file_path):
    # Open the log file through the shared file utilities helper.
    self.__logfile = file_utils.open_logfile(file_path)

@property
def geoip(self):
Expand All @@ -328,8 +342,11 @@ def robots(self):
return self.__robots

@robots.setter
def robots(self, robots_path):
    """Reload robot patterns from *robots_path*.

    NOTE(review): a property setter receives exactly one value, so the
    previous two-parameter signature (robots_list, robots_path) raised
    TypeError on every assignment. Assignment keeps the historical
    path-based contract; pass ``robots_list`` via the constructor instead.
    """
    self.__robots = resource_utils.load_robots(
        robots_list=None,
        robots_path=robots_path,
    )

@property
def stats(self):
Expand Down Expand Up @@ -361,9 +378,8 @@ def has_valid_user_agent(self, user_agent):
return False

def user_agent_is_bot(self, user_agent):
    """Return True if *user_agent* matches any known robot/crawler pattern.

    Patterns come from ``self.robots`` (compiled regular expressions);
    matching is performed on the raw string as given.
    """
    return any(regex.search(user_agent) for regex in self.robots)

Expand All @@ -380,7 +396,7 @@ def action_is_static_file(self, path):

ext = file_from_url.rsplit('.')[-1].lower()

if ext in EXTENSIONS_STATIC or file_from_url in EXTENSIONS_STATIC:
if ext in values.EXTENSIONS_STATIC or file_from_url in values.EXTENSIONS_STATIC:
return True

return False
Expand All @@ -389,7 +405,7 @@ def action_is_download(self, path):
file_from_url = path.split('/')[-1]
ext = file_from_url.rsplit('.')[-1].lower()

if ext in EXTENSIONS_DOWNLOAD:
if ext in values.EXTENSIONS_DOWNLOAD:
return True
return False

Expand Down Expand Up @@ -420,11 +436,57 @@ def format_user_agent(self, user_agent):
return fmt_ua

def format_client_name(self, device):
    # Prefer the short client name, then the full client name; fall back
    # to the detector's UNKNOWN marker when neither is available.
    return device.client_short_name() or device.client_name() or device.UNKNOWN

def format_client_version(self, device):
    """Return the detected client version, or the detector's UNKNOWN marker."""
    version = device.client_version()
    return version if version else device.UNKNOWN

def match_with_best_pattern(self, line):
    """Match *line* against the known NCSA log formats.

    Tries each pattern in order and returns a ``(match, ip)`` tuple where
    ``ip`` is the first address classified as other than 'unknown' —
    either the single ``ip`` group or the first usable entry of a
    comma-separated ``ip_list`` group. If no pattern yields a usable
    address, returns the last match (possibly None) and the last seen
    ip value ('' when nothing matched).
    """
    patterns = (
        values.PATTERN_NCSA_EXTENDED_LOG_FORMAT,
        values.PATTERN_NCSA_EXTENDED_LOG_FORMAT_DOMAIN,
        values.PATTERN_NCSA_EXTENDED_LOG_FORMAT_WITH_IP_LIST,
        values.PATTERN_NCSA_EXTENDED_LOG_FORMAT_DOMAIN_WITH_IP_LIST,
    )

    match = None
    ip_value = ''

    for pattern in patterns:
        match = re.match(pattern, line)
        if not match:
            continue

        content = match.groupdict()

        # `or ''` (not a .get default): a named group that exists but did
        # not participate is present with value None, which .get's default
        # does not cover — None would crash get_ip_type/.split below.
        ip_value = content.get('ip') or ''
        if self.get_ip_type(ip_value) != 'unknown':
            return match, ip_value

        # Fall back to the first usable address in an ip-list field
        # (comma-separated, e.g. proxy chains).
        for candidate in (content.get('ip_list') or '').split(','):
            candidate = candidate.strip()
            if self.get_ip_type(candidate) != 'unknown':
                return match, candidate

    return match, ip_value


def get_ip_type(self, ip):
    """Classify an IP address string.

    Returns 'remote' for globally routable addresses, 'local' for
    private/loopback/link-local addresses, and 'unknown' for anything
    that is not a parseable IP address or fits neither category.
    """
    try:
        parsed = ipaddress.ip_address(ip)
    except ValueError:
        return 'unknown'

    if parsed.is_global:
        return 'remote'
    if parsed.is_private or parsed.is_loopback or parsed.is_link_local:
        return 'local'
    return 'unknown'

def parse_line(self, line):
self.stats.increment('lines_parsed')

Expand All @@ -434,11 +496,8 @@ def parse_line(self, line):
except UnicodeDecodeError:
decoded_line = line.decode('utf-8', errors='ignore').strip() if isinstance(line, bytes) else line.strip()

match = re.match(PATTERN_NCSA_EXTENDED_LOG_FORMAT, decoded_line)
match, ip_value = self.match_with_best_pattern(decoded_line)

if not match:
match = re.match(PATTERN_NCSA_EXTENDED_LOG_FORMAT_DOMAIN, decoded_line)

if match:
hit = Hit()

Expand Down Expand Up @@ -468,7 +527,7 @@ def parse_line(self, line):
except ZeroDivisionError:
device = DeviceDetector('').parse()
self.stats.increment('ignored_lines_invalid_user_agent')
logging.error(DeviceDetectionError(f'Não foi possível identificar UserAgent {hit.user_agent} from line {decoded_line}'))
logging.error(exceptions.DeviceDetectionError(f'Não foi possível identificar UserAgent {hit.user_agent} from line {decoded_line}'))
hit.is_valid = False

hit.client_name = self.format_client_name(device)
Expand All @@ -486,7 +545,7 @@ def parse_line(self, line):
self.stats.increment('ignored_lines_static_resources')
hit.is_valid = False

hit.ip = data.get('ip')
hit.ip = ip_value
geocity = self.geoip.ip_to_geolocation(hit.ip)
if not geocity:
self.stats.increment('ignored_lines_invalid_geolocation')
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import requests
import os

from app.lib.file import extract_gzip
from scielo_usage_counter.utils import file_utils


MMDB_DEFAULT_URL_FORMAT = 'https://download.db-ip.com/free/dbip-city-lite-{0}-{1}.mmdb.gz'
Expand Down Expand Up @@ -92,4 +92,4 @@ def main():
exit(1)

logging.info('Extraindo dados de %s' % params.path_output)
extract_gzip(params.path_output, params.path_output.replace('mmdb.gz', 'mmdb'))
file_utils.extract_gzip(params.path_output, params.path_output.replace('mmdb.gz', 'mmdb'))
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@
import shlex
import subprocess

from app import values
from app.lib import db, file, exceptions
from scielo_usage_counter import exceptions, values
from scielo_usage_counter.database import db
from scielo_usage_counter.utils import file_utils


LOGGING_LEVEL = os.environ.get(
Expand Down Expand Up @@ -114,7 +115,7 @@ def generate_pretables(
ymd = row.get('serverTime').split(' ')[0]

# gera nome de arquivo relacionado a ymd
ymd_output_path = file.generate_filepath_with_filename(
ymd_output_path = file_utils.generate_filepath_with_filename(
directory=output_directory,
filename=ymd,
posfix=UNSORTED_POSFIX,
Expand All @@ -124,7 +125,7 @@ def generate_pretables(
# verifica se arquivo já existe
if not os.path.exists(ymd_output_path):
logging.info('Criado arquivo %s' % ymd_output_path)
file.create_file_with_header(ymd_output_path, header)
file_utils.create_file_with_header(ymd_output_path, header)

# abre arquivo em modo append, caso ainda não esteja aberto. adiciona em dicionário uma referência ao arquivo
if ymd not in output_files:
Expand Down Expand Up @@ -156,7 +157,7 @@ def generate_pretables_db(
non_pretable_dates = db.get_non_pretable_dates(str_connection, collection)
processed_files = []
for npt in non_pretable_dates:
processed_files.extend(file.get_processed_files(npt, processed_logs_directory))
processed_files.extend(file_utils.get_processed_files(npt, processed_logs_directory))

output_files = {}
for pf in set(sorted(processed_files)):
Expand All @@ -177,15 +178,15 @@ def sort_pretables(
):
unsorted_pretables = db.get_unsorted_pretables(str_connection, collection)
for upt_date in unsorted_pretables:
unsorted_pt_path = file.translate_date_to_output_path(
unsorted_pt_path = file_utils.translate_date_to_output_path(
date=upt_date,
output_directory=unsorted_pretables_directory,
posfix=UNSORTED_POSFIX,
)
if not file.is_valid_path(unsorted_pt_path):
if not file_utils.is_valid_path(unsorted_pt_path):
raise exceptions.InvalidFilePath('%s não é um caminho válido' % unsorted_pt_path)

sorted_pt_path = file.translate_date_to_output_path(
sorted_pt_path = file_utils.translate_date_to_output_path(
date=upt_date,
output_directory=output_directory,
)
Expand Down Expand Up @@ -261,7 +262,7 @@ def main():
datefmt='%d/%b/%Y %H:%M:%S',
)

file.check_dir(args.output_directory, force_tail=True)
file_utils.check_dir(args.output_directory, force_tail=True)

if getattr(args, 'parsed_file', None):
logging.info('Inicializado em modo de arquivo')
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import logging
import os

from app.controller import create_tables
from scielo_usage_counter.database.controller import create_tables


LOGGING_LEVEL = os.environ.get(
Expand Down
Loading

0 comments on commit 8634c0d

Please sign in to comment.