Skip to content

Commit

Permalink
Merge pull request #31 from pitangainnovare/impl/regex_domain
Browse files Browse the repository at this point in the history
Implementa regex para detectar logs ligeiramente diferentes
  • Loading branch information
pitangainnovare authored Feb 21, 2024
2 parents 8292463 + 115fc70 commit deea648
Show file tree
Hide file tree
Showing 13 changed files with 211 additions and 3 deletions.
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ virtualenv -p python3 .venv
# Access the virtual environment
source .venv/bin/activate

# Please ensure that the MySQL development library is installed on your system. For Ubuntu-based distributions, you can install it using the following command:
sudo apt install libmysql++-dev

# Install dependencies
pip install -r requirements.txt

Expand Down
5 changes: 5 additions & 0 deletions app/lib/logparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from app.values import (
EXTENSIONS_DOWNLOAD,
PATTERN_NCSA_EXTENDED_LOG_FORMAT,
PATTERN_NCSA_EXTENDED_LOG_FORMAT_DOMAIN,
EXTENSIONS_STATIC,
)
from app.lib.file import open_logfile
Expand Down Expand Up @@ -434,6 +435,10 @@ def parse_line(self, line):
decoded_line = line.decode('utf-8', errors='ignore').strip() if isinstance(line, bytes) else line.strip()

match = re.match(PATTERN_NCSA_EXTENDED_LOG_FORMAT, decoded_line)

if not match:
match = re.match(PATTERN_NCSA_EXTENDED_LOG_FORMAT_DOMAIN, decoded_line)

if match:
hit = Hit()

Expand Down
5 changes: 5 additions & 0 deletions app/values.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@
PATTERN_COMMON_LOG_FORMAT + r'\s+"(?P<referrer>.*?)"\s+"(?P<user_agent>.*?)"'
)

# Variant of the NCSA extended pattern for rows prefixed with a domain field:
# a lazily matched `domain` group and one whitespace character, followed by
# the common-log-format pattern and the quoted referrer and user-agent fields.
PATTERN_NCSA_EXTENDED_LOG_FORMAT_DOMAIN = ''.join((
    r'(?P<domain>.*?)\s',
    PATTERN_COMMON_LOG_FORMAT,
    r'\s+"(?P<referrer>.*?)"\s+"(?P<user_agent>.*?)"',
))

# https://github.com/matomo-org/matomo-log-analytics/blob/4.x-dev/import_logs.py
EXTENSIONS_STATIC = set([
'gif',
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@ geoip2==4.4.0
mysqlclient==2.0.3
requests==2.26.0
reverse_geocoder==1.4
-e git+https://github.com/scieloorg/scielo_log_validator.git@0.2.5#egg=scielo_log_validator
-e git+https://github.com/scieloorg/scielo_log_validator.git@0.3.0#egg=scielo_log_validator
sqlalchemy==1.4.26
wget==3.2
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

setup(
name='scielo-usage-counter',
version='0.4.7',
version='0.5.0',
description='The SciELO Usage Counter Tool',
author='SciELO',
author_email='[email protected]',
Expand Down
48 changes: 48 additions & 0 deletions tests/fixtures/usage.cub.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
170.239.0.1 - - [14/Feb/2024:06:34:04 -0500] "GET /img/es/toc.gif HTTP/1.1" 200 227 "http://scielo.sld.cu/scielo.php?script=sci_arttext&pid=S2218-36202021000500463" "Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Mobile Safari/537.36" "-"
170.239.0.1 - - [14/Feb/2024:06:34:05 -0500] "GET /img/es/author.gif HTTP/1.1" 200 202 "http://scielo.sld.cu/scielo.php?script=sci_arttext&pid=S2218-36202021000500463" "Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Mobile Safari/537.36" "-"
181.42.0.1 - - [14/Feb/2024:06:34:05 -0500] "GET /css/screen.css HTTP/1.1" 200 89 "-" "Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Mobile Safari/537.36" "-"170.239.0.1 - - [14/Feb/2024:06:34:05 -0500] "GET /favicon.ico HTTP/1.1" 200 7886 "http://scielo.sld.cu/scielo.php?script=sci_arttext&pid=S2218-36202021000500463" "Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Mobile Safari/537.36" "-"
181.42.0.1 - - [14/Feb/2024:06:34:06 -0500] "GET /applications/scielo-org/js/jquery-1.4.2.min.js HTTP/1.1" 200 72174 "-" "Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Mobile Safari/537.36" "-"
181.42.0.1 - - [14/Feb/2024:06:34:06 -0500] "GET /applications/scielo-org/js/toolbox.js HTTP/1.1" 200 3653 "-" "Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Mobile Safari/537.36" "-"
181.42.0.1 - - [14/Feb/2024:06:34:06 -0500] "GET /scielo.php?script=sci_arttext&pid=S0864-21252003000200011 HTTP/1.1" 200 20584 "-" "Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Mobile Safari/537.36" "-"
181.42.0.1 - - [14/Feb/2024:06:34:07 -0500] "GET /css/screen/general.css HTTP/1.1" 200 133 "http://scielo.sld.cu/css/screen.css" "Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Mobile Safari/537.36" "-"
181.42.0.1 - - [14/Feb/2024:06:34:07 -0500] "GET /css/screen/styles.css HTTP/1.1" 200 3572 "http://scielo.sld.cu/css/screen.css" "Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Mobile Safari/537.36" "-"
181.42.0.1 - - [14/Feb/2024:06:34:07 -0500] "GET /img/es/artsrc.gif HTTP/1.1" 200 293 "http://scielo.sld.cu/scielo.php?script=sci_arttext&pid=S0864-21252003000200011" "Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Mobile Safari/537.36" "-"
181.42.0.1 - - [14/Feb/2024:06:34:07 -0500] "GET /img/es/fbpelogp.gif HTTP/1.1" 200 1353 "http://scielo.sld.cu/scielo.php?script=sci_arttext&pid=S0864-21252003000200011" "Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Mobile Safari/537.36" "-"
181.42.0.1 - - [14/Feb/2024:06:34:07 -0500] "GET /img/es/grp1c.gif HTTP/1.1" 200 214 "http://scielo.sld.cu/scielo.php?script=sci_arttext&pid=S0864-21252003000200011" "Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Mobile Safari/537.36" "-"
181.42.0.1 - - [14/Feb/2024:06:34:07 -0500] "GET /img/es/next.gif HTTP/1.1" 200 233 "http://scielo.sld.cu/scielo.php?script=sci_arttext&pid=S0864-21252003000200011" "Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Mobile Safari/537.36" "-"
181.42.0.1 - - [14/Feb/2024:06:34:07 -0500] "GET /img/es/prev.gif HTTP/1.1" 200 235 "http://scielo.sld.cu/scielo.php?script=sci_arttext&pid=S0864-21252003000200011" "Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Mobile Safari/537.36" "-"
44.204.0.1 - - [14/Feb/2024:06:34:02 -0500] "GET /robots.txt HTTP/1.1" 200 30 "-" "claudebot" "-"
44.204.0.1 - - [14/Feb/2024:06:34:02 -0500] "GET /img/es/iconRelatedOff.gif HTTP/1.1" 200 262 "-" "claudebot" "-"
52.167.0.1 - - [14/Feb/2024:06:34:02 -0500] "GET /scielo.php?script=sci_issuetoc&pid=0864-028920060001&lng=pt HTTP/1.1" 200 4544 "-" "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm) Chrome/116.0.1938.76 Safari/537.36" "-"
44.204.0.1 - - [14/Feb/2024:06:34:04 -0500] "GET /robots.txt HTTP/1.1" 200 30 "-" "claudebot" "-"
44.204.0.1 - - [14/Feb/2024:06:34:04 -0500] "GET /img/es/fbpelogp.gif HTTP/1.1" 200 1353 "-" "claudebot" "-"
85.192.0.1 - - [14/Feb/2024:06:34:05 -0500] "GET /img/collapsed2.png HTTP/1.1" 200 339 "http://scielo.sld.cu/scielo.php?script=sci_arttext&pid=S0034-75312003000100008" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36" "-"
47.128.0.1 - - [14/Feb/2024:06:34:03 -0500] "GET /cgi-bin/wxis.exe/iah/?IsisScript=iah%2Fiah.xis&base=article%5Edlibrary&exprSearch=RODRIGUEZ-PADILLA%2C+CRISTINA&format=iso.pft&indexSearch=AU&nextAction=lnk HTTP/1.1" 200 122 "-" "Mozilla/5.0 (Linux; Android 5.0) AppleWebKit/537.36 (KHTML, like Gecko) Mobile Safari/537.36 (compatible; Bytespider; [email protected])" "-"
47.128.0.1 - - [14/Feb/2024:06:34:01 -0500] "GET /iah/I/image/config.gif HTTP/1.1" 200 332 "http://scielo.sld.cu/cgi-bin/wxis.exe/iah/?IsisScript=iah%2Fiah.xis&base=article%5Edlibrary&exprSearch=DIAZ+PAEZ%2C+DEISI&format=iso.pft&indexSearch=AU&lang=i&nextAction=lnk" "Mozilla/5.0 (Linux; Android 5.0) AppleWebKit/537.36 (KHTML, like Gecko) Mobile Safari/537.36 (compatible; Bytespider; [email protected])" "-"
51.222.0.1 - - [14/Feb/2024:06:34:03 -0500] "GET /scielo.php?script=sci_arttext&pid=S0864-03192010000400008 HTTP/1.1" 200 20206 "-" "Mozilla/5.0 (compatible; AhrefsBot/7.0; +http://ahrefs.com/robot/)" "-"170.239.0.1 - - [14/Feb/2024:06:34:04 -0500] "GET /img/es/artsrc.gif HTTP/1.1" 200 293 "http://scielo.sld.cu/scielo.php?script=sci_arttext&pid=S2218-36202021000500463" "Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Mobile Safari/537.36" "-"
52.167.0.1 - - [14/Feb/2024:06:34:06 -0500] "GET /scielo.php?script=sci_arttext&pid=S2304-01062022000300011 HTTP/1.1" 200 24522 "-" "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm) Chrome/116.0.1938.76 Safari/537.36" "-"
54.82.0.1 - - [14/Feb/2024:06:34:07 -0500] "GET /robots.txt HTTP/1.1" 200 30 "-" "claudebot" "-"
3.85.0.1 - - [14/Feb/2024:06:34:07 -0500] "GET /robots.txt HTTP/1.1" 200 30 "-" "claudebot" "-"
54.82.0.1 - - [14/Feb/2024:06:34:07 -0500] "GET /img/en/author.gif HTTP/1.1" 200 219 "-" "claudebot" "-"
3.85.0.1 - - [14/Feb/2024:06:34:07 -0500] "GET /img/pt/fbpelogp.gif HTTP/1.1" 200 1353 "-" "claudebot" "-"
181.42.0.1 - - [14/Feb/2024:06:34:07 -0500] "GET /css/screen/layout.css HTTP/1.1" 200 427 "http://scielo.sld.cu/css/screen.css" "Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Mobile Safari/537.36" "-"
54.82.0.1 - - [14/Feb/2024:06:34:07 -0500] "GET /robots.txt HTTP/1.1" 200 30 "-" "claudebot" "-"
54.82.0.1 - - [14/Feb/2024:06:34:07 -0500] "GET /img/en/next.gif HTTP/1.1" 200 193 "-" "claudebot" "-"
181.42.0.1 - - [14/Feb/2024:06:34:07 -0500] "GET /article.js HTTP/1.1" 200 8231 "http://scielo.sld.cu/scielo.php?script=sci_arttext&pid=S0864-21252003000200011" "Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Mobile Safari/537.36" "-"
66.249.0.1 - - [14/Feb/2024:06:34:04 -0500] "GET /scielo.php?script=sci_arttext&pid=S1561-31942010000100030 HTTP/1.1" 200 26956 "-" "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.6167.139 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" "-"
85.192.0.1 - - [14/Feb/2024:06:34:01 -0500] "GET /applications/scielo-org/js/jquery-1.4.2.min.js HTTP/1.1" 200 72174 "http://scielo.sld.cu/scielo.php?script=sci_arttext&pid=S0034-75312003000100008" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36" "-"
85.192.0.1 - - [14/Feb/2024:06:34:01 -0500] "GET /applications/scielo-org/js/toolbox.js HTTP/1.1" 200 3653 "http://scielo.sld.cu/scielo.php?script=sci_arttext&pid=S0034-75312003000100008" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36" "-"
85.192.0.1 - - [14/Feb/2024:06:34:01 -0500] "GET /css/screen/general.css HTTP/1.1" 200 133 "http://scielo.sld.cu/css/screen.css" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36" "-"
85.192.0.1 - - [14/Feb/2024:06:34:01 -0500] "GET /css/screen/layout.css HTTP/1.1" 200 427 "http://scielo.sld.cu/css/screen.css" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36" "-"
85.192.0.1 - - [14/Feb/2024:06:34:02 -0500] "GET /img/es/alpha.gif HTTP/1.1" 200 209 "http://scielo.sld.cu/scielo.php?script=sci_arttext&pid=S0034-75312003000100008" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36" "-"
85.192.0.1 - - [14/Feb/2024:06:34:02 -0500] "GET /img/es/home.gif HTTP/1.1" 200 190 "http://scielo.sld.cu/scielo.php?script=sci_arttext&pid=S0034-75312003000100008" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36" "-"
85.192.0.1 - - [14/Feb/2024:06:34:02 -0500] "GET /img/es/next.gif HTTP/1.1" 200 233 "http://scielo.sld.cu/scielo.php?script=sci_arttext&pid=S0034-75312003000100008" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36" "-"
85.192.0.1 - - [14/Feb/2024:06:34:02 -0500] "GET /img/es/search.gif HTTP/1.1" 200 247 "http://scielo.sld.cu/scielo.php?script=sci_arttext&pid=S0034-75312003000100008" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36" "-"
85.192.0.1 - - [14/Feb/2024:06:34:03 -0500] "GET /img/common/iconPermalink.gif HTTP/1.1" 200 382 "http://scielo.sld.cu/scielo.php?script=sci_arttext&pid=S0034-75312003000100008" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36" "-"
85.192.0.1 - - [14/Feb/2024:06:34:03 -0500] "GET /img/es/e-mailt.gif HTTP/1.1" 200 586 "http://scielo.sld.cu/scielo.php?script=sci_arttext&pid=S0034-75312003000100008" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36" "-"
85.192.0.1 - - [14/Feb/2024:06:34:03 -0500] "GET /img/es/fulltxt.gif HTTP/1.1" 200 643 "http://scielo.sld.cu/scielo.php?script=sci_arttext&pid=S0034-75312003000100008" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36" "-"
85.192.0.1 - - [14/Feb/2024:06:34:03 -0500] "GET /img/es/iconCitedOff.gif HTTP/1.1" 200 288 "http://scielo.sld.cu/scielo.php?script=sci_arttext&pid=S0034-75312003000100008" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36" "-"
85.192.0.1 - - [14/Feb/2024:06:34:03 -0500] "GET /img/es/iconEmail.gif HTTP/1.1" 200 660 "http://scielo.sld.cu/scielo.php?script=sci_arttext&pid=S0034-75312003000100008" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36" "-"
85.192.0.1 - - [14/Feb/2024:06:34:03 -0500] "GET /img/es/iconLogin.gif HTTP/1.1" 200 601 "http://scielo.sld.cu/scielo.php?script=sci_arttext&pid=S0034-75312003000100008" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36" "-"
85.192.0.1 - - [14/Feb/2024:06:34:03 -0500] "GET /img/es/iconReferences.gif HTTP/1.1" 200 374 "http://scielo.sld.cu/scielo.php?script=sci_arttext&pid=S0034-75312003000100008" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36" "-"
85.192.0.1 - - [14/Feb/2024:06:34:05 -0500] "GET /google_metrics/get_h5_m5.php?issn=0034-7531&callback=jsonp1707910443945 HTTP/1.1" 200 152 "http://scielo.sld.cu/scielo.php?script=sci_arttext&pid=S0034-75312003000100008" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36" "-"
3 changes: 3 additions & 0 deletions tests/fixtures/usage.cub.log.processed
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
serverTime browserName browserVersion ip latitude longitude actionName
2024-02-14 11:34:06 CM 121.0.0.0 181.42.0.1 -33.4155 -70.6056 /scielo.php?script=sci_arttext&pid=S0864-21252003000200011
2024-02-14 11:34:05 CH 121.0.0.0 85.192.0.1 55.8193 37.6474 /google_metrics/get_h5_m5.php?issn=0034-7531&callback=jsonp1707910443945
2 changes: 2 additions & 0 deletions tests/fixtures/usage.cub.log.processed.summary
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
ignored_lines_static_resources ignored_lines_bot ignored_lines_invalid_method ignored_lines_invalid_user_agent ignored_lines_invalid_client_name ignored_lines_invalid_client_version ignored_lines_invalid_geolocation ignored_lines_invalid_local_datetime ignored_lines_http_redirects ignored_lines_http_errors total_ignored_lines total_imported_lines lines_parsed total_time
36 16 0 0 0 0 0 0 0 0 46 2 48 0.024617910385131836
Loading

0 comments on commit deea648

Please sign in to comment.