logs_analysis.py
#!/usr/bin/env python3
#
# This script generates an analysis of the logs for the newspaper website.

import psycopg2
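
# The queries below assume a local PostgreSQL database named "news" with
# articles, authors and log tables; psycopg2 must be installed (for example
# via `pip install psycopg2-binary`).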


def get_pop_articles():
    """Join the articles and log tables from the news database to count
    the number of times each article was viewed and find the 3 most popular.

    Each article is identified by a unique slug that also appears in the
    path column of the log table, so the path can be stripped down to the
    slug and the join performed on that column.
    """
    db = psycopg2.connect("dbname=news")
    c = db.cursor()
    c.execute("select title, count(*) \
               from articles, log \
               where slug = replace(path, '/article/', '') \
               and path != '/' \
               group by title \
               order by count(*) desc \
               limit 3")
    posts = c.fetchall()
    db.close()
    return posts


def get_pop_authors():
    """Join the authors, articles and log tables from the news database to
    count the total number of views per author and rank them.

    The log and articles tables are joined the same way as above, but the
    rows are grouped by author name instead of by article title.
    """
    db = psycopg2.connect("dbname=news")
    c = db.cursor()
    c.execute("select name, count(*) \
               from articles, log, authors \
               where authors.id = articles.author \
               and slug = replace(path, '/article/', '') \
               and path != '/' \
               group by name \
               order by count(*) desc \
               limit 4")
    posts = c.fetchall()
    db.close()
    return posts


def get_days_errors():
    """Compute each day's error rate as the avg of a 0/1 flag per request
    (0 for '200 OK', 1 otherwise), then use that result in a subquery to
    find the days where the error rate exceeds 1%.
    """
    db = psycopg2.connect("dbname=news")
    c = db.cursor()
    c.execute("select error_table.* \
               from (select date(time), \
               avg(case when status = '200 OK' then 0 else 1 end) as error \
               from log \
               group by date(time)) as error_table \
               where error > 0.01")
    posts = c.fetchall()
    db.close()
    return posts


# Create an output file or overwrite the last created one
f = open("output.txt", "w")

# Output the most popular articles
f.write('3 most popular articles of all time:\n')
print('3 most popular articles of all time:\n')
pop_art = get_pop_articles()
for i in pop_art:
    f.write('"{}" -- {} views\n'.format(i[0], i[1]))
    print('"{}" -- {} views\n'.format(i[0], i[1]))
f.write('\n')
print('\n')

# Output the most popular authors
f.write('Most popular authors of all time:\n')
print('Most popular authors of all time:\n')
pop_auth = get_pop_authors()
for i in pop_auth:
    f.write('{} -- {} views\n'.format(i[0], i[1]))
    print('{} -- {} views\n'.format(i[0], i[1]))
f.write('\n')
print('\n')

# Output the days with more than 1% errors
f.write('Days where more than 1% of requests lead to errors:\n')
print('Days where more than 1% of requests lead to errors:\n')
days_err = get_days_errors()
for i in days_err:
    f.write('{} -- {}% errors\n'.format(i[0].strftime('%B %d, %Y'),
                                        '{0:.1f}'.format(i[1] * 100)))
    print('{} -- {}% errors\n'.format(i[0].strftime('%B %d, %Y'),
                                      '{0:.1f}'.format(i[1] * 100)))
f.close()
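
# Usage (a minimal sketch; assumes the "news" database described above is
# loaded locally):
#
#   python3 logs_analysis.py
#
# The report prints to stdout and is duplicated in output.txt. Illustrative
# output shape only (the title and count below are made up):
#
#   3 most popular articles of all time:
#   "Some Article Title" -- 1234 views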