forked from casselldev/big-data-systems-course-project
-
Notifications
You must be signed in to change notification settings - Fork 0
/
vis.py
198 lines (161 loc) · 7.75 KB
/
vis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
# /usr/bin/env python3
# ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#
# Containerized Amazon Recommender System (CARS) Project
#
# Authors: Brianna Blain-Castelli, Nikkolas Irwin, Adam Cassell, and Andrew Munoz
# Date: 04/01/2020
# Purpose: Build a Big Data application using a Conda environment and Docker.
# Course: CS 636 Big Data Systems
# Project: CARS is an application that builds a recommender system from datasets provided by
# UCSD (see citation below).
#
# Dataset URL: https://nijianmo.github.io/amazon/index.html
#
# ***IMPORTANT*** You must download the dataset files for a particular category to your local machine yourself due
# to their size. As long as your dataset files are in the same directory as the Dockerfile, then
# they will be added to the volume and usable by the container as expected.
#
# ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#
# Citation: Justifying recommendations using distantly-labeled reviews and fined-grained aspects
# Jianmo Ni, Jiacheng Li, Julian McAuley
# Empirical Methods in Natural Language Processing (EMNLP), 2019
# PDF: http://cseweb.ucsd.edu/~jmcauley/pdfs/emnlp19a.pdf
#
# ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#
# Python Vis Module: Python module containing Vis class. To be used for all visualization types.
#
# ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
import pandas
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.style as sty
from plotly.subplots import make_subplots
from IPython.display import display
class Vis:
vis_dict = {}
def __init__(self, type, data, spark=None):
self.type = type # instance variable unique to each instance
self.data = data
if (self.type == "summary"):
self.vis_summary(self.data)
elif (self.type == "helpful"):
self.vis_helpful_review(self.data,spark)
elif (self.type == "prediction"):
self.vis_prediction(self.data)
elif (self.type == "time"):
self.vis_timeseries(self.data,spark)
else:
raise Exception("Invalid visualization type")
def vis_summary(self, data):
# fig1, ax1 = plt.subplots()
# data.boxplot(ax=ax1)
# ax1.set_title('Ratings Summary')
fig1, ax1 = plt.subplots()
num_bins = [0.5, 1.5, 2.5, 3.5, 4.5, 5.5]
data.hist(ax=ax1,bins=num_bins, edgecolor='white')
ax1.set_title('Ratings Distribution')
plt.ylabel('Review Count')
plt.xlabel('Ratings')
ax1.grid(False)
plt.show()
def vis_helpful_review(self,data,spark):
data.createOrReplaceTempView('TBL_HELPFUL_REVIEWS')
result = spark.sql('''SELECT reviewerID, overall, vote FROM TBL_HELPFUL_REVIEWS WHERE asin =
(SELECT asin AS `itemID`
FROM TBL_HELPFUL_REVIEWS
GROUP BY ASIN
ORDER BY count(ASIN) DESC LIMIT 1) ''')
#print(f'\n\nShowing the popularity over time of the most-reviewed item of the dataset...', '\n')
df = result.toPandas()
fig0 = make_subplots(rows=1, cols=2)
fig0.add_trace(
go.Scattergl(x=df['overall'], y=df['vote'], mode='markers', name="Rating/Vote Correlation"),
row=1, col=1
)
fig0.add_trace(
go.Bar(x=df['overall'], y=df['vote'], name= "Vote Disbrution Across Ratings"),
row=1, col=2
)
fig0.update_layout(width=900,height=450,title_text="Ratings vs. Votes for Most Popular Item")
fig0.update_xaxes(title_text="Ratings")
fig0.update_yaxes(title_text="Votes")
fig0.show()
# fig0 = px.scatter(df, x="overall", y="vote", title="Rating/Vote Correlation" ,width=400,height=400, render_mode="webgl")
# fig0.show()
# fig_bar = px.bar(df, x="overall", y="vote", title="Vote Disbrution Across Ratings", width=400,height=400)
# fig_bar.show()
# Create figure with secondary y-axis
fig = make_subplots(specs=[[{"secondary_y": True}]])
# Add traces
fig.add_trace(
go.Scattergl(x=df['reviewerID'], y=df['overall'], name="Ratings", mode='markers'),
secondary_y=False,
)
fig.add_trace(
go.Scattergl(x=df['reviewerID'], y=df['vote'], name="Votes", mode='markers'),
secondary_y=True,
)
# Add figure title
fig.update_layout(
title_text="Ratings/Votes Correlation"
)
# Set x-axis title
fig.update_xaxes(title_text="Reviews")
# Set y-axes titles
fig.update_yaxes(title_text="<b>Ratings</b>", secondary_y=False)
fig.update_yaxes(title_text="<b>Votes</b>", secondary_y=True)
fig.show()
def vis_prediction(self, data):
df = data.select(data['overall'], data['prediction']).toPandas()
print("\nPlotting visualizations...")
fig = px.scatter(df, x="overall", y="prediction", width=400, height=400,render_mode='webgl')
fig.show()
df['error'] = df['prediction'] - df['overall']
fig2 = px.histogram(df, x="error")
fig2.update_layout(
bargap=0.1,
title={
'text': "ALS Prediction Error Distribution",
'y':0.98,
'x':0.5,
'xanchor': 'center',
'yanchor': 'top'})
fig2.show()
def vis_timeseries(self, data, spark):
k = 10
data.createOrReplaceTempView('TBL_RATING_TIMESERIES')
top_items = spark.sql('''SELECT count(ASIN) AS `Review Count`,
asin AS `ASIN`
FROM TBL_RATING_TIMESERIES
GROUP BY ASIN
ORDER BY `Review Count` DESC''')
print(f'\n\nShowing the {k} most reviewed items for the dataset.', '\n')
#top_items.show(n=k)
ti = top_items.toPandas()
display(ti[0:k])
# Note: 'Most Popular Item' in the below query really refers to the 'Review Count' of the most popular item.
# Using 'Most Popular Item' alias for simplicity for plot labeling purposes
data.createOrReplaceTempView('TBL_RATING_TIMESERIES')
df = spark.sql('''SELECT count(ASIN) as `Most Popular Item`, unixReviewTime AS `Review Date` FROM TBL_RATING_TIMESERIES WHERE asin =
(SELECT asin AS `itemID`
FROM TBL_RATING_TIMESERIES
GROUP BY ASIN
ORDER BY count(ASIN) DESC LIMIT 1)
GROUP BY `Review Date` ORDER BY `Review Date` ''')
print(f'\n\nShowing the popularity over time of the most-reviewed item of the dataset...', '\n')
#df.show(n=k)
# Convert milliseconds to date
df_pandas = df.toPandas()
# Convert date string to pyspark date type
df_pandas['Review Date'] = pandas.to_datetime(df_pandas['Review Date'], unit='s')
# display(df_pandas)
# gca stands for 'get current axis'
ax = plt.gca()
df_pandas.plot(kind='line',x='Review Date',y='Most Popular Item',figsize=(16, 4),ax=ax)
plt.ylabel('Review Count')
#df.plot(kind='line',x='RT',y='num_pets', color='red', ax=ax)
plt.show()