epsilon_greedy_starter.py
# From the course: Bayesian Machine Learning in Python: A/B Testing
# https://deeplearningcourses.com/c/bayesian-machine-learning-in-python-ab-testing
# https://www.udemy.com/bayesian-machine-learning-in-python-ab-testing
from __future__ import print_function, division
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future
import matplotlib.pyplot as plt
import numpy as np
NUM_TRIALS = 10000
EPS = 0.1
BANDIT_PROBABILITIES = [0.2, 0.5, 0.75]
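# Epsilon-greedy: on each trial, with probability EPS pick a random arm
# (explore); otherwise pick the arm with the highest current estimate
# (exploit). EPS = 0.1 means roughly 10% of pulls are exploratory.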
class BanditArm:
  def __init__(self, p):
    # p: the true win rate of this arm
    self.p = p
    self.p_estimate = 0.  # running estimate of the win rate
    self.N = 0.           # number of samples collected so far

  def pull(self):
    # draw a 1 with probability p
    return np.random.random() < self.p

  def update(self, x):
    # incremental mean: new_mean = ((N - 1) * old_mean + x) / N
    self.N += 1.
    self.p_estimate = ((self.N - 1) * self.p_estimate + x) / self.N
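# Sanity check of the incremental mean above (a worked example, not part
# of the script): after rewards 1, 0, 1 the estimate evolves
# 0 -> 1/1 -> 1/2 -> 2/3, which equals np.mean([1, 0, 1]).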
def experiment():
  bandits = [BanditArm(p) for p in BANDIT_PROBABILITIES]

  rewards = np.zeros(NUM_TRIALS)
  num_times_explored = 0
  num_times_exploited = 0
  num_optimal = 0
  optimal_j = np.argmax([b.p for b in bandits])
  print("optimal j:", optimal_j)

  for i in range(NUM_TRIALS):
    # use epsilon-greedy to select the next bandit
    if np.random.random() < EPS:
      num_times_explored += 1
      j = np.random.randint(len(bandits))  # explore: pick a random arm
    else:
      num_times_exploited += 1
      j = np.argmax([b.p_estimate for b in bandits])  # exploit: pick the best-looking arm

    if j == optimal_j:
      num_optimal += 1

    # pull the arm of the chosen bandit
    x = bandits[j].pull()

    # update rewards log
    rewards[i] = x

    # update the estimate for the bandit whose arm we just pulled
    bandits[j].update(x)

  # print mean estimates for each bandit
  for b in bandits:
    print("mean estimate:", b.p_estimate)

  # print total reward
  print("total reward earned:", rewards.sum())
  print("overall win rate:", rewards.sum() / NUM_TRIALS)
  print("num_times_explored:", num_times_explored)
  print("num_times_exploited:", num_times_exploited)
  print("num times selected optimal bandit:", num_optimal)

  # plot the results
  cumulative_rewards = np.cumsum(rewards)
  win_rates = cumulative_rewards / (np.arange(NUM_TRIALS) + 1)
  plt.plot(win_rates)
  plt.plot(np.ones(NUM_TRIALS) * np.max(BANDIT_PROBABILITIES))
  plt.show()


if __name__ == "__main__":
  experiment()
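# Back-of-the-envelope check (assuming the estimates have converged to the
# true probabilities): the long-run win rate should be roughly
#   (1 - EPS) * max(p) + EPS * mean(p)
#   = 0.9 * 0.75 + 0.1 * (0.2 + 0.5 + 0.75) / 3 ≈ 0.72,
# slightly below the 0.75 optimal line drawn in the plot.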