Commit a9b1aea

source code

GuillaumeSalhaGalvan authored Sep 30, 2020
1 parent b35d3b3 commit a9b1aea
Showing 5 changed files with 601 additions and 0 deletions.
102 changes: 102 additions & 0 deletions environment.py
@@ -0,0 +1,102 @@
from functools import reduce
from scipy.special import expit
import numpy as np


# This class uses the user and playlist features datasets to simulate users' responses to a list of recommendations
class ContextualEnvironment():
def __init__(self, user_features, playlist_features, user_segment, n_recos):
self.user_features = user_features
self.playlist_features = playlist_features
self.user_segment = user_segment
self.n_recos = n_recos
self.th_segment_rewards = np.zeros(user_features.shape[0])
self.th_rewards = np.zeros(user_features.shape[0])
self.compute_optimal_theoretical_rewards()
self.compute_segment_optimal_theoretical_rewards()

# Computes expected reward for each user given their recommendations
def compute_theoretical_rewards(self, batch_user_ids, batch_recos):
batch_user_features = np.take(self.user_features, batch_user_ids, axis = 0)
batch_playlist_features = np.take(self.playlist_features, batch_recos, axis = 0)
n_users = len(batch_user_ids)
th_reward = np.zeros(n_users)
for i in range(n_users):
probas = expit(batch_user_features[i].dot(batch_playlist_features[i].T))
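            # Expected reward = probability of streaming at least one of the recommended playlists: 1 - prod_k (1 - p_k)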
th_reward[i] = 1 - reduce(lambda x,y : x * y, 1 - probas)
return th_reward

# Computes list of n recommendations with highest expected reward for each user
def compute_optimal_recos(self, batch_user_ids, n):
batch_user_features = np.take(self.user_features, batch_user_ids, axis = 0)
n_users = len(batch_user_ids)
probas = batch_user_features.dot(self.playlist_features.T)
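        # expit is monotonically increasing, so ranking playlists by these raw dot products is equivalent to ranking by streaming probabilities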
optim = np.argsort(-probas)[:, :n]
return optim

# Computes highest expected reward for each user
def compute_optimal_theoretical_rewards(self):
n_users = self.user_features.shape[0]
u = 0
step = 100000
while u < n_users:
users_ids = range(u, min(n_users, u + step))
opt_recos = self.compute_optimal_recos(users_ids, self.n_recos)
opt_rewards = self.compute_theoretical_rewards(users_ids, opt_recos)
self.th_rewards[u:min(n_users, u + step)] = opt_rewards
u += step
return

# Computes list of n recommendations with highest expected reward for each segment
def compute_segment_optimal_recos(self, n):
n_segments = len(np.unique(self.user_segment))
segment_recos = np.zeros((n_segments, n), dtype = np.int64)
for i in range(n_segments):
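            # Average streaming probability of each playlist across all users of segment i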
mean_probas = np.mean(expit(np.take(self.user_features, np.where(self.user_segment == i)[0], axis = 0).dot(self.playlist_features.T)), axis = 0)
            reward = 1 - reduce(lambda x,y : x * y, 1 + np.sort(-mean_probas)[:n]) # expected reward of the top-n recos based on segment-averaged probabilities (computed for reference, not used below)
segment_recos[i] = np.argsort(-mean_probas)[:n]
return segment_recos

# Computes highest expected reward for each segment
def compute_segment_optimal_theoretical_rewards(self):
n_users = self.user_features.shape[0]
u = 0
step = 100000
segment_recos = self.compute_segment_optimal_recos(self.n_recos)
while u < n_users:
            users_ids = range(u, min(n_users, u + step))
user_segment = np.take(self.user_segment, users_ids)
opt_recos = np.take(segment_recos, user_segment, axis = 0)
opt_rewards = self.compute_theoretical_rewards(users_ids, opt_recos)
            self.th_segment_rewards[u:min(n_users, u + step)] = opt_rewards
u += step
return

    # Given a list of users and their respective lists of recos (each of size self.n_recos), computes the
    # corresponding simulated rewards
def simulate_batch_users_reward(self, batch_user_ids, batch_recos):

# First, compute probability of streaming each reco and draw rewards accordingly
batch_user_features = np.take(self.user_features, batch_user_ids, axis = 0)
batch_playlist_features = np.take(self.playlist_features, batch_recos, axis = 0)
n_users = len(batch_user_ids)
n = len(batch_recos[0])
probas = np.zeros((n_users, n))
for i in range(n_users):
probas[i] = expit(batch_user_features[i].dot(batch_playlist_features[i].T)) # probability to stream each reco
        rewards = np.zeros((n_users, n))
        rewards_uncascaded = np.random.binomial(1, probas) # drawing rewards from probabilities
positive_rewards = set()

        # Then, for each user, all positive rewards after the first one are set to 0 (and the corresponding
        # playlists are treated as "unseen") to imitate a cascading browsing behavior.
        # Nonetheless, the same user can be drawn several times in the batch of a given round; therefore, a user
        # can still obtain several positive rewards - i.e. stream several playlists - within the same round,
        # consistently with the multiple-plays framework from the paper.
nz = rewards_uncascaded.nonzero()
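        # nonzero() returns (row, slot) index pairs in row-major order, so the loop below keeps only the first positive slot of each row (i.e. of each user draw)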
for i in range(len(nz[0])):
if nz[0][i] not in positive_rewards:
rewards[nz[0][i]][nz[1][i]] = 1
positive_rewards.add(nz[0][i])
return rewards
131 changes: 131 additions & 0 deletions main.py
@@ -0,0 +1,131 @@
from environment import ContextualEnvironment
from policies import KLUCBSegmentPolicy, RandomPolicy, ExploreThenCommitSegmentPolicy, EpsilonGreedySegmentPolicy, TSSegmentPolicy, LinearTSPolicy
import argparse
import json
import logging
import numpy as np
import pandas as pd
import time

# List of implemented policies
def set_policies(policies_name, user_segment, user_features, n_playlists):
    # Please see section 3.3 of the RecSys paper for a description of the policies
POLICIES_SETTINGS = {
'random' : RandomPolicy(n_playlists),
'etc-seg-explore' : ExploreThenCommitSegmentPolicy(user_segment, n_playlists, min_n = 100, cascade_model = True),
'etc-seg-exploit' : ExploreThenCommitSegmentPolicy(user_segment, n_playlists, min_n = 20, cascade_model = True),
'epsilon-greedy-explore' : EpsilonGreedySegmentPolicy(user_segment, n_playlists, epsilon = 0.1, cascade_model = True),
'epsilon-greedy-exploit' : EpsilonGreedySegmentPolicy(user_segment, n_playlists, epsilon = 0.01, cascade_model = True),
'kl-ucb-seg' : KLUCBSegmentPolicy(user_segment, n_playlists, cascade_model = True),
'ts-seg-naive' : TSSegmentPolicy(user_segment, n_playlists, alpha_zero = 1, beta_zero = 1, cascade_model = True),
'ts-seg-pessimistic' : TSSegmentPolicy(user_segment, n_playlists, alpha_zero = 1, beta_zero = 99, cascade_model = True),
'ts-lin-naive' : LinearTSPolicy(user_features, n_playlists, bias = 0.0, cascade_model = True),
'ts-lin-pessimistic' : LinearTSPolicy(user_features, n_playlists, bias = -5.0, cascade_model = True),
# Versions of epsilon-greedy-explore and ts-seg-pessimistic WITHOUT cascade model
'epsilon-greedy-explore-no-cascade' : EpsilonGreedySegmentPolicy(user_segment, n_playlists, epsilon = 0.1, cascade_model = False),
'ts-seg-pessimistic-no-cascade' : TSSegmentPolicy(user_segment, n_playlists, alpha_zero = 1, beta_zero = 99, cascade_model = False)
}

return [POLICIES_SETTINGS[name] for name in policies_name]


if __name__ == "__main__":

# Arguments
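    # Example invocation, with values matching the defaults declared below:
    #   python main.py --policies random,ts-seg-naive --n_recos 12 --l_init 3 --n_users_per_round 20000 --n_rounds 100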

parser = argparse.ArgumentParser()
parser.add_argument("--users_path", type = str, default = "data/user_features.csv", required = False,
help = "Path to user features file")
parser.add_argument("--playlists_path", type = str, default = "data/playlist_features.csv", required = False,
help = "Path to playlist features file")
parser.add_argument("--output_path", type = str, default = "results.json", required = False,
help = "Path to json file to save regret values")
parser.add_argument("--policies", type = str, default = "random,ts-seg-naive", required = False,
help = "Bandit algorithms to evaluate, separated by commas")
parser.add_argument("--n_recos", type = int, default = 12, required = False,
help = "Number of slots L in the carousel i.e. number of recommendations to provide")
parser.add_argument("--l_init", type = int, default = 3, required = False,
help = "Number of slots L_init initially visible in the carousel")
parser.add_argument("--n_users_per_round", type = int, default = 20000, required = False,
help = "Number of users randomly selected (with replacement) per round")
parser.add_argument("--n_rounds", type = int, default = 100, required = False,
help = "Number of simulated rounds")
parser.add_argument("--print_every", type = int, default = 10, required = False,
help = "Print cumulative regrets every 'print_every' round")

args = parser.parse_args()

logging.basicConfig(level = logging.INFO)
logger = logging.getLogger(__name__)

if args.l_init > args.n_recos:
raise ValueError('l_init is larger than n_recos')


# Data Loading and Preprocessing steps

logger.info("LOADING DATA")
logger.info("Loading playlist data")
playlists_df = pd.read_csv(args.playlists_path)

logger.info("Loading user data\n \n")
users_df = pd.read_csv(args.users_path)

n_users = len(users_df)
n_playlists = len(playlists_df)
n_recos = args.n_recos
print_every = args.print_every

user_features = np.array(users_df.drop(["segment"], axis = 1))
user_features = np.concatenate([user_features, np.ones((n_users,1))], axis = 1)
playlist_features = np.array(playlists_df)

user_segment = np.array(users_df.segment)

logger.info("SETTING UP SIMULATION ENVIRONMENT")
logger.info("for %d users, %d playlists, %d recommendations per carousel \n \n" % (n_users, n_playlists, n_recos))

cont_env = ContextualEnvironment(user_features, playlist_features, user_segment, n_recos)

logger.info("SETTING UP POLICIES")
logger.info("Policies to evaluate: %s \n \n" % (args.policies))

policies_name = args.policies.split(",")
policies = set_policies(policies_name, user_segment, user_features, n_playlists)
n_policies = len(policies)
n_users_per_round = args.n_users_per_round
n_rounds = args.n_rounds
overall_rewards = np.zeros((n_policies, n_rounds))
overall_optimal_reward = np.zeros(n_rounds)


# Simulations for Top-n_recos carousel-based playlist recommendations

logger.info("STARTING SIMULATIONS")
logger.info("for %d rounds, with %d users per round (randomly drawn with replacement)\n \n" % (n_rounds, n_users_per_round))
start_time = time.time()
for i in range(n_rounds):
# Select batch of n_users_per_round users
user_ids = np.random.choice(range(n_users), n_users_per_round)
overall_optimal_reward[i] = np.take(cont_env.th_rewards, user_ids).sum()
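        # th_rewards stores each user's optimal expected reward, so this sum is the oracle baseline for the round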
# Iterate over all policies
for j in range(n_policies):
# Compute n_recos recommendations
recos = policies[j].recommend_to_users_batch(user_ids, args.n_recos, args.l_init)
# Compute rewards
            rewards = cont_env.simulate_batch_users_reward(batch_user_ids = user_ids, batch_recos = recos)
# Update policy based on rewards
policies[j].update_policy(user_ids, recos, rewards, args.l_init)
overall_rewards[j,i] = rewards.sum()
# Print info
if i == 0 or (i+1) % print_every == 0 or i+1 == n_rounds:
logger.info("Round: %d/%d. Elapsed time: %f sec." % (i+1, n_rounds, time.time() - start_time))
logger.info("Cumulative regrets: \n%s \n" % "\n".join([" %s : %s" % (policies_name[j], str(np.sum(overall_optimal_reward - overall_rewards[j]))) for j in range(n_policies)]))


# Save results

logger.info("Saving cumulative regrets in %s" % args.output_path)
cumulative_regrets = {policies_name[j] : list(np.cumsum(overall_optimal_reward - overall_rewards[j])) for j in range(n_policies)}
with open(args.output_path, 'w') as fp:
json.dump(cumulative_regrets, fp)
48 changes: 48 additions & 0 deletions online_logistic_regression.py
@@ -0,0 +1,48 @@
from scipy.optimize import minimize
import numpy as np


# Disclaimer: this class is taken from:
# https://gdmarmerola.github.io/ts-for-contextual-bandits/


# Defining a class for Online Bayesian Logistic Regression
class OnlineLogisticRegression:

# Initializing
def __init__(self, lambda_, alpha, n_dim, bias, maxiter = 15):

        # Hyperparameters: lambda_ is the precision of the Gaussian prior (acts as an L2 regularizer),
        # alpha scales the variance used when sampling weights, maxiter bounds the optimizer
        self.lambda_ = lambda_; self.alpha = alpha; self.maxiter = maxiter

# Initializing parameters of the model
self.n_dim = n_dim
        # m: mean of the weight distribution, q: its inverse variance (one value per dimension)
self.m = np.zeros(self.n_dim)
self.m[-1] = bias
self.q = np.ones(self.n_dim) * self.lambda_

# Initializing weights
self.w = np.random.normal(self.m, self.alpha * (self.q)**(-1.0), size = self.n_dim)

# Loss function
def loss(self, w, *args):
X, y = args
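        # Negative log-posterior of the weights: Gaussian prior term + logistic log-likelihood, assuming labels y in {-1, +1}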
# Note: the bias is removed from the "regularization term" of the loss
return 0.5 * (self.q[:-1] * (w[:-1] - self.m[:-1])).dot(w[:-1] - self.m[:-1]) + np.sum([np.log(1 + np.exp(-y[j] * w.dot(X[j]))) for j in range(y.shape[0])])

# Gradient
def grad(self, w, *args):
X, y = args
return np.concatenate((self.q[:-1] * (w[:-1] - self.m[:-1]),0.0),axis = None) + (-1) * np.array([y[j] * X[j] / (1. + np.exp(y[j] * w.dot(X[j]))) for j in range(y.shape[0])]).sum(axis = 0)

# Fitting method
def fit(self, X, y):
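        # Bayesian update in two steps: the new mean m is set to the MAP estimate of the weights, then the
        # diagonal precision q is updated with a Laplace approximation of the posterior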

# Step 1, find w
self.w = minimize(self.loss, self.w, args = (X, y), jac = self.grad, method = "L-BFGS-B", options = {'maxiter': self.maxiter}).x
self.m = self.w

# Step 2, update q
        # P = sigmoid(X.dot(m)): predicted probabilities at the new mean
        P = (1 + np.exp(-X.dot(self.m))) ** (-1)
        self.q = self.q + (P*(1-P)).dot(X ** 2)
26 changes: 26 additions & 0 deletions plot_results.py
@@ -0,0 +1,26 @@
import argparse
import json
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
sns.set_style(style='darkgrid')


# Plots the evolution of the expected cumulative regret curves,
# for all tested policies and over all rounds
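# Example usage, assuming results.json was produced beforehand by main.py:
#   python plot_results.py --data_path results.json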
if __name__ == "__main__":

parser = argparse.ArgumentParser()
parser.add_argument("--data_path", type=str, default="results.json", required=False,
help="path to data")

args = parser.parse_args()

with open(args.data_path, 'r') as fp:
cumulative_regrets = json.load(fp)

for k,v in cumulative_regrets.items():
sns.lineplot(data = np.array(v), label=k)
plt.xlabel("Round")
plt.ylabel("Cumulative Regret")
plt.show()