Cleaned code, added all features
Vedant Nanda committed May 4, 2019
0 parents commit 7666968
Showing 50 changed files with 42,720 additions and 0 deletions.
5 changes: 5 additions & 0 deletions .gitignore
@@ -0,0 +1,5 @@
util/__pycache__/*.pyc
util/datasets/__pycache__/*.pyc
effort_reward_unfairness/__pycache__/*.pyc
effort_reward_unfairness/learning_env/__pycache__/*.pyc
.DS_Store
Binary file added Fairness_constraints/.DS_Store
9 changes: 9 additions & 0 deletions Fairness_constraints/Benefit.m
@@ -0,0 +1,9 @@
function B = Benefit(Y, Y_predicted)
% Assuming larger labels are more desirable.
B = Y_predicted - Y + 1;
% Alternative benefit definitions (unused):
%epsilon = 1;
%B = (Y_predicted + epsilon)./(Y + epsilon) + epsilon;
%
%B = exp(Y_predicted - Y);
end
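A small worked example of the active benefit definition (illustrative, not part of the file): a prediction equal to the true label yields a benefit of 1, over-prediction yields more, under-prediction less.

% Hypothetical inputs: Y = [1; 2; 3], Y_predicted = [2; 2; 1]
B = Benefit([1; 2; 3], [2; 2; 1])   % returns [2; 1; -1]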
Binary file added Fairness_constraints/Data/.DS_Store
30 changes: 30 additions & 0 deletions Fairness_constraints/Data/Preprocess_student.m
@@ -0,0 +1,30 @@
% Preprocessing the Student Performance data

load('student')

% Columns 1-14 are the feature attributes, column 15 is the target,
% and column 2 is the group (protected) attribute.
X = student(:,1:14);
Y = student(:, 15);
G = student(:,2);


n = length(Y); % Number of training instances
k = size(X,2); % Number of features


% Homogenization: append a constant column so the model includes an intercept
X = [X, ones(n,1)];


% Shuffle the data and assign cross-validation folds
n_folds=5;
p = randperm(n);
X = X(p, :);
Y = Y(p, :);
G = G(p, :);
F = ceil((1:n)./(n/n_folds)); % fold label (1..n_folds) for each shuffled instance

save(['student',num2str(n_folds),'.mat'],'X', 'Y', 'G', 'F');
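For intuition, a tiny worked example of the fold assignment above (hypothetical sizes, not part of the file):

% With n = 10 instances and n_folds = 5:
F = ceil((1:10)./(10/5))   % yields [1 1 2 2 3 3 4 4 5 5], i.e. two instances per fold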
23 changes: 23 additions & 0 deletions Fairness_constraints/Data/Split.m
@@ -0,0 +1,23 @@
function Split( dataset_name, prcnt )
% Split the data into train and test sets; prcnt is the fraction of
% instances held out for the test set.
load([dataset_name,'.mat']);

n = length(Y);
p = randperm(n);

X_temp = X(p, :);
Y_temp = Y(p ,:);
G_temp = G(p ,:);

n_test = ceil(n * prcnt);
X_test = X_temp(1:n_test,:);
Y_test = Y_temp(1:n_test,:);
G_test = G_temp(1:n_test,:);

X_train = X_temp(n_test+1:n,:);
Y_train = Y_temp(n_test+1:n,:);
G_train = G_temp(n_test+1:n,:);

save([dataset_name,'Split.mat'], 'X_train', 'Y_train', 'G_train', 'X_test', 'Y_test', 'G_test');
end
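A minimal usage sketch (the 20% test fraction is an arbitrary, illustrative choice):

% From the Data folder: hold out 20% of 'student.mat' as a test set; writes 'studentSplit.mat'
Split('student', 0.2)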

455 changes: 455 additions & 0 deletions Fairness_constraints/Data/processed_student_data.csv


Binary file added Fairness_constraints/Data/student.mat
Binary file added Fairness_constraints/Data/student5.mat
Binary file added Fairness_constraints/Output/student5.mat
Binary file added Fairness_constraints/Output/student5_Yhat.mat
47 changes: 47 additions & 0 deletions Fairness_constraints/Social_welfare_constrained_ERM_regularized.m
@@ -0,0 +1,47 @@
function Social_welfare_constrained_ERM_regularized(dataset_name, Alpha, Tau)
% Least-squares regression with a social-welfare constraint: for each tau in
% Tau, the mean predicted label on the training data must be at least tau.
% (Alpha is currently unused; it only appears in commented-out code below.)
% Load X, Y, G, F
load(['Data/',dataset_name]);

n = length(Y);
k = size(X,2); % Number of features

% loss = zeros(length(Alpha),length(Tau));
W_all = zeros(k,length(Tau));
Y_predicted = zeros(n,length(Tau));


n_folds = max(F);
for fold=1:n_folds
fold % no trailing semicolon: echoes the current fold as progress output
X_test = X(F==fold,:);
Y_test = Y(F==fold,:);
X_train = X(F~=fold,:);
Y_train = Y(F~=fold,:);

n_train = length(Y_train); % Number of train instances
n_test = length(Y_test); % Number of test instances


tauIndex = 0;
for tau=Tau
tauIndex = tauIndex+1 % no trailing semicolon: echoes progress

cvx_begin quiet
cvx_precision high
variable W(k)
minimize( norm( X_train * W - Y_train, 2 ))
subject to
% Social-welfare constraint: average predicted label must be at least tau
ones(1,n_train)*(X_train * W) >= n_train*tau
cvx_end

Y_hat = X_test * W;

% loss(alphaIndex,tauIndex) = mean((Y_test - Y_hat).^2);
W_all(:, tauIndex) = W_all(:, tauIndex) + W;
Y_predicted(F==fold, tauIndex) = Y_hat;
end
end
W_all = W_all ./ n_folds; % average the learned weights across folds
save(['Output/',dataset_name,'_Yhat.mat'],'Y','Tau', 'W_all');
end
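In words: for each fold and each tau, the program solves a least-squares problem subject to the constraint that the average predicted label on the training set is at least tau. A hypothetical invocation from the MATLAB prompt, with an illustrative Tau grid (the CVX toolbox must be on the path; Alpha is unused by the active code):

% Reads Data/student5.mat and writes Output/student5_Yhat.mat
Social_welfare_constrained_ERM_regularized('student5', [], 0.1:0.1:0.9)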

67 changes: 67 additions & 0 deletions README
@@ -0,0 +1,67 @@
== Code for the paper ``On the Long-term Impact of Algorithmic Policies: Effort Unfairness and Feature Segregation through the Lens of Social Learning``, to appear at ICML 2019 ==

Authors:
Vedant Nanda, MPI-SWS
Hoda Heidari, ETH Zürich
Krishna P. Gummadi, MPI-SWS


Requirements:

Python 3.5.3 or above

Python Libraries needed to run the code:

asn1crypto==0.24.0
bcrypt==3.1.5
cffi==1.11.5
cryptography==2.4.2
cycler==0.10.0
idna==2.8
kiwisolver==1.0.1
matplotlib==3.0.2
numpy==1.16.0
pandas==0.23.4
paramiko==2.4.2
pkg-resources==0.0.0
pyasn1==0.4.5
pycparser==2.19
pydotplus==2.0.2
PyNaCl==1.3.0
pyparsing==2.3.0
python-dateutil==2.7.5
pytz==2018.9
scikit-learn==0.20.2
scipy==1.2.0
seaborn==0.9.0
six==1.12.0
sklearn==0.0
xlrd==1.2.0


To install these, cd to the ``effort_reward_fairness`` directory and execute ``pip install -r requirements.txt``. You might need sudo permissions for the installation.


= Reproducing the experiments and results in ``On the Long-term Impact of Algorithmic Policies: Effort Unfairness and Feature Segregation through the Lens of Social Learning`` =

Once dependencies have been installed, cd to the ``effort_reward_fairness`` directory.

1. First, generate explanations for users in both the train and the test set by running ``python experiment.py train`` and ``python experiment.py test``. These can be run simultaneously or sequentially (they generate explanations for different parts of the dataset), but both must finish before you proceed to the next step. Running ``python experiment.py test`` also produces the plot analyzing the different disparity measures across models, stored at ``./effort_reward_fairness/results/StudentPerf/disparity_plots/all_disp_in_one_test.pdf``; this is the plot in Fig 3 of the paper.

2. To generate the plots in Figs 1 and 2, run ``python effort_reward_function_plots.py train``. This produces average reward as a function of effort and average effort as a function of reward: ``Effort_vs_Average_Reward_together.pdf`` and ``Reward_vs_Average_Effort_together.pdf`` in the directory ``./effort_reward_fairness/results/StudentPerf/disparity_plots/``. These correspond to Figs 1 and 2 in the paper.

3. Once step 1 has completed, run ``python long_term_impact.py``. This generates a new set of feature vectors corresponding to the actions users take in response to the explanations produced in step 1.

4. After step 3 completes, execute ``python utility_thresholds.py`` to see the effect of algorithmic policies on segregation. This produces ``segregation_centralization.pdf``, ``segregation_atkinson.pdf``, and ``segregation_ACI.pdf`` in the directory ``./effort_reward_fairness/results/StudentPerf/segregation_plots``; these are included in Fig 4 of the paper.

5. To generate plots for models with fairness constraints, open ``experiment.py``, set the global variable ``FAIRNESS_CONSTRAINTS`` to True, and save the file. Then repeat steps 1, 3, and 4 above (step 2 is optional; its results for the fairness-constrained models are not included in the paper). **

6. Once step 5 is done, you will find ``segregation_centralization_fc.pdf``, ``segregation_atkinson_fc.pdf``, and ``segregation_ACI_fc.pdf`` in the directory ``./effort_reward_fairness/results/StudentPerf/segregation_plots``. These plots are included in Fig 5 of the paper.


== Preprocessing ==

Code for preprocessing the Student Performance Dataset (http://archive.ics.uci.edu/ml/datasets/Student+Performance) can be found in ``./util/datasets/data/student_performance/preprocessing.ipynb``. An executable Python script (``preprocessing.py``) exported from this notebook is also available in the same directory.


** To run the fairness-constrained models you need the file ``trained_linregfc_StudentPerf.mat`` in the ``./effort_reward_fairness`` directory. For easy reproduction of results this file is included in the repository; however, it is generated by the MATLAB code in ``./Fairness_constraints`` and can be found in ``./Fairness_constraints/Output`` after you run ``./Fairness_constraints/Social_welfare_constrained_ERM_regularized.m``.
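A rough sketch of that regeneration workflow in MATLAB (the Tau grid is illustrative, and the final copy into ``./effort_reward_fairness`` under the name ``trained_linregfc_StudentPerf.mat`` is an assumption based on the note above; the CVX toolbox must be installed):

% Run from the repository root inside MATLAB
cd Fairness_constraints/Data
Preprocess_student   % writes student5.mat into the Data folder
cd ..
Social_welfare_constrained_ERM_regularized('student5', [], 0.1:0.1:0.9)
% The output appears under ./Fairness_constraints/Output (e.g. student5_Yhat.mat);
% copy the trained-model file into ./effort_reward_fairness as trained_linregfc_StudentPerf.mat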