-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFairness Adult EN.py
132 lines (101 loc) · 4.58 KB
/
Fairness Adult EN.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
# Load the Adult dataset.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
# Column names are supplied explicitly through `names=` below.
columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
]
# Fields are separated by a comma followed by whitespace. Splitting on ','
# with skipinitialspace=True strips that whitespace while keeping pandas'
# default (fast) parser; a regex separator would force the slower Python
# engine and emit a ParserWarning. After stripping, missing values appear as
# a bare "?", which na_values converts to NaN.
data = pd.read_csv(
    url,
    names=columns,
    sep=',',
    skipinitialspace=True,
    na_values="?",
)
# Display basic data information (column dtypes and non-null counts).
data.info()
# Target vector y is the 'income' column; feature matrix x is everything else.
y = data['income']
x = data.drop(columns='income')

# Column groups: numerical attributes and categorical attributes.
num_attribs = [
    'age', 'fnlwgt', 'education-num',
    'capital-gain', 'capital-loss', 'hours-per-week',
]
cat_attribs = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country',
]

# Hold out 20% of the rows as a test set (fixed seed for a repeatable split).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Numerical attributes: fill missing values with the column median.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
])

# Categorical attributes: fill missing values with the most frequent category,
# then one-hot encode; categories unseen during fitting are ignored.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own pipeline.
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs),
])

# Fit the preprocessing on the training split only, then apply the fitted
# transformer to the test split.
x_train_prepared = full_pipeline.fit_transform(x_train)
x_test_prepared = full_pipeline.transform(x_test)
# Base Random Forest classifier used as the estimator for the searches.
# It is seeded so cross-validation scores are repeatable from run to run.
rf = RandomForestClassifier(random_state=42)

# Candidate values sampled by the randomized search.
# NOTE: max_features='auto' is no longer accepted by scikit-learn (deprecated
# in 1.1, removed in 1.3). For classifiers it was an alias of 'sqrt', so the
# old ['auto', 'sqrt'] list tried the same setting twice; 'log2' is offered
# as the second, genuinely different option.
# Plain Python ints are used so best_params_ prints as ordinary numbers.
param_distributions = {
    'n_estimators': list(range(100, 501, 50)),
    'max_features': ['sqrt', 'log2'],
    'max_depth': [None, *range(10, 51, 10)],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# RandomizedSearchCV object: 100 sampled configurations, 5-fold
# cross-validation, all available cores.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_distributions,
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

# Perform randomized search
random_search.fit(x_train_prepared, y_train)

# Display best parameters and performance from Randomized Search
print("Best parameters:", random_search.best_params_)
print("Best score:", random_search.best_score_)
# Grid Search parameters (every combination is evaluated).
# NOTE: max_features='auto' is no longer accepted by scikit-learn (deprecated
# in 1.1, removed in 1.3). For classifiers it was an alias of 'sqrt', so it
# only duplicated the 'sqrt' entry and is dropped here.
param_grid = {
    'n_estimators': [100, 200, 300],      # Number of trees
    'max_features': ['sqrt', 'log2'],     # Maximum number of features considered for splitting a node
    'max_depth': [None, 10, 20, 30],      # Maximum number of levels in each decision tree
    'min_samples_split': [2, 5, 10],      # Minimum number of data points placed in a node before the node is split
    'min_samples_leaf': [1, 2, 4],        # Minimum number of data points allowed in a leaf node
}

# GridSearchCV object: 5-fold cross-validation, all available cores.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    verbose=2,
    n_jobs=-1,
)

# Perform grid search
grid_search.fit(x_train_prepared, y_train)

# Display best parameters and performance from Grid Search
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)
# Random Forest classifiers configured from the search results.
# The hyper-parameters are read from each search's best_params_ rather than
# being hard-coded: literal values copied from an earlier run can disagree
# with what the searches above actually found, and the previous literals used
# max_features='auto', which scikit-learn >= 1.3 rejects.
rf_clf_randomsearch = RandomForestClassifier(
    **random_search.best_params_,
    random_state=42,
)
rf_clf_gridsearch = RandomForestClassifier(
    **grid_search.best_params_,
    random_state=42,
)

# Fit both models on the full prepared training set.
rf_clf_randomsearch.fit(x_train_prepared, y_train)
rf_clf_gridsearch.fit(x_train_prepared, y_train)

# Predict on the held-out test data.
y_pred_randomsearch = rf_clf_randomsearch.predict(x_test_prepared)
y_pred_gridsearch = rf_clf_gridsearch.predict(x_test_prepared)

# Evaluate and print the results for each model.
print("Accuracy Random Search:", accuracy_score(y_test, y_pred_randomsearch))
print("Classification Report Random Search:\n", classification_report(y_test, y_pred_randomsearch))
print("Accuracy Grid Search:", accuracy_score(y_test, y_pred_gridsearch))
print("Classification Report Grid Search:\n", classification_report(y_test, y_pred_gridsearch))