# Fairness Adult EN.py


import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the Adult dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 
           'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']
# The separator is a regular expression (a comma followed by one whitespace
# character), so it is written as a raw string to avoid the invalid "\s"
# escape sequence, and the python parsing engine is requested explicitly
# because the default C engine does not support regex separators.
# "?" entries are read as missing values.
data = pd.read_csv(url, names=columns, sep=r',\s', na_values="?", engine='python')

# Display basic data information
data.info()

# Separate the target column ('income') from the remaining feature columns
y = data['income']
x = data.drop(columns=['income'])

# Hold out 20% of the rows as a test set; the seed makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Column names grouped by how they are preprocessed
num_attribs = [
    'age',
    'fnlwgt',
    'education-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]
cat_attribs = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]

# Numerical columns: fill missing values with the column median
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode (categories unseen during fit are ignored at transform time)
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Combined preprocessing: each sub-pipeline is applied to its own column group
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs),
])

# Fit the preprocessing on the training data only, then apply it to both sets
x_train_prepared = full_pipeline.fit_transform(x_train)
x_test_prepared = full_pipeline.transform(x_test)

# Random Forest classifier instance used as the base estimator for the
# hyperparameter search. A fixed seed keeps cross-validation scores
# reproducible from run to run.
rf = RandomForestClassifier(random_state=42)

# Parameter distributions for Randomized Search
param_distributions = {
    'n_estimators': np.arange(100, 501, 50),
    # 'auto' is no longer accepted by scikit-learn (deprecated in 1.1,
    # removed in 1.3). For classifiers it was an alias of 'sqrt', so the old
    # list ['auto', 'sqrt'] held the same setting twice; 'log2' is offered
    # instead as a genuinely different second option.
    'max_features': ['sqrt', 'log2'],
    'max_depth': [None] + list(np.arange(10, 51, 10)),
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# RandomizedSearchCV object: 100 sampled candidates, 5-fold cross-validation,
# all available CPU cores
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_distributions,
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

# Perform randomized search
random_search.fit(x_train_prepared, y_train)

# Display best parameters and performance from Randomized Search
print("Best parameters:", random_search.best_params_)
print("Best score:", random_search.best_score_)

# Grid Search parameters
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees
    # Maximum number of features considered for splitting a node.
    # 'auto' is no longer accepted by scikit-learn (deprecated in 1.1,
    # removed in 1.3); for classifiers it was an alias of 'sqrt', so dropping
    # it keeps the same effective search space with fewer fits.
    'max_features': ['sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30],  # Maximum number of levels in each decision tree
    'min_samples_split': [2, 5, 10],  # Minimum number of data points placed in a node before the node is split
    'min_samples_leaf': [1, 2, 4]  # Minimum number of data points allowed in a leaf node
}

# GridSearchCV object: exhaustive search over param_grid with 5-fold
# cross-validation on all available CPU cores
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, verbose=2, n_jobs=-1)

# Perform grid search
grid_search.fit(x_train_prepared, y_train)

# Display best parameters and performance from Grid Search
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

# Random Forest classifiers configured from search results.
# max_features uses 'sqrt': the former 'auto' value meant the same thing for
# classifiers but is rejected by scikit-learn 1.3 and later.
rf_clf_randomsearch = RandomForestClassifier(
    n_estimators=250,
    max_depth=40,
    min_samples_split=10,
    min_samples_leaf=2,
    max_features='sqrt',
    random_state=42
)

rf_clf_gridsearch = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=2,
    max_features='sqrt',
    random_state=42
)

# Fit models
rf_clf_randomsearch.fit(x_train_prepared, y_train)
rf_clf_gridsearch.fit(x_train_prepared, y_train)

# Generate test-set predictions from both fitted models
y_pred_randomsearch = rf_clf_randomsearch.predict(x_test_prepared)
y_pred_gridsearch = rf_clf_gridsearch.predict(x_test_prepared)

# Report accuracy and the per-class metrics for the randomized-search model
accuracy_randomsearch = accuracy_score(y_test, y_pred_randomsearch)
report_randomsearch = classification_report(y_test, y_pred_randomsearch)
print("Accuracy Random Search:", accuracy_randomsearch)
print("Classification Report Random Search:\n", report_randomsearch)

# Report the same metrics for the grid-search model
accuracy_gridsearch = accuracy_score(y_test, y_pred_gridsearch)
report_gridsearch = classification_report(y_test, y_pred_gridsearch)
print("Accuracy Grid Search:", accuracy_gridsearch)
print("Classification Report Grid Search:\n", report_gridsearch)