-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path3b.py
94 lines (67 loc) · 3.15 KB
/
3b.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import numpy as np
import pandas as pd
# Load the data set with missing entries and the complete reference data set.
# Both files are space-separated and have no header row, so pandas labels the
# columns with integers (0, 1, ...).
df_censored = pd.read_csv('censored_data.csv', sep=' ', header=None)
df_uncensored = pd.read_csv('uncensored_data.csv', sep=' ', header=None)

# Find the column that contains missing values (NaN).
nan_values = df_censored.isna()
columns_with_nan = nan_values.any()
list_columns_with_nan = df_censored.columns[columns_with_nan].tolist()
# Everything below works on a single column label. Fail early with a clear
# message instead of crashing later on a formatting error when the data has
# zero or several columns with missing values.
if len(list_columns_with_nan) != 1:
    raise ValueError(
        "Expected exactly one column with NaN-values, found {}: {}".format(
            len(list_columns_with_nan), list_columns_with_nan))
list_columns_with_nan = list_columns_with_nan[0]

# Number of missing entries in that column.
n = nan_values[list_columns_with_nan].sum()
print("There is {} NaN-values in column {}.".format(n, list_columns_with_nan))

# Mean, median and max of the observed (non-NaN) values of that column;
# pandas skips NaN in these reductions by default.
nan_column_mean = df_censored[list_columns_with_nan].mean()
nan_column_median = df_censored[list_columns_with_nan].median()
nan_column_max = df_censored[list_columns_with_nan].max()

# Show the different values of the column with nan
print("\nValues for column {}:\n Mean value: {:.3f}\n Median value: {:.3f}\n Max value: {:.3f}\n"
      .format(list_columns_with_nan, nan_column_mean, nan_column_median, nan_column_max))

# Create new dataframes with the missing values filled by the mean, median and
# max. Each frame is a real copy so that df_censored itself keeps its NaNs
# (the rest of the script still needs them to locate the missing rows).
mean_fill_series = df_censored[list_columns_with_nan].fillna(nan_column_mean)
df_mean_fill = df_censored.copy()
df_mean_fill[list_columns_with_nan] = mean_fill_series

median_fill_series = df_censored[list_columns_with_nan].fillna(nan_column_median)
df_median_fill = df_censored.copy()
df_median_fill[list_columns_with_nan] = median_fill_series

max_fill_series = df_censored[list_columns_with_nan].fillna(nan_column_max)
df_max_fill = df_censored.copy()
df_max_fill[list_columns_with_nan] = max_fill_series

# True values for the rows that are missing in the censored data: drop every
# fully observed row from the uncensored frame and read the same column that
# was found to contain NaN (not a hard-coded column number).
non_nan_df = df_censored.dropna()
df_y = df_uncensored.drop(non_nan_df.index)
y = df_y[list_columns_with_nan]

# Mean Square Error of each constant imputation against the true values.
y_mean = np.full(len(y), nan_column_mean)
y_median = np.full(len(y), nan_column_median)
y_max = np.full(len(y), nan_column_max)
MSE_mean = np.sum((y_mean - y) ** 2) / n
MSE_median = np.sum((y_median - y) ** 2) / n
MSE_max = np.sum((y_max - y) ** 2) / n
print("MSE for the missing values when replaced with the mean = {0:.2f}, median = {1:.2f} and max = {2:.2f}. "
      .format(MSE_mean, MSE_median, MSE_max))

# Pairwise correlation between the columns of the censored data.
correlation_matrix = df_censored.corr()
print("\nCorrelation matrix for censored data:\n", correlation_matrix)
# Fit an ordinary least squares line (with intercept) that predicts column 1
# from column 0, using only the rows of the censored data without NaN.
df_train = df_censored.dropna()
X = np.c_[np.ones((df_train.shape[0], 1)), df_train[0].to_numpy()]
y = df_train[1].to_numpy()
# Solve the normal equations (X^T X) beta = X^T y directly instead of forming
# the explicit inverse; same solution, but numerically more accurate.
beta = np.linalg.solve(X.T @ X, X.T @ y)
print("\nEstimated coefficients:\nb_0 = {:.3f} \nb_1 = {:.3f}".format(beta[0],
                                                                    beta[1]))

# Rows of the censored data that were dropped above, i.e. the rows with a
# missing value.
df_x = df_censored.drop(df_train.index)
# Values in column 0 for the rows where column 1 is missing.
x = df_x[0]

# Estimate the missing column-1 values from the fitted line.
y_pred = (beta[0] + x * beta[1]).to_numpy()

# True values of the imputed column for those same rows. The regression
# predicts column 1, so the reference must be column 1 of the uncensored data
# (column 0 is the predictor and would make the error meaningless).
df_y = df_uncensored.drop(df_train.index)
y = df_y[1].to_numpy()

# Mean Square Error of the regression-based imputation.
MSE_reg = np.mean((y_pred - y) ** 2)
print("\nMSE for the missing values when replaced using regression on the first column = %.2f." % MSE_reg)