-
Notifications
You must be signed in to change notification settings - Fork 0
/
HighDimensionalDataApplication.R
133 lines (106 loc) · 5.55 KB
/
HighDimensionalDataApplication.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
library("neuralnet")
library("caret")
library("glmnet")
library("ncpen")
library("abess")
library("flare")
mutual_fund <- read_csv("mutual fund.csv")
# The function of prediction error
Prediction_Error <- function(y,yhat){mean((y-yhat)^2)}
set.seed(2500)
#Load fund data
fund_y = as.data.frame(mutual_fund[,1])
fund_x = as.data.frame(mutual_fund[,-1])
# Split data into train and test sets
n <- nrow(fund_y) # Number of observations in the dataset
train_size <- round(0.8 * n) # Number of observations for the training set
train_ind <- sample(1:n, train_size) # Randomly select indices for the training set
test_ind <- setdiff(1:n, train_ind) # Get the remaining indices for the test set
y_train <- fund_y$y[train_ind]
X_train <- fund_x[train_ind, ]
X_train_mean <- apply(X_train, 2, mean)
X_train_scale <- scale(X_train, X_train_mean, F)
y_train_mean <- mean(y_train)
y_train_scale <- scale(y_train, y_train_mean, F)
train_scale_df <- data.frame(y_train_scale, X_train_scale)
y_test <- fund_y$y[test_ind]
X_test <- fund_x[test_ind, ]
X_test_scale <- scale(X_test, X_train_mean, F)
y_test_scale <- scale(y_test, y_train_mean, F)
colnames(train_scale_df)[-1] <- paste("X", 1:(ncol(train_scale_df)-1), sep = "")
colnames(X_train_scale) <- paste0("X", 1:ncol(X_train_scale))
colnames(X_test_scale) <- paste0("X", 1:ncol(X_test_scale))
p=ncol(X_train_scale)
###Use ENET
# Convert y_train to a numeric vector (train function wants it)
y_train_scale_numeric <- as.numeric(y_train_scale)
#ENET using Cross Validation
#Find the optimal parameters using validation data
optimal_param_enet <- train(X_train_scale, y_train_scale_numeric,
method= "glmnet",
tuneGrid=expand.grid(alpha=seq(0.1,0.9,length=10),
lambda = seq(0.001, 0.02, length=20)),
metric = "RMSE",
trControl=trainControl(method = "cv", number = 10))
# Get the lambda and alpha combination that minimizes RMSE
best_lambda_enet <- optimal_param_enet$bestTune$lambda
best_alpha_enet <- optimal_param_enet$bestTune$alpha
# Re-fit the model with the selected lambda and alpha
mod_enet <- glmnet(X_train_scale, y_train_scale, alpha = best_alpha_enet, lambda = best_lambda_enet)
b.enet = as.vector(coef(mod_enet))[-1]
###Use ALASSO
#LASSO using Cross Validation
# Fit LASSO model using the training and validation data and select tuning parameters
lasso_cv <- cv.glmnet(X_train_scale, y_train_scale, type.measure = "mse", alpha = 1, nfolds = 10)
# Find the optimal lambda value chosen by cross-validation
optimal_lambda_lasso <- lasso_cv$lambda.min
# Fit the LASSO model using the optimal lambda and the full training data
mod_lasso <- glmnet(X_train_scale, y_train_scale, alpha = 1, lambda = optimal_lambda_lasso)
b.lasso = as.vector(coef(mod_lasso))[-1]
#Adaptive LASSO with 10-fold CV
weight <- b.lasso
weight <- ifelse(weight == 0, 0.00001, weight)
alasso_cv <- cv.glmnet(X_train_scale, y_train_scale, type.measure = "mse", alpha = 1, nfolds = 10,
penalty.factor = 1 / abs(weight), keep = TRUE)
optimal_lambda_alasso = alasso_cv$lambda.min
best_alasso_coef1 <- coef(alasso_cv, s = alasso_cv$lambda.min)
b.alasso = as.vector(best_alasso_coef1)[-1]
# Formula of the enet model
# Get the column names of your predictor variables
predictor_names <- colnames(X_train_scale)
# Get the indices of the non-zero coefficients
non_zero_indices_FM <- which(b.enet != 0)
# Get the corresponding predictor names for non-zero coefficients
selected_predictors_FM <- predictor_names[non_zero_indices_FM]
xcount.FM <- c(0,paste(selected_predictors_FM, sep=""))
Formula_enet <- as.formula(paste("y_train_scale~", paste(xcount.FM, collapse= "+")))
#Formula of the alasso model
# Get the indices of the non-zero coefficients
non_zero_indices_SM <- which(b.alasso != 0)
# Get the corresponding predictor names for non-zero coefficients
selected_predictors_SM <- predictor_names[non_zero_indices_SM]
xcount.SM <- c(0,paste(selected_predictors_SM, sep=""))
Formula_alasso <- as.formula(paste("y_train_scale~", paste(xcount.SM, collapse= "+")))
###Neural Network
#formula of the model without variable selection
xcount <- c(0,paste("X", 1:p, sep=""))
Formula_full <- as.formula(paste("y_train_scale~", paste(xcount, collapse= "+")))
#formula of the model with enet (already found in the previous chunk)
#formula of the model with alasso (already found in the previous chunk)
#fitting the neural networks
nn_mutualfunds_1 = neuralnet(Formula_full, data=train_scale_df, hidden=1, linear.output = TRUE)
nn_mutualfunds_2 = neuralnet(Formula_enet, data=train_scale_df, hidden=1, linear.output = TRUE)
nn_mutualfunds_3 = neuralnet(Formula_alasso, data=train_scale_df, hidden=1, linear.output = TRUE)
# Find the column indices of selected predictor variables for each model
indices_selected_enet <- which(colnames(X_test_scale) %in% selected_predictors_FM)
indices_selected_alasso <- which(colnames(X_test_scale) %in% selected_predictors_SM)
# Create data frames for input data with selected predictors
X_test_selected_enet <- X_test_scale[, indices_selected_enet]
X_test_selected_alasso <- X_test_scale[, indices_selected_alasso]
#prediction using test set
y_hat_full = compute(nn_mutualfunds_1,X_test_scale)$net.result
y_hat_enet = compute(nn_mutualfunds_2,X_test_selected_enet)$net.result
y_hat_alasso = compute(nn_mutualfunds_3,X_test_selected_alasso)$net.result
pred_error_full = Prediction_Error(y_test_scale, y_hat_full)
pred_error_enet = Prediction_Error(y_test_scale, y_hat_enet)
pred_error_alasso = Prediction_Error(y_test_scale, y_hat_alasso)