-
Notifications
You must be signed in to change notification settings - Fork 0
/
LASSO_bikesharing.Rmd
138 lines (96 loc) · 2.63 KB
/
LASSO_bikesharing.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
---
title: "LASSO Bike Sharining "
author: "Charles Gallagher"
date: "8/30/2020"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
# Clean Data
```{r}
# load packages
library(tidyverse)
library(tidymodels)
library(glmnet)
## Data Cleaning
BikeData <- read_csv("https://faculty.nps.edu/rbassett/Data/BikeData_Train.csv") # makes the correct datetime conversion
#names(BikeData)
#Make factors for these variables correct data
cats <- c("season", "yr","mnth" ,"holiday","weekday" ,"workingday", "weathersit") # make these factors everything else should be good one date the rest numeric
BikeData[cats] <-map(BikeData[cats], factor) #apply to dataframe
map_chr(BikeData, class)# check
#BikeData<-data.frame(BikeData) # make not a tibble odd things happen with RMSE function
glimpse(BikeData)
```
# Split Data
```{r}
# Split the data set training and test
set.seed(123)
split <- initial_split(BikeData, strata = cnt)
train <- training(split)
test <- testing(split)
```
# Preprocess Data
```{r}
rec <- recipe(cnt~., data = train) %>%
update_role(instant, new_role = "ID") %>%
step_dummy(all_nominal(), -all_outcomes()) %>%
step_rm(dteday,casual, registered, atemp) %>%
#step_normalize(all_numeric(), - all_outcomes()) %>%
step_poly(temp,hum,windspeed, degree = 2) %>%
step_interact(term = ~temp_poly_1:hum_poly_1)
rec_prep<- prep(rec)
juiced <- juice(rec_prep)
```
# Build Model
```{r}
val_set <- bootstraps(train,strata = cnt)
tune_spec <-linear_reg(penalty = tune(), mixture = 1) %>%
set_engine("glmnet")
wf <- workflow() %>% add_recipe(rec) %>% add_model(tune_spec)
lambda_grid <- grid_regular(penalty(range = c(.0001, 1)), levels = 50)
doParallel::registerDoParallel()
set.seed(1233)
lasso_grid <- tune_grid(
wf,
resamples = val_set,
grid = lambda_grid
)
lasso_grid %>% collect_metrics() %>%
filter(.metric == "rmse") %>%
ggplot(aes(penalty, mean))+
geom_point()+
geom_line()
best_rmse <-lasso_grid %>% select_best("rmse")
lasso_grid %>% collect_metrics() %>%
filter(.config==as.character(best_rmse[,2]))
lowest_rmse <- lasso_grid %>%
select_best("rmse", maximize = FALSE)
final_lasso <- finalize_workflow(
wf ,
lowest_rmse
)
```
```{r}
library(vip)
final_lasso %>%
fit(train) %>%
pull_workflow_fit() %>%
vi(lambda = lowest_rmse$penalty) %>%
mutate(
Importance = abs(Importance),
Variable = fct_reorder(Variable, Importance)
) %>%
ggplot(aes(x = Importance, y = Variable, fill = Sign)) +
geom_col() +
scale_x_continuous(expand = c(0, 0)) +
labs(y = NULL)
```
```{r}
last_fit(
final_lasso,
split
) %>%
collect_metrics()
```