Chapter 3 Elastic Net
Elastic Net Regression proceeds from the observation that typical OLS-based regression minimizes bias but may have great variance. Using L1 (Ridge) and L2 (LASSO) approaches, which apply penalties to model estimates, elastic net attempts to balance the trade-off between bias and variance by choosing the best penalties that minimize an information criterion or prediction error. Together, these both shrink coefficients and help with feature selection by forcing some of the coefficients to be zero. Because there are a large number of values the regularization parameter \(\lambda\) can take on, the typical solution is to use a method like k-fold cross-validation to test a number of \(\lambda\) values and choose the one with the one that matches a criterion like minimizing prediction error.
In the present study, we will use the cva.glmnet()
function from the glmnetUtils
package with 10 folds, and use mean absolute error (mae) to tune the hyperparameters.
3.1 Functions
<- c("o_value", "Mon", "Tues", "Wed", "Thu", "Fri", "Sat", "Sun"
dummy_vars "morning", "midday", "evening", "night", "argument"
, "interacted", "lostSmthng", "late", "frgtSmthng", "brdSWk"
, "excSWk", "AnxSWk", "tired", "sick", "sleeping", "class"
, "music", "internet", "TV", "study")
,
<- function(m){
c_fun # final model characteristics
<- min(m$fit$lambda)
lambda <- stats::coef(m$fit, s = lambda)
coefs <- coefs[, 1L, drop = TRUE]
coefs <- coefs[setdiff(x = names(coefs), y = "(Intercept)")]
coefs fixedLassoInf(x,y,beta,lambda,sigma=sigma)
return(coefs)
}
<- function(sid, outcome, group, set, time){
elnet_fun # load the data
load(sprintf("%s/04-data/03-train-data/%s_%s_%s_%s_%s.RData",
res_path, sid, outcome, group, set, time))<- d_train %>% arrange(Full_Date) %>% select(-Full_Date)
d_train
<- rolling_origin(
d_train_cv
d_train, initial = 15,
assess = 1,
cumulative = TRUE
)
# set up the cross-valiation folds
# set.seed(234)
# d_train_cv <- vfold_cv(d_train, v = 10)
# set up the data and formula
<- recipe(
mod_recipe ~ .
o_value data = d_train
, %>%
) step_dummy(one_of(dummy_vars), -all_outcomes()) %>%
step_zv(all_numeric()) %>%
step_normalize(all_numeric()) %>%
# estimate the means and standard deviations
prep(training = d_train, retain = TRUE)
# set up the model specifications
<-
tune_spec logistic_reg(
penalty = tune()
mixture = tune()
, %>%
) set_engine("glmnet") %>%
set_mode("classification")
# set up the ranges for the tuning functions
<- grid_regular(penalty()
elnet_grid mixture()
, levels = 10)
, # set up the workflow: combine modeling spec with modeling recipe
set.seed(345)
<- workflow() %>%
elnet_wf add_model(tune_spec) %>%
add_recipe(mod_recipe)
# combine the workflow, and grid to a final tuning model
<-
elnet_res %>%
elnet_wf tune_grid(
resamples = d_train_cv
grid = elnet_grid
, control = control_resamples(save_pred = T)
,
)save(elnet_res, file = sprintf("%s/05-results/01-glmnet/01-tuning-models/%s_%s_%s_%s_%s.RData",
res_path, sid, outcome, group, set, time))
# plot the metrics across tuning parameters
<- elnet_res %>%
p collect_metrics() %>%
ggplot(aes(penalty, mean, color = mixture)) +
geom_point(size = 2) +
facet_wrap(~ .metric, scales = "free", nrow = 2) +
scale_x_log10(labels = scales::label_number()) +
scale_color_gradient(low = "gray90", high = "red") +
theme_classic()
ggsave(p, file = sprintf("%s/05-results/01-glmnet/02-tuning-figures/%s_%s_%s_%s_%s.png",
res_path, sid, outcome, group, set, time)width = 5, height = 8)
,
# select the best model based on AUC
<- elnet_res %>%
best_elnet # select_best("roc_auc")
select_best("accuracy")
# set up the workflow for the best model
<-
final_wf %>%
elnet_wf finalize_workflow(best_elnet)
# run the final best model on the training data and save
<-
final_elnet %>%
final_wf fit(data = d_train)
# load the split data
load(sprintf("%s/04-data/04-test-data/%s_%s_%s_%s_%s.RData",
res_path, sid, outcome, group, set, time))# d_split$data$o_value <- factor(d_split$data$o_value)
<- apply(d_split$data %>% select(-Full_Date, -o_value) %>% as.matrix(), 2, as.numeric)
x <- cbind(rep(1, times = nrow(x)), x)
x <- d_split$data$o_value %>% as.character() %>% as.numeric()
y
# run the final fit workflow of the training and test data together
<-
final_fit %>%
final_wf last_fit(d_split)
save(final_elnet, final_fit
file = sprintf("%s/05-results/01-glmnet/03-final-training-models/%s_%s_%s_%s_%s.RData",
,
res_path, sid, outcome, group, set, time))
# final metrics (accuracy and roc)
<- final_fit %>%
final_metrics collect_metrics(summarize = T)
save(final_metrics
file = sprintf("%s/05-results/01-glmnet/06-final-model-performance/%s_%s_%s_%s_%s.RData",
,
res_path, sid, outcome, group, set, time))
<- final_elnet %>%
final_m pull_workflow_fit()
<- min(final_m$fit$lambda)#/nrow(x)
lambda <- stats::coef(final_m$fit, s = lambda)#, x = x, y = y, exact = TRUE)
coefs <- coefs[, 1L, drop = TRUE]
final_coefs <- final_coefs[setdiff(x = names(final_coefs), y = "(Intercept)")]
final_coefs # sigma <- selectiveInference::estimateSigma(x, y, intercept=TRUE, standardize=TRUE)$sigmahat
# selectiveInference::fixedLassoInf(x[,-1],y, coefs, lambda/nrow(x), sigma=sigma, family="binomial",alpha=0.05)
<- best_elnet %>%
best_elnet mutate(nvars = length(final_coefs[final_coefs != 0]))
save(final_coefs, best_elnet,
file = sprintf("05-results/01-glmnet/07-final-model-param/%s_%s_%s_%s_%s.RData",
sid, outcome, group, set, time))
# variable importance
<- final_elnet %>%
final_var_imp pull_workflow_fit() %>%
vi() %>%
slice_max(Importance, n = 10)
save(final_var_imp
file = sprintf("%s/05-results/01-glmnet/05-variable-importance/%s_%s_%s_%s_%s.RData",
,
res_path, sid, outcome, group, set, time))
# roc plot
<- final_fit %>%
p_roc collect_predictions() %>%
roc_curve(.pred_0, truth = o_value) %>%
autoplot() +
labs(title = sprintf("Participant %s: %s, %s, %s, %s"
, sid, outcome, group, set, time)) ggsave(p_roc, file = sprintf("%s/05-results/01-glmnet/04-roc-curves/%s_%s_%s_%s_%s.png",
res_path, sid, outcome, group, set, time)width = 5, height = 5)
,
rm(list = c("final_var_imp", "final_metrics", "final_wf", "final_elnet", "final_fit"
"best_elnet", "elnet_res", "elnet_wf", "elnet_grid", "tune_spec", "mod_recipe"
, "p", "p_roc", "d_split", "d_test", "d_train", "d_train_cv"))
, gc()
return(T)
}
3.2 Run Models
plan(multisession(workers = 12L))
tibble(
file = sprintf("%s/04-data/02-model-data", res_path) %>% list.files()
%>%
) separate(file, c("SID", "outcome", "group", "set", "time"), sep = "_") %>%
mutate(time = str_remove_all(time, ".RData")
mod = future_pmap(
, list(SID, outcome, group, set, time)
possibly(elnet_fun, NA_real_)
, .progress = T
, .options = future_options(
, globals = c("res_path", "dummy_vars")
packages = c("plyr", "tidyverse", "glmnet", "tidymodels", "vip")
,
)
)
) closeAllConnections()