Chapter 3 Elastic Net
Elastic Net Regression proceeds from the observation that typical OLS-based regression minimizes bias but may have great variance. Using L1 (Ridge) and L2 (LASSO) approaches, which apply penalties to model estimates, elastic net attempts to balance the trade-off between bias and variance by choosing the best penalties that minimize an information criterion or prediction error. Together, these both shrink coefficients and help with feature selection by forcing some of the coefficients to be zero. Because there are a large number of values the regularization parameter \(\lambda\) can take on, the typical solution is to use a method like k-fold cross-validation to test a number of \(\lambda\) values and choose the one with the one that matches a criterion like minimizing prediction error.
In the present study, we will use the cva.glmnet() function from the glmnetUtils package with 10 folds, and use mean absolute error (mae) to tune the hyperparameters.
3.1 Functions
dummy_vars <- c("o_value", "Mon", "Tues", "Wed", "Thu", "Fri", "Sat", "Sun"
, "morning", "midday", "evening", "night", "argument"
, "interacted", "lostSmthng", "late", "frgtSmthng", "brdSWk"
, "excSWk", "AnxSWk", "tired", "sick", "sleeping", "class"
, "music", "internet", "TV", "study")
c_fun <- function(m){
# final model characteristics
lambda <- min(m$fit$lambda)
coefs <- stats::coef(m$fit, s = lambda)
coefs <- coefs[, 1L, drop = TRUE]
coefs <- coefs[setdiff(x = names(coefs), y = "(Intercept)")]
fixedLassoInf(x,y,beta,lambda,sigma=sigma)
return(coefs)
}
elnet_fun <- function(sid, outcome, group, set, time){
# load the data
load(sprintf("%s/04-data/03-train-data/%s_%s_%s_%s_%s.RData",
res_path, sid, outcome, group, set, time))
d_train <- d_train %>% arrange(Full_Date) %>% select(-Full_Date)
d_train_cv <- rolling_origin(
d_train,
initial = 15,
assess = 1,
cumulative = TRUE
)
# set up the cross-valiation folds
# set.seed(234)
# d_train_cv <- vfold_cv(d_train, v = 10)
# set up the data and formula
mod_recipe <- recipe(
o_value ~ .
, data = d_train
) %>%
step_dummy(one_of(dummy_vars), -all_outcomes()) %>%
step_zv(all_numeric()) %>%
step_normalize(all_numeric()) %>%
# estimate the means and standard deviations
prep(training = d_train, retain = TRUE)
# set up the model specifications
tune_spec <-
logistic_reg(
penalty = tune()
, mixture = tune()
) %>%
set_engine("glmnet") %>%
set_mode("classification")
# set up the ranges for the tuning functions
elnet_grid <- grid_regular(penalty()
, mixture()
, levels = 10)
# set up the workflow: combine modeling spec with modeling recipe
set.seed(345)
elnet_wf <- workflow() %>%
add_model(tune_spec) %>%
add_recipe(mod_recipe)
# combine the workflow, and grid to a final tuning model
elnet_res <-
elnet_wf %>%
tune_grid(
resamples = d_train_cv
, grid = elnet_grid
, control = control_resamples(save_pred = T)
)
save(elnet_res, file = sprintf("%s/05-results/01-glmnet/01-tuning-models/%s_%s_%s_%s_%s.RData",
res_path, sid, outcome, group, set, time))
# plot the metrics across tuning parameters
p <- elnet_res %>%
collect_metrics() %>%
ggplot(aes(penalty, mean, color = mixture)) +
geom_point(size = 2) +
facet_wrap(~ .metric, scales = "free", nrow = 2) +
scale_x_log10(labels = scales::label_number()) +
scale_color_gradient(low = "gray90", high = "red") +
theme_classic()
ggsave(p, file = sprintf("%s/05-results/01-glmnet/02-tuning-figures/%s_%s_%s_%s_%s.png",
res_path, sid, outcome, group, set, time)
, width = 5, height = 8)
# select the best model based on AUC
best_elnet <- elnet_res %>%
# select_best("roc_auc")
select_best("accuracy")
# set up the workflow for the best model
final_wf <-
elnet_wf %>%
finalize_workflow(best_elnet)
# run the final best model on the training data and save
final_elnet <-
final_wf %>%
fit(data = d_train)
# load the split data
load(sprintf("%s/04-data/04-test-data/%s_%s_%s_%s_%s.RData",
res_path, sid, outcome, group, set, time))
# d_split$data$o_value <- factor(d_split$data$o_value)
x <- apply(d_split$data %>% select(-Full_Date, -o_value) %>% as.matrix(), 2, as.numeric)
x <- cbind(rep(1, times = nrow(x)), x)
y <- d_split$data$o_value %>% as.character() %>% as.numeric()
# run the final fit workflow of the training and test data together
final_fit <-
final_wf %>%
last_fit(d_split)
save(final_elnet, final_fit
, file = sprintf("%s/05-results/01-glmnet/03-final-training-models/%s_%s_%s_%s_%s.RData",
res_path, sid, outcome, group, set, time))
# final metrics (accuracy and roc)
final_metrics <- final_fit %>%
collect_metrics(summarize = T)
save(final_metrics
, file = sprintf("%s/05-results/01-glmnet/06-final-model-performance/%s_%s_%s_%s_%s.RData",
res_path, sid, outcome, group, set, time))
final_m <- final_elnet %>%
pull_workflow_fit()
lambda <- min(final_m$fit$lambda)#/nrow(x)
coefs <- stats::coef(final_m$fit, s = lambda)#, x = x, y = y, exact = TRUE)
final_coefs <- coefs[, 1L, drop = TRUE]
final_coefs <- final_coefs[setdiff(x = names(final_coefs), y = "(Intercept)")]
# sigma <- selectiveInference::estimateSigma(x, y, intercept=TRUE, standardize=TRUE)$sigmahat
# selectiveInference::fixedLassoInf(x[,-1],y, coefs, lambda/nrow(x), sigma=sigma, family="binomial",alpha=0.05)
best_elnet <- best_elnet %>%
mutate(nvars = length(final_coefs[final_coefs != 0]))
save(final_coefs, best_elnet,
file = sprintf("05-results/01-glmnet/07-final-model-param/%s_%s_%s_%s_%s.RData",
sid, outcome, group, set, time))
# variable importance
final_var_imp <- final_elnet %>%
pull_workflow_fit() %>%
vi() %>%
slice_max(Importance, n = 10)
save(final_var_imp
, file = sprintf("%s/05-results/01-glmnet/05-variable-importance/%s_%s_%s_%s_%s.RData",
res_path, sid, outcome, group, set, time))
# roc plot
p_roc <- final_fit %>%
collect_predictions() %>%
roc_curve(.pred_0, truth = o_value) %>%
autoplot() +
labs(title = sprintf("Participant %s: %s, %s, %s, %s"
, sid, outcome, group, set, time))
ggsave(p_roc, file = sprintf("%s/05-results/01-glmnet/04-roc-curves/%s_%s_%s_%s_%s.png",
res_path, sid, outcome, group, set, time)
, width = 5, height = 5)
rm(list = c("final_var_imp", "final_metrics", "final_wf", "final_elnet", "final_fit"
, "best_elnet", "elnet_res", "elnet_wf", "elnet_grid", "tune_spec", "mod_recipe"
, "p", "p_roc", "d_split", "d_test", "d_train", "d_train_cv"))
gc()
return(T)
}3.2 Run Models
plan(multisession(workers = 12L))
tibble(
file = sprintf("%s/04-data/02-model-data", res_path) %>% list.files()
) %>%
separate(file, c("SID", "outcome", "group", "set", "time"), sep = "_") %>%
mutate(time = str_remove_all(time, ".RData")
, mod = future_pmap(
list(SID, outcome, group, set, time)
, possibly(elnet_fun, NA_real_)
, .progress = T
, .options = future_options(
globals = c("res_path", "dummy_vars")
, packages = c("plyr", "tidyverse", "glmnet", "tidymodels", "vip")
)
)
)
closeAllConnections()