Chapter 3 Elastic Net

Elastic Net Regression proceeds from the observation that typical OLS-based regression yields low-bias estimates that can nonetheless have high variance. By combining the L1 (LASSO) and L2 (Ridge) approaches, which apply penalties to the model estimates, elastic net attempts to balance the bias-variance trade-off by choosing the penalties that minimize an information criterion or the prediction error. Together, these penalties both shrink coefficients and aid feature selection by forcing some coefficients to exactly zero. Because the regularization parameter \(\lambda\) can take on a large number of values, the typical solution is to use a method like k-fold cross-validation to test a range of \(\lambda\) values and choose the one that best satisfies a criterion such as minimizing prediction error.
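For a binary outcome fit with glmnet, the penalized objective takes the standard form (stated here for reference rather than taken from the code below):

\[
\min_{\beta_0,\,\beta}\; -\frac{1}{n}\sum_{i=1}^{n}\ell\!\left(y_i,\; \beta_0 + x_i^{\top}\beta\right) \;+\; \lambda\left[\frac{1-\alpha}{2}\lVert\beta\rVert_2^2 \;+\; \alpha\lVert\beta\rVert_1\right],
\]

where \(\ell\) is the log-likelihood contribution of observation \(i\), \(\lambda \ge 0\) controls the overall amount of shrinkage, and \(\alpha \in [0, 1]\) mixes the Ridge (\(\alpha = 0\)) and LASSO (\(\alpha = 1\)) penalties.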

In the present study, we tune the elastic net within the tidymodels framework: a logistic regression with the glmnet engine is tuned over a regular grid of penalty (\(\lambda\)) and mixture (\(\alpha\)) values with tune_grid(), using rolling-origin resamples of the training data, and the hyperparameter combination with the highest classification accuracy is retained.
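In the glmnet engine, the mixture argument maps onto glmnet's alpha and penalty onto its lambda. As a minimal stand-alone sketch of a 50/50 blend, with x_mat and y_vec as hypothetical placeholders rather than objects from the pipeline below:

library(glmnet)
# alpha = 1 gives the pure LASSO (L1) penalty, alpha = 0 pure Ridge (L2);
# intermediate values blend the two. x_mat / y_vec are hypothetical placeholders.
fit_blend <- glmnet(x = x_mat, y = y_vec, family = "binomial", alpha = 0.5)
# coefficients at a chosen amount of shrinkage (here lambda = 0.01)
coef(fit_blend, s = 0.01)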

3.1 Functions

# indicator (dummy) variables converted in the modeling recipe below
dummy_vars <- c("o_value", "Mon", "Tues", "Wed", "Thu", "Fri", "Sat", "Sun"
                , "morning", "midday", "evening", "night", "argument"
                , "interacted", "lostSmthng", "late", "frgtSmthng", "brdSWk"
                , "excSWk", "AnxSWk", "tired", "sick", "sleeping", "class"
                , "music", "internet", "TV", "study")

c_fun <- function(m){
  # final model characteristics: pull the coefficients of a fitted glmnet
  # model at the smallest lambda on its path, dropping the intercept
  lambda <- min(m$fit$lambda)
  coefs <- stats::coef(m$fit, s = lambda)
  coefs <- coefs[, 1L, drop = TRUE]
  coefs <- coefs[setdiff(x = names(coefs), y = "(Intercept)")]
  # selectiveInference::fixedLassoInf(x, y, beta, lambda, sigma = sigma)
  return(coefs)
}
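Note that c_fun() is not called inside elnet_fun() below; the same extraction is inlined there. A hypothetical usage sketch, with final_elnet standing in for a fitted workflow object like the one created inside elnet_fun():

# hypothetical usage: extract the non-intercept coefficients from a fitted workflow
final_coefs <- c_fun(final_elnet %>% pull_workflow_fit())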

elnet_fun <- function(sid, outcome, group, set, time){
  # load the data
  load(sprintf("%s/04-data/03-train-data/%s_%s_%s_%s_%s.RData",
               res_path, sid, outcome, group, set, time))
  d_train <- d_train %>% arrange(Full_Date) %>% select(-Full_Date)
  
  # rolling-origin resamples: an initial window of 15 observations,
  # assessed one observation ahead, with cumulative (expanding) training sets
  d_train_cv <- rolling_origin(
    d_train, 
    initial = 15, 
    assess = 1,
    cumulative = TRUE
  )
  
  # alternative: set up standard 10-fold cross-validation folds
  # set.seed(234)
  # d_train_cv <- vfold_cv(d_train, v = 10)
  
  # set up the data and formula
  mod_recipe <- recipe(
    o_value ~ .
    , data = d_train
    ) %>%
    step_dummy(one_of(dummy_vars), -all_outcomes()) %>%
    step_zv(all_numeric()) %>%
    step_normalize(all_numeric()) %>%
    # estimate the means and standard deviations
    prep(training = d_train, retain = TRUE)
  
  # set up the model specifications 
  tune_spec <- 
    logistic_reg(
      penalty = tune()
      , mixture = tune()
    ) %>% 
    set_engine("glmnet") %>% 
    set_mode("classification")
  
  # set up the ranges for the tuning functions 
  elnet_grid <- grid_regular(penalty()
                             , mixture()
                             , levels = 10)
  # set up the workflow: combine modeling spec with modeling recipe
  set.seed(345)
  elnet_wf <- workflow() %>%
    add_model(tune_spec) %>%
    add_recipe(mod_recipe)
  
  # tune the workflow across the hyperparameter grid on the resamples
  elnet_res <- 
    elnet_wf %>% 
    tune_grid(
      resamples = d_train_cv
      , grid = elnet_grid
      , control = control_grid(save_pred = TRUE)
      )
  save(elnet_res, file = sprintf("%s/05-results/01-glmnet/01-tuning-models/%s_%s_%s_%s_%s.RData",
               res_path, sid, outcome, group, set, time))
  
  # plot the metrics across tuning parameters
  p <- elnet_res %>%
    collect_metrics() %>%
      ggplot(aes(penalty, mean, color = mixture)) +
      geom_point(size = 2) +
      facet_wrap(~ .metric, scales = "free", nrow = 2) +
      scale_x_log10(labels = scales::label_number()) +
      scale_color_gradient(low = "gray90", high = "red") +
      theme_classic()
  ggsave(p, file = sprintf("%s/05-results/01-glmnet/02-tuning-figures/%s_%s_%s_%s_%s.png",
               res_path, sid, outcome, group, set, time)
         , width = 5, height = 8)
  
  # select the best model based on accuracy (ROC AUC alternative left commented out)
  best_elnet <- elnet_res %>%
    # select_best("roc_auc")
    select_best("accuracy")
  
  # set up the workflow for the best model
  final_wf <- 
    elnet_wf %>% 
    finalize_workflow(best_elnet)
  
  # fit the finalized workflow on the full training data
  final_elnet <- 
    final_wf %>%
    fit(data = d_train) 
  
  # load the train/test split object (d_split)
  load(sprintf("%s/04-data/04-test-data/%s_%s_%s_%s_%s.RData",
               res_path, sid, outcome, group, set, time))
  # d_split$data$o_value <- factor(d_split$data$o_value)
  # x and y are only needed for the (currently commented-out) selective inference step below
  x <- apply(d_split$data %>% select(-Full_Date, -o_value) %>% as.matrix(), 2, as.numeric)
  x <- cbind(rep(1, times = nrow(x)), x)
  y <- d_split$data$o_value %>% as.character() %>% as.numeric()
  
  # fit the finalized workflow on the training data and evaluate it on the held-out test set
  final_fit <- 
    final_wf %>%
    last_fit(d_split) 
  save(final_elnet, final_fit
       , file = sprintf("%s/05-results/01-glmnet/03-final-training-models/%s_%s_%s_%s_%s.RData",
                        res_path, sid, outcome, group, set, time))
  
  # final metrics (accuracy and roc)
  final_metrics <- final_fit %>%
    collect_metrics(summarize = TRUE)
  save(final_metrics
       , file = sprintf("%s/05-results/01-glmnet/06-final-model-performance/%s_%s_%s_%s_%s.RData",
                        res_path, sid, outcome, group, set, time))
  
  # extract the coefficients of the final glmnet fit at the smallest lambda on its path
  final_m <- final_elnet %>% 
    pull_workflow_fit() 
  lambda <- min(final_m$fit$lambda) #/nrow(x)
  coefs <- stats::coef(final_m$fit, s = lambda) #, x = x, y = y, exact = TRUE)
  final_coefs <- coefs[, 1L, drop = TRUE]
  final_coefs <- final_coefs[setdiff(x = names(final_coefs), y = "(Intercept)")]
  # sigma <- selectiveInference::estimateSigma(x, y, intercept=TRUE, standardize=TRUE)$sigmahat
  # selectiveInference::fixedLassoInf(x[,-1], y, coefs, lambda/nrow(x), sigma=sigma, family="binomial", alpha=0.05)
  
  # record the number of non-zero coefficients retained in the final model
  best_elnet <- best_elnet %>%
    mutate(nvars = length(final_coefs[final_coefs != 0]))
  
  save(final_coefs, best_elnet,
       file = sprintf("%s/05-results/01-glmnet/07-final-model-param/%s_%s_%s_%s_%s.RData",
                      res_path, sid, outcome, group, set, time))
  
  # variable importance
  final_var_imp <- final_elnet %>% 
    pull_workflow_fit() %>% 
    vi() %>%
    slice_max(Importance, n = 10)
  save(final_var_imp
       , file = sprintf("%s/05-results/01-glmnet/05-variable-importance/%s_%s_%s_%s_%s.RData",
                        res_path, sid, outcome, group, set, time))
  
  # roc plot
  p_roc <- final_fit %>%
    collect_predictions() %>% 
    roc_curve(.pred_0, truth = o_value) %>% 
    autoplot() + 
    labs(title = sprintf("Participant %s: %s, %s, %s, %s"
                         , sid, outcome, group, set, time)) 
  ggsave(p_roc, file = sprintf("%s/05-results/01-glmnet/04-roc-curves/%s_%s_%s_%s_%s.png",
               res_path, sid, outcome, group, set, time)
         , width = 5, height = 5)
  
  rm(list = c("final_var_imp", "final_metrics", "final_wf", "final_elnet", "final_fit"
              , "best_elnet", "elnet_res", "elnet_wf", "elnet_grid", "tune_spec", "mod_recipe"
              , "p", "p_roc", "d_split", "d_test", "d_train", "d_train_cv"))
  gc()
  return(TRUE)
}
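A single call runs the full pipeline for one participant, outcome, and data configuration. The argument values below are hypothetical placeholders; in practice they are parsed from the training-data file names, as in the next section:

# hypothetical single run; "01", "angry", "all", "full", and "day" are placeholder values
elnet_fun(sid = "01", outcome = "angry", group = "all", set = "full", time = "day")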

3.2 Run Models

plan(multisession(workers = 12L))
tibble(
  file = sprintf("%s/04-data/02-model-data", res_path) %>% list.files() 
) %>%
  separate(file, c("SID", "outcome", "group", "set", "time"), sep = "_") %>%
  mutate(time = str_remove_all(time, "\\.RData")
         , mod = future_pmap(
           list(SID, outcome, group, set, time)
           , possibly(elnet_fun, NA_real_)
           , .progress = TRUE
           , .options = future_options(
             globals = c("res_path", "dummy_vars")
             , packages = c("plyr", "tidyverse", "glmnet", "tidymodels", "vip")
           )
           )
         ) 
closeAllConnections()
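Once all runs have finished, the worker sessions started by plan(multisession(...)) can also be released by resetting the plan; this is a small addition, not part of the original script, using the future package's standard idiom:

# optional: reset the future plan to release the background R sessions
plan(sequential)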