Chapter 5 Random Forest
Random forest models are a variant of decision tree classification algorithms that additionally draw on bagging (i.e. ensemble methods; bootstrapping with aggregation) methods. The name itself hints at how it different from classic decision trees; we have a forest instead of a tree. In so doing, random forest models draw upon a large number of decision trees of varying depth that are then aggregated. The random part comes from two sources. First, each tree in the forest in trained on a data set drawn with replacement from the training dataset (i.e. bootstrapped) as part of the bagging procedure. Observations left out of each model are termed out-of-bag observations (and collectively, the out-of-bag dataset). These are used to evaluate the performance of the tree. Second, the features used in each tree are also randomly drawn from the full of features. Each of these trees, based on their data generates a prediction given new data. The final prediction is based off the ensemble of trees – that is, the decision made by a majority of the trees (i.e. aggregation). Importantly, because of the random feature selection part of the procedure, random forest can also provide estimates of variable importance, indicating which features are critical to making a less error-prone classification.
Because random forest using bagging (i.e. bootstrapping with aggregation), we will have to perform a series of steps that make bootstrapping appropriate with time series data: differencing, Box-Cox transformations, and time-delay embedding. Essentially, differencing and Box-Cox transformations stabilize the mean and variance, respectively, to make the time series stationary (Priestley, 1988), and time-delay embedding quite literally embeds the sequence of the time series into predictor variables, which in effect preserves the order of the times series (Von Oertzen & Boker, 2010). We can easily back transform forecasts to their original scale.
In the present study, we used the tidymodels
package in R to estimate the random forest models by calling the rand_forest()
, setting the engine as “ranger”
, with importance = “permutation”
in order to extract variable importance, and the mode as “classification”
. The parameters tuned via rolling origin forecast validation were mtry
(i.e. the number of predictors that will be randomly sampled at each split when creating tree models) and min_n
(i.e. the minimum number of data points in a node that is required for the node to be split further), which were each set to 10 values. Next, we used the select_best()
function with the method
set to “accuracy”
to allow the algorithm to automatically pick the best combination of mtry
and min_n
that maximized classification accuracy. Next, we fit the final training model using the full training set and the best combination of mtry
and min_n
and tested the model using the training set. To evaluate the efficacy of the model, we extracted the classification accuracy rate (0-1) and the AUC using the collect_metrics()
function.
5.1 Set Up Data
5.1.1 Differencing and Box Cox
<- c("o_value", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"
dummy_vars "morning", "midday", "evening", "night", "argument"
, "interacted", "lostSmthng", "late", "frgtSmthng", "brdSWk"
, "excSWk", "AnxSWk", "tired", "sick", "sleeping", "class"
, "music", "internet", "TV", "study")
,
<- c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"
time_vars "morning", "midday", "evening", "night"
, "sin2p", "sin1p", "cos2p", "cos1p"
, "cub", "linear", "quad")
,
<- function(sid, outcome, group, set, time){
rf_fun load(sprintf("%s/04-data/02-model-data/%s_%s_%s_%s_%s.RData",
res_path, sid, outcome, group, set, time))# differencing and box-cox
<- d %>%
d mutate_if(is.factor, ~as.numeric(as.character(.))) %>%
mutate_at(vars(-one_of(c(dummy_vars, time_vars)), -Full_Date), log) %>%
mutate_at(vars(-one_of(c(dummy_vars, time_vars)), -Full_Date), ~. - lag(.))
# time delay embedding
<- map(d %>% select(-Full_Date, -one_of(time_vars)), ~embed(., 2)) %>% ldply(.) %>%
d_mbd group_by(.id) %>%
mutate(beep = 1:n()) %>%
ungroup() %>%
pivot_wider(names_from = ".id"
names_glue = "{.id}_{.value}"
, values_from = c("1", "2")) %>%
, bind_cols(d[-1,] %>% select(Full_Date)) %>%
select(-beep)
<- d_mbd %>%
d_mbd full_join(d %>% select(Full_Date, one_of(time_vars))) %>%
mutate_at(vars(contains(dummy_vars)), factor) %>%
# mutate(o_value_1 = factor(o_value_1)) %>%
drop_na()
# training and test sets
<- initial_time_split(d_mbd, prop = 0.75)
d_split <- training(d_split)
d_train <- testing(d_split)
d_test
<- d_train %>% arrange(lubridate::ymd_hm(Full_Date)) %>% select(-Full_Date)
d_train
## create the rolling_origin training and validation sets
<- ceiling(nrow(d_train)/3)
init
# set up the cross-valiation folds
<- rolling_origin(
d_train_cv
d_train, initial = init,
assess = 5,
skip = 1,
cumulative = TRUE
)
# set up the data and formula
<- recipe(
mod_recipe ~ .
o_value_1 data = d_train
, %>%
) step_zv(all_numeric(), contains(dummy_vars), contains(time_vars)) %>%
step_dummy(all_nominal(), -all_outcomes()) %>%
step_nzv(all_predictors(), unique_cut = 35) #%>%
# estimate the means and standard deviations
# prep(training = d_train, retain = TRUE)
# set up the model specifications
<-
tune_spec rand_forest(
mtry = tune()
trees = 1000
, min_n = tune()
, %>%
) set_engine("ranger", importance = "permutation") %>%
set_mode("classification")
# set up the workflow: combine modeling spec with modeling recipe
set.seed(345)
<- workflow() %>%
rf_wf add_model(tune_spec) %>%
add_recipe(mod_recipe)
# set up the ranges for the tuning functions
set.seed(345)
<- tune_grid(
tune_res
rf_wfresamples = d_train_cv
, grid = 20
,
)save(tune_res, file = sprintf("%s/05-results/03-rf/01-tuning-models/%s_%s_%s_%s_%s.RData",
res_path, sid, outcome, group, set, time))
# load(sprintf("%s/05-results/03-rf/01-tuning-models/%s_%s_%s_%s_%s.RData",
# res_path, sid, outcome, group, set, time))
# plot the metrics across tuning parameters
<- tune_res %>%
p collect_metrics() %>%
ggplot(aes(mtry, mean, color = min_n)) +
geom_point(size = 2) +
facet_wrap(~ .metric, scales = "free", nrow = 2) +
scale_x_log10(labels = scales::label_number()) +
scale_color_gradient(low = "gray90", high = "red") +
theme_classic()
ggsave(p, file = sprintf("%s/05-results/03-rf/02-tuning-figures/%s_%s_%s_%s_%s.png",
res_path, sid, outcome, group, set, time)width = 5, height = 8)
,
# select the best model based on AUC
<- tune_res %>%
best_rf # select_best("roc_auc")
select_best("accuracy")
# set up the workflow for the best model
<-
final_wf %>%
rf_wf finalize_workflow(best_rf)
# run the final best model on the training data and save
<-
final_rf %>%
final_wf fit(data = d_train)
<- final_rf %>%
final_m pull_workflow_fit()
<- final_m$fit$variable.importance
final_coefs
<- best_rf %>%
best_rf mutate(nvars = length(final_coefs[final_coefs != 0]))
save(final_coefs, best_rf,
file = sprintf("%s/05-results/03-rf/07-final-model-param/%s_%s_%s_%s_%s.RData",
res_path, sid, outcome, group, set, time))
# run the final fit workflow of the training and test data together
<-
final_fit %>%
final_wf last_fit(d_split)
save(final_rf, final_fit
file = sprintf("%s/05-results/03-rf/03-final-training-models/%s_%s_%s_%s_%s.RData",
,
res_path, sid, outcome, group, set, time))
# final metrics (accuracy and roc)
<- final_fit %>%
final_metrics collect_metrics(summarize = T)
save(final_metrics
file = sprintf("%s/05-results/03-rf/06-final-model-performance/%s_%s_%s_%s_%s.RData",
,
res_path, sid, outcome, group, set, time))
# variable importance
<- final_rf %>%
final_var_imp pull_workflow_fit() %>%
vi() %>%
slice_max(Importance, n = 10)
save(final_var_imp
file = sprintf("%s/05-results/03-rf/05-variable-importance/%s_%s_%s_%s_%s.RData",
,
res_path, sid, outcome, group, set, time))
# roc plot
<- final_fit %>%
p_roc collect_predictions() %>%
roc_curve(.pred_0, truth = o_value_1) %>%
autoplot() +
labs(title = sprintf("Participant %s: %s, %s, %s, %s"
, sid, outcome, group, set, time)) ggsave(p_roc, file = sprintf("%s/05-results/03-rf/04-roc-curves/%s_%s_%s_%s_%s.png",
res_path, sid, outcome, group, set, time)width = 5, height = 5)
,
rm(list = c("final_var_imp", "final_metrics", "final_wf", "final_rf", "final_fit"
"best_rf", "tune_res", "rf_wf", "tune_spec", "mod_recipe"
, "p", "p_roc", "d_split", "d_test", "d_train", "d_train_cv"))
, gc()
return(T)
}
5.1.2 Run Models
plan(multisession(workers = 12L))
<- tibble(
rf_res file = sprintf("%s/04-data/02-model-data", res_path) %>% list.files()
%>%
) separate(file, c("SID", "outcome", "group", "set", "time"), sep = "_") %>%
mutate(time = str_remove_all(time, ".RData")) %>%
mutate(
mod = future_pmap(
list(SID, outcome, group, set, time)
safely(rf_fun, NA_real_)
, .progress = T
, .options = future_options(
, globals = c("res_path", "dummy_vars", "time_vars")
packages = c("plyr", "tidyverse", "glmnet", "tidymodels", "vip")
,
)
)
) closeAllConnections()