Chapter 2 Data Cleaning
2.2 ESM Data Setup
Next, we need to make sure that all time information for IPCS is available. Specifically, this will allow us to control for overnight periods and unequal spacing between measurement occasions.
2.2.1 Timing
First, we need to create empty rows in the data where there are missing assessments from the four target surveys per day as well as for the overnight periods. The function below uses the time stamp to figure out which blocks are missing and add those empty rows by indexing the time stamp of collected surveys as well as participants chosen start times.
<- function(d){
missing_fun <- unique(d$StartDate) # get first day
first_day <- unique(d$`Hour Block 1`) # get first hour block
hourBlock <- max(d$Day); max_day <- ifelse(max_day < 14, 14, max_day) # get number of days
max_day <- d %>% #mutate(StartDate = ifelse(is.na(StartDate), min(Date, na.), StartDate))
d2 full_join(crossing(
Day = seq(0,max_day,1),
HourBlock = 1:6,
StartDate = first_day,
`Hour Block 1` = hourBlock)) %>% # cross existing data with "perfect" data
arrange(Day, HourBlock) %>%
mutate(Date = StartDate + Day,
Hour = ifelse(is.na(Hour), `Hour Block 1` + (HourBlock-1)*4, Hour),
Minute = ifelse(is.na(Minute), "00", Minute))
$Date[d2$Hour > 23] <- d2$Date[d2$Hour > 23] + 1 # some day blocks span days
d2$Hour[d2$Hour > 23] <- d2$Hour[d2$Hour > 23] - 24 # some day blocks span days
d2<- d2 %>% mutate(
d2 Full_Date = sprintf("%s %s:%s", as.character(Date), Hour, Minute)) %>%
select(-`Hour Block 1`, -StartDate)
}
# create a data frame of timing info
<- emo %>% full_join(ds8) %>% full_join(bfi) %>%
ipcs_times select(SID, StartDate, Date, Hour, Minute, Day, `Hour Block 1`, HourBlock) %>%
distinct() %>%
mutate(Minute = str_remove_all(Minute, ".csv"),
Minute = ifelse(as.numeric(Minute) < 10, sprintf("0%s", Minute), Minute)) %>%
arrange(SID, Date) %>%
group_by(SID) %>%
nest() %>%
ungroup() %>%
mutate(data = map(data, missing_fun)) %>%
unnest(data) %>%
arrange(SID, Date, Hour) %>%
group_by(SID) %>%
mutate(all_beeps = seq(1, n(), 1)) %>%
ungroup()
Here’s the result
ipcs_times
## # A tibble: 23,615 × 8
## SID Date Hour Minute Day HourBlock Full_Date all_beeps
## <chr> <date> <dbl> <chr> <dbl> <dbl> <chr> <dbl>
## 1 01 2018-10-22 15 23 0 1 2018-10-22 15:23 1
## 2 01 2018-10-22 19 00 0 2 2018-10-22 19:00 2
## 3 01 2018-10-22 23 23 0 3 2018-10-22 23:23 3
## 4 01 2018-10-22 23 25 0 3 2018-10-22 23:25 4
## 5 01 2018-10-23 3 00 0 4 2018-10-23 3:00 5
## 6 01 2018-10-23 7 00 0 5 2018-10-23 7:00 6
## 7 01 2018-10-23 11 00 0 6 2018-10-23 11:00 7
## 8 01 2018-10-23 17 50 1 1 2018-10-23 17:50 8
## 9 01 2018-10-23 19 39 1 2 2018-10-23 19:39 9
## 10 01 2018-10-23 23 00 1 3 2018-10-23 23:00 10
## # … with 23,605 more rows
2.2.2 Personality
Now it’s time to wrangle the personality data. As a reminder, personality was assessed using the full BFI-2 (Soto & John, 2017). The scale was administered using a planned missing data design (Revelle et al., 2016). We have previously demonstrated both the between- and within-person construct validity of assessing personality using planned missing designs using the BFI-2 (https://osf.io/pj9sy/). The planned missingness was done within each Big Five trait separately, with three items from each trait included at each timepoint (75% missingness). Each item was answered relative to what a participant was just doing on a 5-point Likert-like scale from 1 “disagree strongly” to 5 “agree strongly.” Items for each person at each assessment were determined by pulling 3 numbers (1 to 12) from a uniform distribution. The order of the resulting 15 items were then randomized before being displayed to participants.
2.2.2.1 Wrangle Raw Data
# join with codebook, reverse code, composite within facets and spread to wide format
<- bfi %>%
bfi_wide select(SID, Date, Hour, Minute, trait, facet, value = responses2, reverse_code) %>%
mutate(Minute = str_remove_all(Minute, ".csv"),
Minute = ifelse(as.numeric(Minute) < 10, sprintf("0%s", Minute), Minute),
Full_Date = sprintf("%s %s:%s", as.character(Date), Hour, Minute)) %>%
mutate(value = as.numeric(value),
value = ifelse(!is.na(reverse_code) & reverse_code == "yes", reverse.code(-1, value, mini = 1, maxi = 5), value)) %>%
select(-reverse_code) %>%
group_by(SID, trait, facet, Full_Date) %>%
summarize(value = mean(value, na.rm = T)) %>%
ungroup() %>%
pivot_wider(names_from = c("trait", "facet")
values_from = "value"
, names_sep = "_") %>%
, group_by(SID) %>%
arrange(SID, lubridate::ymd_hm(Full_Date)) %>%
mutate(all_beeps = seq(1, n(), 1)) %>%
ungroup()
2.2.2.2 Multiple Imputation
These data were collected using a planned missing design, so we need to impute data for the planned missing components.
# run MI
<- data.frame(unclass(bfi_wide %>% select(-Full_Date)))
bfi_mi set.seed(5)
<- amelia(bfi_mi, m = 1, ts = "all_beeps", cs = "SID")$imputations[[1]] %>%
bfi_mi as_tibble() %>%
full_join(bfi_wide %>% select(SID, Full_Date, all_beeps)) %>%
select(-all_beeps)
## -- Imputation 1 --
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
bfi_mi
## # A tibble: 8,672 × 17
## SID agreeableness_C… agreeableness_R… agreeableness_T… conscientiousne… conscientiousne… conscientiousne… extraversion_As…
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 01 1.45 1 3 4 4 4.79 4
## 2 01 2.43 3 2 1 1 3.68 4
## 3 01 2.86 1.5 1.70 1.14 3.03 3 2
## 4 01 3.39 2 4 4.33 4 4 2
## 5 01 5 2 3.85 3.04 3 3.04 4.36
## 6 01 4 4.59 4 1.91 5 5 2.76
## 7 01 2 1 3.88 2.18 3 1.61 2.75
## 8 01 2 6.30 2 4 2.96 5 4
## 9 01 3.26 2.80 3 3.17 2 2 1.07
## 10 01 4.5 3.23 2.39 3.5 1.63 3.09 2
## # … with 8,662 more rows, and 9 more variables: extraversion_Energy.Level <dbl>, extraversion_Sociability <dbl>,
## # neuroticism_Anxiety <dbl>, neuroticism_Depression <dbl>, neuroticism_Emotional.Volatility <dbl>,
## # openness_Aesthetic.Sensitivity <dbl>, openness_Creative.Imagination <dbl>, openness_Intellectual.Curiosity <dbl>,
## # Full_Date <chr>
Personality data:
bfi_mi
## # A tibble: 8,672 × 17
## SID agreeableness_C… agreeableness_R… agreeableness_T… conscientiousne… conscientiousne… conscientiousne… extraversion_As…
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 01 1.45 1 3 4 4 4.79 4
## 2 01 2.43 3 2 1 1 3.68 4
## 3 01 2.86 1.5 1.70 1.14 3.03 3 2
## 4 01 3.39 2 4 4.33 4 4 2
## 5 01 5 2 3.85 3.04 3 3.04 4.36
## 6 01 4 4.59 4 1.91 5 5 2.76
## 7 01 2 1 3.88 2.18 3 1.61 2.75
## 8 01 2 6.30 2 4 2.96 5 4
## 9 01 3.26 2.80 3 3.17 2 2 1.07
## 10 01 4.5 3.23 2.39 3.5 1.63 3.09 2
## # … with 8,662 more rows, and 9 more variables: extraversion_Energy.Level <dbl>, extraversion_Sociability <dbl>,
## # neuroticism_Anxiety <dbl>, neuroticism_Depression <dbl>, neuroticism_Emotional.Volatility <dbl>,
## # openness_Aesthetic.Sensitivity <dbl>, openness_Creative.Imagination <dbl>, openness_Intellectual.Curiosity <dbl>,
## # Full_Date <chr>
Now back to long format
<- bfi_mi %>%
bfi_long pivot_longer(cols = c(-SID, -Full_Date)
names_to = c("trait", "facet")
, values_to = "value"
, names_sep = "_") %>%
, mutate(category = "BFI-2")
2.2.3 Other Measured Features
Emotion and Situation (Binary and DIAMONDS) Data (not planned missing, so no need to impute):
<- emo %>%
features full_join(sit) %>%
full_join(ds8) %>%
select(SID, category, trait, facet, itemname, Date, Hour, Minute, Day, HourBlock, value = responses2) %>%
mutate(Minute = str_remove_all(Minute, ".csv"),
Minute = ifelse(as.numeric(Minute) < 10, sprintf("0%s", Minute), Minute),
Full_Date = sprintf("%s %s:%s", as.character(Date), Hour, Minute),
value = as.numeric(value)) %>%
group_by(SID, category, trait, facet, Full_Date) %>%
summarize(value = max(value)) %>%
ungroup()
features
## # A tibble: 312,010 × 6
## SID category trait facet Full_Date value
## <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 01 Affect afraid afraid 2018-10-22 15:23 2
## 2 01 Affect afraid afraid 2018-10-22 23:23 3
## 3 01 Affect afraid afraid 2018-10-22 23:25 3
## 4 01 Affect afraid afraid 2018-10-23 17:50 4
## 5 01 Affect afraid afraid 2018-10-23 19:39 2
## 6 01 Affect afraid afraid 2018-10-24 0:00 2
## 7 01 Affect afraid afraid 2018-10-24 11:44 3
## 8 01 Affect afraid afraid 2018-10-24 15:37 2
## 9 01 Affect afraid afraid 2018-10-24 20:46 3
## 10 01 Affect afraid afraid 2018-10-25 21:07 4
## # … with 312,000 more rows
2.2.4 Timing Features
Finally, we’ll create the timing features. These were created from the time stamps collected with each survey based on approaches used in other studies of idiographic prediction (e.g., Fisher & Soyster, 2019). To create these, we created time of day (4; morning, midday, evening, night) and day of the week dummy codes (7). Next, we create a cumulative time variable (in hours) from first beep (not used in analyses) that we used to create linear, quadratic, and cubic time trends (3) as well as 1 and 2 period sine and cosine functions across each 24 period (e.g., 2 period sine = {cumulative time}_t and 1 period sine = {cumulative time}_t).
<- ipcs_times %>%
time_features mutate(wkday = wday(Full_Date, label = T)
Mon = ifelse(wkday == "Mon", 1, 0)
, Tue = ifelse(wkday == "Tue", 1, 0)
, Wed = ifelse(wkday == "Wed", 1, 0)
, Thu = ifelse(wkday == "Thu", 1, 0)
, Fri = ifelse(wkday == "Fri", 1, 0)
, Sat = ifelse(wkday == "Sat", 1, 0)
, Sun = ifelse(wkday == "Sun", 1, 0)
, morning = ifelse(Hour >= 5 & Hour < 11, 1, 0)
, midday = ifelse(Hour >= 11 & Hour < 17, 1, 0)
, evening = ifelse(Hour >= 5 & Hour < 22, 1, 0)
, night = ifelse(Hour >= 22 & Hour < 5, 1, 0)) %>%
,
## sequential time differences for each persn
group_by(SID) %>%
mutate(tdif = as.numeric(difftime(ymd_hm(Full_Date), lag(ymd_hm(Full_Date)), units = "hours"))) %>%
filter(is.na(tdif) | tdif > 1) %>%
mutate(tdif = as.numeric(difftime(ymd_hm(Full_Date), lag(ymd_hm(Full_Date)), units = "hours"))
tdif = ifelse(is.na(tdif), 0, tdif)
, cumsumT = cumsum(tdif)) %>%
, ungroup() %>%
## timing variables
mutate(linear = as.numeric(scale(cumsumT))
quad = linear^2
, cub = linear^3
, sin1p = sin(((2*pi)/24)*cumsumT)
, sin2p = sin(((2*pi)/12)*cumsumT)
, cos1p = cos(((2*pi)/24)*cumsumT)
, cos2p = cos(((2*pi)/12)*cumsumT)
, %>%
)
## keep key variables and reshape
select(SID, Full_Date, Mon:night, linear:cos2p) %>%
pivot_longer(cols = c(-SID, -Full_Date)
names_to = "trait"
, values_to = "value") %>%
, mutate(category = "time"
facet = trait) ,
time_features
## # A tibble: 413,928 × 6
## SID Full_Date trait value category facet
## <chr> <chr> <chr> <dbl> <chr> <chr>
## 1 01 2018-10-22 15:23 Mon 1 time Mon
## 2 01 2018-10-22 15:23 Tue 0 time Tue
## 3 01 2018-10-22 15:23 Wed 0 time Wed
## 4 01 2018-10-22 15:23 Thu 0 time Thu
## 5 01 2018-10-22 15:23 Fri 0 time Fri
## 6 01 2018-10-22 15:23 Sat 0 time Sat
## 7 01 2018-10-22 15:23 Sun 0 time Sun
## 8 01 2018-10-22 15:23 morning 0 time morning
## 9 01 2018-10-22 15:23 midday 1 time midday
## 10 01 2018-10-22 15:23 evening 1 time evening
## # … with 413,918 more rows
2.2.5 Combine Features
Now, let’s bring the personality, affect/situation/DIAMONDS, and timing features together.
<- bfi_long %>%
all_features full_join(features) %>%
full_join(ipcs_times %>% select(SID, Full_Date)) %>%
full_join(time_features) %>%
arrange(SID, category, trait, facet, Full_Date)
all_features
## # A tibble: 871,011 × 6
## SID Full_Date trait facet value category
## <chr> <chr> <chr> <chr> <dbl> <chr>
## 1 01 2018-10-22 15:23 afraid afraid 2 Affect
## 2 01 2018-10-22 23:23 afraid afraid 3 Affect
## 3 01 2018-10-22 23:25 afraid afraid 3 Affect
## 4 01 2018-10-23 17:50 afraid afraid 4 Affect
## 5 01 2018-10-23 19:39 afraid afraid 2 Affect
## 6 01 2018-10-24 0:00 afraid afraid 2 Affect
## 7 01 2018-10-24 11:44 afraid afraid 3 Affect
## 8 01 2018-10-24 15:37 afraid afraid 2 Affect
## 9 01 2018-10-24 20:46 afraid afraid 3 Affect
## 10 01 2018-10-25 21:07 afraid afraid 4 Affect
## # … with 871,001 more rows
2.3 Setup for Idiographic Machine Learning Models
The last step is the most important. We need to: Separate the data for each outcome, participant, and feature set combination. In addition, the outcomes need to be lagged such that same time point features will be predicting “future” behavior. Moreover, the data must be split into training (first 75%) and test sets (last 25%). As we do this, we will also remove participants who have no variance in the outcome in either training or test sets as we can’t (statistically) predict things without variance (even if no variance suggests a good prediction!).
The feature sets are as follows:
- Psychological: Big Five (BFI-2)
- Psychological: Affect
- Psychological: Big Five + Affect
- Situations: Binary
- Situations: DIAMONDS
- Situations: Binary + DIAMONDS
- Full: Big Five + Affect + Binary + DIAMONDS
Each of these will be tested with and without the timing features for a total number of 14 feature sets.
For now, I’m not going to run the chunk below because it takes a long time. All the resulting files can be found in the online materials:
- 04-data/02-raw-data: data before being split into training and test
- 04-data/03-train-data: training data for each participant x outcome x feature set combination (14)
- 04-data/04-test-data: test data for each participant x outcome x feature set combination (14)
<- function(d, group, set, outcome, SID, time){
save_fun print(paste(SID, outcome, group, set, time))
<- initial_time_split(d, prop = 0.75)
d_split <- training(d_split)
d_train <- testing(d_split)
d_test if(any(table(d_train$o_value) < 2) | sd(d_train$o_value) == 0) {
return(NA) # no variance == can't use that participant
else {
} <- d_train %>%
d_train mutate_at(vars(one_of(dummy_vars)), train_fun) %>%
mutate_at(vars(one_of(c(dummy_vars, time_dummy))), factor)
<- F # this is indexing if there were any other issues or concerns to be aware of
ret
}if(length(unique(d_test$o_value[!is.na(d_test$o_value)])) == 1){
<- d_test %>%
d_test mutate_at(vars(one_of(dummy_vars)), test_fun) %>%
mutate_at(vars(one_of(c(dummy_vars, time_dummy))), factor)
<- c(ret, T)
ret else {
} <- d_test %>%
d_test mutate_at(vars(one_of(c(dummy_vars, time_dummy))), factor)
<- c(ret, F)
ret
}<- d_train %>% full_join(d_test) %>% arrange(Full_Date)
d <- initial_time_split(d, prop = 0.75)
d_split <- training(d_split)
d_train <- testing(d_split)
d_test save(d, file = sprintf("%s/04-data/02-model-data/%s_%s_%s_%s_%s.RData"
, res_path, SID, outcome, group, set, time))save(d_train, file = sprintf("%s/04-data/03-train-data/%s_%s_%s_%s_%s.RData"
, res_path, SID, outcome, group, set, time))save(d_test, d_split, file = sprintf("%s/04-data/04-test-data/%s_%s_%s_%s_%s.RData"
, res_path, SID, outcome, group, set, time))# return(T)
if(any(ret == T)) ret <- T else ret <- F
return(ret)
# }
}
<- function(x){if(is.numeric(x)){diff(range(x, na.rm = T)) %in% 1:2} else{F}}
factor_fun
<- c("o_value", "argument", "interacted", "lostSmthng"
dummy_vars "late", "frgtSmthng", "brdSWk", "excSWk", "AnxSWk"
, "tired", "sick", "sleeping", "class"
, "music", "internet", "TV", "study", "prcrst", "lonely")
, <- c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"
time_dummy "morning", "midday", "evening", "night")
,
<- function(group, set, outcome, time){
data_fun if(set != "all") groups <- set
if(set == "all") {
if(group == "psychological") groups <- c("BFI-2", "Affect")
else if(group == "situations") groups <- c("S8-I", "sit")
else groups <- c("BFI-2", "Affect", "S8-I", "sit")
}if(time == "time") groups <- c(groups, "time")
<- all_features %>%
out filter(trait == outcome) %>%
select(SID, Full_Date, value) %>%
group_by(SID) %>%
mutate(o_value = lead(value)) %>%
ungroup() %>%
select(-value)
<- all_features %>%
d filter(category %in% groups & trait != outcome) %>%
mutate(name = ifelse(category == "BFI-2", paste(trait, facet, sep = "_"), trait)) %>%
select(SID, Full_Date, name, value) %>%
distinct() %>%
pivot_wider(names_from = "name"
values_from = "value") %>%
, full_join(out) %>%
filter(complete.cases(.))
%>%
d group_by(SID) %>%
filter(n() >= 40) %>%
nest() %>%
ungroup() %>%
mutate(data = pmap(list(data, group, set, outcome, SID, time), possibly(save_fun, NA_real_))) %>%
unnest(data)
}
<- tribble(
nested_data ~group, ~set,
"psychological", "BFI-2" ,
"psychological", "Affect",
"psychological", "all" ,
"situations" , "sit" ,
"situations" , "S8-I" ,
"situations" , "all" ,
"full" , "all" ,
%>%
) full_join(crossing(group = c("psychological", "situations", "full")
time = c("no time", "time")
, outcome = c("prcrst", "lonely", "sick", "tired", "argument", "interacted", "study"))) %>%
, filter(time == "time") %>%
mutate(data = pmap(list(group, set, outcome, time), possibly(data_fun, NA_real_)))
2.4 Demographics
load(url(sprintf("%s/04-data/01-raw-data/cleaned_combined_2020-05-06.RData", res_path)))
<- baseline %>%
dem select(SID:race) %>%
mutate(age = year(ymd_hms(StartDate)) - as.numeric(YOB),
StartDate = as.Date(ymd_hms(StartDate)),
race = factor(race, 0:3, c("White", "Black", "Asian", "Other"))) %>%
select(-YOB)
<- all_features %>%
prelim_dem filter(category %in% c("Affect", "BFI-2", "sit", "SI-8")) %>%
group_by(SID, Full_Date, trait, facet, category) %>%
summarize(value = mean(value, na.rm = T)) %>%
ungroup() %>%
pivot_wider(names_from = c("category", "trait", "facet")
values_from = value) %>%
, filter(complete.cases(.))
%>%
prelim_dem group_by(SID) %>%
tally() %>%
ungroup() %>%
left_join(dem) %>%
summarize(N = length(unique(SID)),
n = sprintf("%.2f (%.2f; %i-%i", mean(n), sd(n), min(n), max(n)),
gender = sprintf("%i (%.2f%%)",sum(gender == "Female", na.rm = T), sum(gender == "Female", na.rm = T)/n()*100),
age = sprintf("%.2f (%.2f)", mean(age, na.rm = T), sd(age, na.rm = T)),
white = sprintf("%i (%.2f%%)"
sum(race == "White", na.rm = T)
, sum(race == "White", na.rm = T)/n()*100),
, black = sprintf("%i (%.2f%%)"
sum(race == "Black", na.rm = T)
, sum(race == "Black", na.rm = T)/n()*100),
, asian = sprintf("%i (%.2f%%)"
sum(race == "Asian", na.rm = T)
, sum(race == "Asian", na.rm = T)/n()*100),
, other = sprintf("%i (%.2f%%)"
sum(race == "Other", na.rm = T)
, sum(race == "Other", na.rm = T)/n()*100),
, StartDate = sprintf("%s (%s - %s)", median(StartDate),
min(StartDate), max(StartDate)))
## # A tibble: 1 × 9
## N n gender age white black asian other StartDate
## <int> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 199 42.23 (24.01; 1-158 144 (70.59%) 19.52 (1.24) 65 (31.86%) 30 (14.71%) 60 (29.41%) 28 (13.73%) NA (NA - NA)
<- prelim_dem %>%
final_dem group_by(SID) %>%
filter(n() >= 40) %>%
tally() %>%
ungroup() %>%
left_join(dem)
unique(ldply(str_split(list.files(sprintf("%s/05-results/01-glmnet/01-tuning-models", res_path)), pattern = "_"), function(x) x[1]))$V1
## NULL
%>%
final_dem filter(SID %in% unique(ldply(str_split(list.files(sprintf("%s/05-results/01-glmnet/01-tuning-models", local_path)), pattern = "_"), function(x) x[1]))$V1) %>%
summarize(N = length(unique(SID)),
n = sprintf("%.2f (%.2f; %i-%i", mean(n), sd(n), min(n), max(n)),
gender = sprintf("%i (%.2f%%)",sum(gender == "Female", na.rm = T), sum(gender == "Female", na.rm = T)/n()*100),
age = sprintf("%.2f (%.2f)", mean(age, na.rm = T), sd(age, na.rm = T)),
white = sprintf("%i (%.2f%%)"
sum(race == "White", na.rm = T)
, sum(race == "White", na.rm = T)/n()*100),
, black = sprintf("%i (%.2f%%)"
sum(race == "Black", na.rm = T)
, sum(race == "Black", na.rm = T)/n()*100),
, asian = sprintf("%i (%.2f%%)"
sum(race == "Asian", na.rm = T)
, sum(race == "Asian", na.rm = T)/n()*100),
, other = sprintf("%i (%.2f%%)"
sum(race == "Other", na.rm = T)
, sum(race == "Other", na.rm = T)/n()*100),
, StartDate = sprintf("%s (%s - %s)", median(StartDate),
min(StartDate), max(StartDate)))
## # A tibble: 1 × 9
## N n gender age white black asian other StartDate
## <int> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 113 57.70 (18.26; 40-158 85 (72.65%) 19.57 (1.29) 36 (30.77%) 17 (14.53%) 37 (31.62%) 18 (15.38%) NA (NA - NA)
rm(list = ls()[!ls() %in% c("codebook", "ipcs_codebook", "res_path", "local_path", "sheets", "outcomes", "ftrs")])