Chapter 2 Data Cleaning
2.1 Add Health
The National Study of Adolescent to Adult Health (Ad Health; Harris & Udry, 2018) is an ongoing longitudinal study of adolescents in the United States that began as a response to a federal mandate to better understand adolescent health. The data are available online at https://www.icpsr.umich.edu/icpsrweb/DSDR/studies/21600.
The initial sample of participants included approximately 20,000 students who completed at home administrations the study. Four waves of data collection (1994-1995, 1996, 2001-2002, and 2008) have been completed. The latest release contains data through 2008. Another wave of collection began in 2016 but has not yet been released. More documentation of the data are available at https://www.icpsr.umich.edu/icpsrweb/content/DSDR/add-health-data-guide.html#intro.
Sample sizes vary by year, from 14,738 (1996) to 20,745 (1994-1995). This provides 99% power to detect a correlation effect size of ~.03.
2.1.1 Load Data
addhealth_read_fun <- function(x){
sprintf("%s/data/addhealth/%s", wd, x) %>% haven::read_sav(.) %>% select(one_of(old.names)) %>% haven::zap_labels(.)
}
addhealth_codebook <- (codebook %>% filter(study == "Add Health"))$codebook[[1]] %>%
mutate(orig_itemname = str_to_upper(orig_itemname))
addhealth_codebook
## # A tibble: 533 x 16
## study dataset category name itemname wave year new_itemname orig_itemname description scale reverse_code recode mini maxi comp_rule
## <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <chr>
## 1 addhe… SCH match adopt… adopted SCH 1994 addhealth__m… S25 Adopted ".\tmissing… <NA> ifelse(… NA NA skip
## 2 addhe… SCH match age age SCH 1994 addhealth__m… S1 Age ".\tmissing… <NA> ifelse(… NA NA skip
## 3 addhe… SCH match alcoh… alc12mo SCH 1994 <NA> S59B Drank Alcoh… ".\tmissing… <NA> ifelse(… NA NA max
## 4 addhe… SCH match alcoh… alcohol SCH 1994 <NA> S49 Have Had Al… ".\tmissing… <NA> ifelse(… NA NA max
## 5 addhe… SCH match alcoh… drunk12mo SCH 1994 <NA> S59C Got Drunk i… ".\tmissing… <NA> ifelse(… NA NA max
## 6 addhe… SCH match bioPa… livingBi… SCH 1994 <NA> S26 Live with B… ".\tmissing… <NA> ifelse(… NA NA skip
## 7 addhe… SCH match birth… birthpla… SCH 1994 <NA> S8 Born in Uni… ".\tmissing… <NA> ifelse(… NA NA max
## 8 addhe… Parent match birth… borninUS Pare… 1995 <NA> PC64 Child was b… ".\tmissing… <NA> ifelse(… NA NA max
## 9 addhe… Parent match birth… bornUS Pare… 1995 <NA> PA3 Born in Uni… ".\tmissing… <NA> ifelse(… NA NA max
## 10 addhe… Parent match brthw… birthwei… Pare… 1995 <NA> PC19A_P Child's bir… ".\tmissing… <NA> ifelse(… NA NA sum
## # … with 523 more rows
old.names <- unique(addhealth_codebook$orig_itemname)
datasets <- sprintf("%s/data/addhealth", wd) %>% list.files(., pattern = ".sav")
addhealth <- tibble(datasets = datasets) %>%
mutate(data = map(datasets, addhealth_read_fun),
ncol = map(data, ncol)) %>%
unnest(ncol)
addhealth_long <- addhealth %>%
filter(ncol > 1) %>%
mutate(data = map(data, ~(.) %>% gather(key = orig_itemname, value = value, -AID))) %>%
select(-ncol, -datasets) %>%
unnest(data) %>%
rename(SID = AID)
save(addhealth, file = sprintf("%s/data/clean/addhealth_raw.RData", wd))
2.1.2 Matching Variables
# rename variables
# rename variables
addhealth_match <- addhealth_codebook %>%
filter(category == "match") %>%
select(name, itemname, wave, year, orig_itemname, reverse_code:comp_rule) %>%
group_by(name) %>%
nest() %>%
ungroup() %>%
mutate(data = map(data, ~(.) %>% left_join(addhealth_long))) %>%
unnest()
yrBrth <- addhealth_match %>%
filter(name == "yearBrth") %>%
mutate(yearBrth = value + 1900) %>%
select(SID, yearBrth)
# recode
recode_fun <- function(rule, y, year){
yearBrth <- y$yearBrth
x <- y$value
if(!is.na(rule)){y$value <- eval(parse(text = rule))}
return(y)
}
addhealth_match <- addhealth_match %>%
filter(name != "age") %>%
left_join(yrBrth) %>%
group_by(recode, year) %>%
nest() %>%
ungroup() %>%
mutate(data = pmap(list(recode, data, year), recode_fun)) %>%
unnest(data)
# reverse code
addhealth_match <- addhealth_match %>%
mutate(value = ifelse(reverse_code == "no" | is.na(reverse_code), value,
reverse.code(-1, value, mini = mini, maxi = maxi)))
fun_call <- function(x, rule){
switch(rule,
average = mean(x, na.rm = T),
mode = Mode(x)[1],
sum = sum(x, na.rm = T),
skip = unique(x)[1],
max = max(x, na.rm = T),
min = min(x, na.rm = T))
}
# compositing within years
year_comp_fun <- function(df, rule){
df %>%
# group by person and item (collapse across age)
group_by(SID, yearBrth, name, year) %>%
summarize(value = fun_call(value, rule)) %>%
ungroup() %>%
mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
}
addhealth_waves <- p_waves %>% filter(Study == "Add Health") %>% select(Used) %>% distinct()
addhealth_match <- addhealth_match %>%
filter(year <= max(addhealth_waves$Used)) %>%
mutate(comp_rule = ifelse(is.na(comp_rule), "skip", comp_rule),
comp_rule = ifelse(comp_rule == "mx", "max", comp_rule)) %>%
group_by(comp_rule) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(data, comp_rule, year_comp_fun)) %>%
unnest(data)
comp_fun <- function(rule, p_year){
addhealth_match %>%
filter((year <= 1996 | year <= p_year) & comp_rule == rule) %>%
group_by(SID, yearBrth, name) %>%
summarize(value = fun_call(value, rule)) %>%
ungroup() %>%
distinct()
}
addhealth_match <- crossing(
p_year = addhealth_waves$Used,
comp_rule = unique(addhealth_match$comp_rule)
) %>%
mutate(data = map2(comp_rule, p_year, comp_fun)) %>%
unnest(data) %>%
distinct() %>%
mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value)) %>%
select(-comp_rule) %>%
spread(name, value)
addhealth_match
## # A tibble: 19,515 x 120
## p_year SID physhlthevnt A C DEP E IQ LOC N `NA` PA SE SWL adopted ageMarried alcohol bioParinHH birthplace
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1995 5710… 0 3.5 3.5 0.5 NA 28 2 NA 0.5 0 5 NA 0 NA 1 NA 1
## 2 1996 5710… 0 NA NA NA NA NA NA NA NA NA NA NA 0 NA 1 NA 1
## 3 2001 5710… NA NA NA 1.75 NA NA NA NA 2 NA 4.75 3 0 NA 1 NA 1
## 4 1995 5710… 0 3 2.75 0.611 NA 59.5 2 NA 0.5 0 3.83 NA 0 NA 1 NA 1
## 5 1996 5710… 0 NA NA 0.167 4 NA 3.5 3 0 0 5 NA 0 NA 1 NA 1
## 6 2001 5710… NA NA NA 1.25 NA NA NA NA 1 NA 3.5 4 0 NA 1 NA 1
## 7 1995 5710… 0 4.5 3 0.0556 NA 79 3 NA 0 0 4.83 NA 0 NA 0 NA 1
## 8 1996 5710… 0 NA NA NA NA NA NA NA NA NA NA NA 0 NA 0 NA 1
## 9 2001 5710… 0 NA NA NA NA NA NA NA NA NA NA NA 0 NA 0 NA 1
## 10 1995 5710… 0 4.5 2 1.17 NA 57 5 NA 0.5 1 4.67 NA NA NA 0 NA 1
## # … with 19,505 more rows, and 101 more variables: brthweight <dbl>, childFrnd <dbl>, childHlthProb <dbl>, childTemper <dbl>, childTrustwrthy <dbl>,
## # chldAgeHlthProb <dbl>, chldHlthIns <dbl>, chldHlthProb <dbl>, clubs <dbl>, dadAlc <dbl>, dadBornUS <dbl>, dadCare <dbl>, dadEdu <dbl>,
## # dadEmployed <dbl>, dadHealthProb <dbl>, dadInHH <dbl>, date <dbl>, dentist <dbl>, disability <dbl>, drugs <dbl>, drVisits <dbl>,
## # durBreastFeed <dbl>, employed <dbl>, everLiveBioDad <dbl>, everLiveBioMom <dbl>, exercise <dbl>, expAIDS <dbl>, expDie25 <dbl>,
## # expGradCollege <dbl>, expInc <dbl>, expLive35 <dbl>, expMarried25 <dbl>, extracurricular <dbl>, friendship <dbl>, frndsGoodInf <dbl>, gender <dbl>,
## # grade <dbl>, grades <dbl>, grsWages <dbl>, height <dbl>, HHsize <dbl>, hlthProb <dbl>, hlthSymp <dbl>, involvedSchWrk <dbl>, liveMom <dbl>,
## # momAlc <dbl>, momAlive <dbl>, momBornUS <dbl>, momCare <dbl>, momEdu <dbl>, momEmployed <dbl>, momHealthProb <dbl>, momInHH <dbl>,
## # moneyForBills <dbl>, nbhdQual <dbl>, numMarriages <dbl>, numSiblng <dbl>, ParDisabled <dbl>, parDivorce <dbl>, parExpGrad <dbl>,
## # parMarriedAge <dbl>, parReligFreq <dbl>, parReligImp <dbl>, parReligion <dbl>, parReligScript <dbl>, parRelSat <dbl>, parRepHlth <dbl>,
## # parSmokes <dbl>, parTalkSex <dbl>, ParTalkSex <dbl>, parUnemployBen <dbl>, physFunc <dbl>, PhysFunc <fct>, prntAlcohol <dbl>, prntWelfare <dbl>,
## # race <dbl>, recoverFast <dbl>, relChld <dbl>, religFreq <dbl>, religImp <dbl>, religion <dbl>, religScript <dbl>, sameHouse <dbl>, satHH <dbl>,
## # satSchl <dbl>, school <dbl>, skipClass <dbl>, smokes <dbl>, sports <dbl>, SRhealth <dbl>, twin <dbl>, Understandchild <dbl>, weapon <dbl>,
## # weed <dbl>, weight <dbl>, yearBrth <dbl>, yearMoveUS <dbl>, yearsNoDad <dbl>, yearsNoMom <dbl>, age <dbl>, …
2.1.3 Personality Variables
addhealth_pers <- addhealth_codebook %>%
filter(category == "pers") %>%
select(name, itemname, wave, year, orig_itemname, reverse_code:comp_rule) %>%
group_by(name) %>%
nest() %>%
ungroup() %>%
mutate(data = map(data, ~(.) %>% left_join(addhealth_long))) %>%
unnest(data) %>%
distinct() %>%
filter(year %in% c(addhealth_waves$Used))
recode_fun <- function(rule, y){
x <- y$value
if(!is.na(rule)){y$value <- eval(parse(text = rule))}
return(y)
}
# recode
addhealth_pers <- addhealth_pers %>%
group_by(recode) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(recode, data, recode_fun)) %>%
unnest(data)
# reverse code
addhealth_pers <- addhealth_pers %>%
mutate(value = ifelse(tolower(reverse_code) == "no" | is.na(reverse_code), value,
reverse.code(-1, value, mini = mini, maxi = maxi)))
# alpha's
addhealth_alpha <- addhealth_pers %>%
select(name, itemname, year, SID, value) %>%
group_by(name, year) %>%
nest() %>%
mutate(data = map(data, ~(.) %>% pivot_wider(names_from = itemname, values_from = value, values_fn = list(value = mean))),
alpha = map(data, possibly(~psych::alpha((.) %>% select(-SID)), NA_real_)))
# create composites
addhealth_pers <- addhealth_pers %>%
group_by(SID, name, year) %>%
summarize(value = mean(value, na.rm = T)) %>%
ungroup()
## # A tibble: 105,398 x 4
## SID name year value
## <chr> <chr> <dbl> <dbl>
## 1 57100270 A 1995 3.5
## 2 57100270 C 1995 3.5
## 3 57100270 DEP 1995 0.5
## 4 57100270 DEP 2001 1.75
## 5 57100270 IQ 1995 28
## 6 57100270 LOC 1995 2
## 7 57100270 NA 1995 0.5
## 8 57100270 NA 2001 2
## 9 57100270 PA 1995 0
## 10 57100270 SE 1995 5
## # … with 105,388 more rows
2.1.4 Outcome Variables
# missing all edu 2008 variables
addhealth_out <- addhealth_codebook %>%
filter(category == "out") %>%
select(name, itemname, wave, year, orig_itemname, reverse_code:comp_rule) %>%
left_join(addhealth_long) %>%
distinct() %>%
full_join(crossing(p_year = addhealth_waves$Used, name = unique((.)$name)))
# recode
recode_fun <- function(rule, y, p_year, year){
yearBrth <- y$yearBrth
x <- y$value
if(!is.na(rule)){y$value <- eval(parse(text = rule))}
return(y)
}
addhealth_out <- addhealth_out %>%
left_join(yrBrth) %>%
select(-orig_itemname) %>%
group_by(recode, year, p_year) %>%
nest() %>%
ungroup() %>%
mutate(year = as.numeric(year),
data = pmap(list(recode, data, p_year, year), recode_fun)) %>%
unnest(data) %>%
distinct()
# composite within years
# compositing within years
addhealth_out <- addhealth_out %>%
group_by(SID, name, year, p_year) %>%
summarize(value = max(value)) %>%
ungroup() %>%
mutate(value = ifelse(is.nan(value) | is.infinite(value), NA, value))
addhealth_out <- addhealth_out %>%
mutate(group = ifelse(year > p_year, "future", "past")) %>%
group_by(SID, name, group, p_year) %>%
summarize(value = max(value)) %>%
ungroup() %>%
mutate(value = ifelse(is.infinite(value), NA, value)) %>%
pivot_wider(names_from = group, values_from = value) %>%
group_by(SID, name) %>%
mutate(value = ifelse(is.na(past) | (past == 0 & !is.na(future)), future,
ifelse(past == 0 & is.na(future), past,
ifelse(past == 1, NA, NA)))) %>%
ungroup() %>%
mutate(value = ifelse(name == "crim" & is.na(value), 0, value))
## # A tibble: 198,117 x 4
## SID name p_year value
## <chr> <chr> <dbl> <dbl>
## 1 57100270 chldbrth 1995 0
## 2 57100270 chldbrth 1996 0
## 3 57100270 chldbrth 2001 0
## 4 57100270 crim 1995 0
## 5 57100270 crim 1996 0
## 6 57100270 crim 2001 0
## 7 57100270 divorced 1995 0
## 8 57100270 divorced 1996 0
## 9 57100270 divorced 2001 0
## 10 57100270 edu 1995 0
## # … with 198,107 more rows
2.1.5 Covariates
addhealth_match <- addhealth_pers %>%
spread(name, value) %>%
rename(p_year = year) %>%
full_join(addhealth_match)
addhealth_match <- addhealth_match %>%
mutate(age = p_year - yearBrth,
height = height*2.54/100,
weight = weight/2.2/2.2,
BMI = weight / (height)^2,
BMI = ifelse(is.infinite(BMI), NA, BMI))
addhealth_match <- addhealth_out %>%
filter(name == "physhlthevnt") %>%
select(p_year, SID, physhlthevnt = past) %>%
full_join(addhealth_match) %>%
mutate_at(vars(drugs, weed), ~ifelse(is.na(.), 0, .))
# select(-yearBrth, -contains("dad"), -contains("mom"), -religion,
# -gender, -parDivorce, -numSiblng, -religion, -race, -SRhealth) %>%
# full_join(addhealth_dem)
addhealth_out <- addhealth_out %>%
select(-past, -future) %>%
distinct()
unique(specifications$name)[!unique(specifications$name) %in% colnames(addhealth_match2)]
addhealth_SCA <- addhealth_match %>%
select(SID, p_year, one_of(c(unique(specifications$name), "momOccPrstg", "dadOccPrstg", "momEdu", "dadEdu"))) %>%
group_by(SID) %>%
mutate(parEdu = max(cbind(momEdu, dadEdu), na.rm = T)) %>%
mutate_at(vars(parEdu), ~ifelse(is.infinite(.), NA, .)) %>%
ungroup() %>%
select(-momEdu, -dadEdu)
## # A tibble: 19,515 x 16
## SID p_year age gender grsWages race physhlthevnt SRhealth smokes alcohol exercise BMI parDivorce PhysFunc religion parEdu
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl>
## 1 57100270 1995 18 1 55000 2 0 4 1 1 2 7.58 0 1 2 1
## 2 57100270 1996 19 1 55000 2 0 4 1 1 2 7.58 0 1 2 1
## 3 57100270 2001 24 1 55000 2 NA 4 1 1 2 7.58 0 1 2 1
## 4 57101310 1995 19 1 NA 2 0 NA 1 1 1 3.64 NA 1 0 0
## 5 57101310 1996 20 1 NA 2 0 NA 1 1 1 3.64 NA 1 0 0
## 6 57101310 2001 25 1 NA 2 NA NA 1 1 1 3.64 NA 1 0 0
## 7 57103171 1995 16 0 45000 0 0 3 0 0 2 NA 0 1 1 0
## 8 57103171 1996 17 0 45000 0 0 3 0 0 2 NA 0 1 1 0
## 9 57103171 2001 22 0 45000 0 0 3 0 0 2 NA 0 1 1 0
## 10 57103869 1995 18 0 9000 NA 0 1 NA 0 NA 5.38 0 1 1 NA
## # … with 19,505 more rows
2.1.6 Imputation
# short helper functions to (1) identify and create factors, save col names of factors, remove columns with all missing values, and setup amelia
factor_fun <- function(x){if(is.numeric(x)){diff(range(x, na.rm = T)) %in% 1:2} else{F}}
not_all_na <- function(x) any(!is.na(x))
mice_fun <- function(df){
df <- df %>% select(-bioParinHH, -ageMarried, -chldAgeHlthProb, -childHlthProb, -dadInHH,-momInHH, -parReligion, -liveMom, -yearsNoMom, -yearsNoDad, -yearMoveUS)
mice(df, m = 5, maxit=5, printFlag=TRUE)
}
start <- Sys.time()
addhealth_match_imp <- addhealth_match %>%
rename(NegAff = `NA`) %>%
mutate_if(factor_fun, as.factor) %>%
group_by(p_year) %>%
nest() %>%
ungroup() %>%
mutate(data = map(data, ~(.) %>% select_if(not_all_na)),
imp = map(data, mice_fun))
beepr::beep(sound = 8)
print(Sys.time() - start)
addhealth_match_imp <- addhealth_match_imp %>%
mutate(imp_df = map(imp, ~mice::complete(., "long")),
imp_df = map(imp_df, ~(.) %>% select(-PhysFunc) %>% mutate(PhysFunc = ifelse(physFunc > 0, 1, 0))),
imp_df = map(imp_df, ~(.) %>%
group_by(SID) %>%
mutate(parEdu = max(cbind(momEdu, dadEdu), na.rm = T)) %>%
ungroup() %>% select(-momEdu, -dadEdu)),
imp_sca = map(imp_df, ~(.) %>% filter(.imp == 1) %>% select(SID, one_of(unique(specifications$name)))))
addhealth_match_imp_long <- addhealth_match_imp %>%
select(p_year, imp_df) %>%
unnest(imp_df)
addhealth_SCA_imp <- addhealth_match_imp %>%
select(p_year, imp_sca) %>%
unnest(imp_sca) %>%
left_join(addhealth_SCA %>% select(p_year, SID,
colnames(addhealth_SCA)[!colnames(addhealth_SCA) %in% colnames(.)])) %>%
mutate(education = factor(0), married = factor(0), numKids = 0, PhysFunc = factor(PhysFunc))
save(addhealth_match_imp_long, addhealth_SCA_imp,
file = sprintf("%s/data/imputed/addhealth_imputed_small.RData", wd))
save(addhealth_match_imp,
file = sprintf("%s/data/imputed/addhealth_imputed.RData", wd))
save(addhealth_alpha, addhealth_pers, addhealth_out, addhealth_match, addhealth_SCA,
file = sprintf("%s/data/clean/addhealth_cleaned.RData", wd))
save(addhealth_long, file = sprintf("%s/data/clean/addhealth_raw_long.RData", wd))
rm(list =ls()[grepl("addhealth", ls())])
2.2 US
2.2.1 Load Data
us_read_fun <- function(Year, WL){
old.names <- (us_codebook %>% filter(year == Year | year == 0))$orig_itemname
keep <- c("pidp", sprintf("%s_hidp", WL))
sprintf("%s/data/us/%s_indresp.sav", wd, WL) %>% haven::read_sav(.) %>%
select(one_of(old.names)) %>%
haven::zap_labels(.) %>%
gather(key = orig_itemname, value = value, -one_of(keep), na.rm = T) %>%
set_names(c("HHID", "PID", "orig_itemname", "value")) %>%
mutate(value = as.numeric(value))
}
us_codebook <- (codebook %>% filter(study == "US"))$codebook[[1]] %>%
mutate(orig_itemname = str_to_lower(orig_itemname))
us_codebook
## # A tibble: 235 x 17
## study dataset category name itemname wave waveletter year new_itemname orig_itemname description scale reverse_code recode mini maxi comp_rule
## <chr> <chr> <chr> <chr> <chr> <dbl> <chr> <dbl> <chr> <chr> <chr> <chr> <lgl> <chr> <lgl> <lgl> <chr>
## 1 US xwavedat out chld… chldbrth 0 <NA> 0 US__out_chl… ch1by_dv Can you ple… Year… NA ifelse… NA NA max
## 2 US a_INDRE… out chld… chldbrth 1 a 2009 US__out_chl… a_ch1by4 Can you ple… Year… NA ifelse… NA NA max
## 3 US b_INDRE… out chld… chldbrth 2 b 2010 US__out_chl… b_ch1by4 Can you ple… Year… NA ifelse… NA NA max
## 4 US c_INDRE… out chld… chldbrth 3 c 2011 US__out_chl… c_ch1by4 Can you ple… Year… NA ifelse… NA NA max
## 5 US d_INDRE… out chld… chldbrth 4 d 2012 US__out_chl… d_ch1by4 Can you ple… Year… NA ifelse… NA NA max
## 6 US e_INDRE… out chld… chldbrth 5 e 2013 US__out_chl… e_ch1by4 Can you ple… Year… NA ifelse… NA NA max
## 7 US f_INDRE… out chld… chldbrth 6 f 2014 US__out_chl… f_ch1by4 Can you ple… Year… NA ifelse… NA NA max
## 8 US g_INDRE… out chld… chldbrth 7 g 2015 US__out_chl… g_ch1by4 Can you ple… Year… NA ifelse… NA NA max
## 9 US h_INDRE… out chld… chldbrth 8 h 2016 US__out_chl… h_ch1by4 Can you ple… Year… NA ifelse… NA NA max
## 10 US i_INDRE… out chld… chldbrth 9 i 2017 US__out_chl… i_ch1by4 Can you ple… Year… NA ifelse… NA NA max
## # … with 225 more rows
us_xwaveid <- sprintf("%s/data/us/xwaveid.sav", wd) %>% read_sav() %>%
filter(hhorig %in% c(3:6))
us <- us_codebook %>%
select(wave, waveletter, year) %>%
distinct() %>%
filter(year != 0) %>%
mutate(data = map2(year, waveletter, us_read_fun))
us_long <- us %>%
unnest(data)
save(us, file = sprintf("%s/data/clean/us_raw.RData", wd))
rm(us)
2.2.2 Outcome Variables
bhps_waves <- p_waves %>% filter(Study == "BHPS") %>% select(Used) %>% distinct()
us_out <- us_codebook %>%
select(-new_itemname) %>%
filter(category == "out") %>%
# full_join(crossing(name = unique((.)$name), p_year = bhps_waves$Used)) %>%
left_join(us_long)
# recode
recode_fun <- function(rule, y){
print(rule)
x <- y$value
if(!is.na(rule)){y$value <- eval(parse(text = rule))}
return(y)
}
us_out <- us_out %>%
select(name, itemname, year, reverse_code:comp_rule, PID, value) %>%
group_by(recode) %>%
nest() %>%
ungroup() %>%
mutate(data = pmap(list(recode, data), recode_fun)) %>%
unnest(data)
## # A tibble: 9,282,789 x 10
## recode name itemname year reverse_code mini maxi comp_rule PID value
## <chr> <chr> <chr> <dbl> <lgl> <lgl> <lgl> <chr> <dbl> <dbl>
## 1 ifelse(x < 0 & !is.na(x), 0, ifelse(x > 2008, 1, NA)) chldbrth chldbrth 2009 NA NA NA max 68001367 0
## 2 ifelse(x < 0 & !is.na(x), 0, ifelse(x > 2008, 1, NA)) chldbrth chldbrth 2009 NA NA NA max 68004087 0
## 3 ifelse(x < 0 & !is.na(x), 0, ifelse(x > 2008, 1, NA)) chldbrth chldbrth 2009 NA NA NA max 68006127 0
## 4 ifelse(x < 0 & !is.na(x), 0, ifelse(x > 2008, 1, NA)) chldbrth chldbrth 2009 NA NA NA max 68006135 0
## 5 ifelse(x < 0 & !is.na(x), 0, ifelse(x > 2008, 1, NA)) chldbrth chldbrth 2009 NA NA NA max 68006807 0
## 6 ifelse(x < 0 & !is.na(x), 0, ifelse(x > 2008, 1, NA)) chldbrth chldbrth 2009 NA NA NA max 68007487 0
## 7 ifelse(x < 0 & !is.na(x), 0, ifelse(x > 2008, 1, NA)) chldbrth chldbrth 2009 NA NA NA max 68007491 0
## 8 ifelse(x < 0 & !is.na(x), 0, ifelse(x > 2008, 1, NA)) chldbrth chldbrth 2009 NA NA NA max 68007495 0
## 9 ifelse(x < 0 & !is.na(x), 0, ifelse(x > 2008, 1, NA)) chldbrth chldbrth 2009 NA NA NA max 68007499 0
## 10 ifelse(x < 0 & !is.na(x), 0, ifelse(x > 2008, 1, NA)) chldbrth chldbrth 2009 NA NA NA max 68008167 0
## # … with 9,282,779 more rows
2.3 BHPS
The British Household Panel Study (BHPS; University of Essex, 2018) is a longitudinal study of households in the United Kingdom. These data are available online, through application, from https://www.iser.essex.ac.uk/bhps/about/latest-release-of-bhps-data.
Participants were recruited from more than 15,000 individuals from approximately 8,000 households in the United Kingdom. Data have been collected annually since 1991 from approximately 10,000 individuals (5,500 households) in Great Britain but expanded to include Scotland and Wales in 1999 and Northern Ireland in 2001. In 2010, the BHPS stopped data collection, but 6,700 of the current 8,000 participants were solicited to become part of the broader Understanding Society study (University of Essex, 2019). Participants can be matched across studies, so I will use additional data on the original BHPS participants from the Understanding Society study for additional waves of outcome data.
Sample sizes vary by year, ranging from 10,264 (1991) to 14419 (2008). This provides 99% power to detect a zero-order correlation effect size of ~.05, two-tailed at alpha .05.
2.3.1 Load Data
bhps_read_fun <- function(Year, WL){
old.names <- (bhps_codebook %>% filter(year == Year | year == 0))$orig_itemname
keep <- c("pidp", sprintf("%shid", WL))
sprintf("%s/data/bhps/%sindresp.sav", wd, WL) %>% haven::read_sav(.) %>%
full_join(sprintf("%s/data/bhps/%segoalt.sav", wd, WL) %>% haven::read_sav(.)) %>%
full_join(sprintf("%s/data/bhps/%shhresp.sav", wd, WL) %>% haven::read_sav(.)) %>%
full_join(sprintf("%s/data/bhps/%sincome.sav", wd, WL) %>% haven::read_sav(.)) %>%
full_join(sprintf("%s/data/bhps/%sjobhist.sav", wd, WL) %>% haven::read_sav(.)) %>%
select(one_of(keep), one_of(old.names)) %>%
haven::zap_labels(.) %>%
gather(key = orig_itemname, value = value, -one_of(keep), na.rm = T) %>%
set_names(c("PID", "HHID", "orig_itemname", "value")) %>%
mutate(value = as.numeric(value))
}
bhps_codebook <- (codebook %>% filter(study == "BHPS"))$codebook[[1]] %>%
mutate(orig_itemname = str_to_lower(orig_itemname))
bhps_codebook
## # A tibble: 1,878 x 23
## study dataset category name itemname wave waveletter year new_itemname orig_itemname description scale reverse_code recode mini maxi comp_rule
## <chr> <chr> <chr> <chr> <chr> <dbl> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <chr>
## 1 bhps aINDRE… match acci… numAcci… 1 a 1991 bhps__match… anxdts No. of ser… "Val… <NA> ifels… NA NA max
## 2 bhps bINDRE… match acci… numAcci… 2 b 1992 bhps__match… bnxdts No. of ser… "Val… <NA> ifels… NA NA max
## 3 bhps cINDRE… match acci… numAcci… 3 c 1993 bhps__match… cnxdts No. of ser… "Val… <NA> ifels… NA NA max
## 4 bhps dINDRE… match acci… numAcci… 4 d 1994 bhps__match… dnxdts No. of ser… "Val… <NA> ifels… NA NA max
## 5 bhps eINDRE… match acci… numAcci… 5 e 1995 bhps__match… enxdts No. of ser… "Val… <NA> ifels… NA NA max
## 6 bhps fINDRE… match acci… numAcci… 6 f 1996 bhps__match… fnxdts No. of ser… "Val… <NA> ifels… NA NA max
## 7 bhps gINDRE… match acci… numAcci… 7 g 1997 bhps__match… gnxdts No. of ser… "Val… <NA> ifels… NA NA max
## 8 bhps hINDRE… match acci… numAcci… 8 h 1998 bhps__match… hnxdts No. of ser… "Val… <NA> ifels… NA NA max
## 9 bhps iINDRE… match acci… numAcci… 9 i 1999 bhps__match… inxdts No. of ser… "Val… <NA> ifels… NA NA max
## 10 bhps jINDRE… match acci… numAcci… 10 j 2000 bhps__match… jnxdts No. of ser… "Val… <NA> ifels… NA NA max
## # … with 1,868 more rows, and 6 more variables: ...18 <chr>, ...19 <chr>, ...20 <chr>, ...21 <chr>, ...22 <chr>, ...23 <chr>
bhps_xwaveid <- sprintf("%s/data/bhps/xwaveid.sav", wd) %>% read_sav()
bhps <- bhps_codebook %>%
select(wave, waveletter, year) %>%
distinct() %>%
filter(year != 0) %>%
mutate(data = map2(year, waveletter, bhps_read_fun))
bhps_long <- bhps %>%
unnest(data) %>%
rename(SID = PID) %>%
distinct()
save(bhps, file = sprintf("%s/data/clean/bhps_raw.RData", wd))
rm(bhps)
2.3.2 Matching Variables
recode_fun <- function(rule, y, year){
print(rule)
yearBrth <- y$yearBrth
x <- y$value
if(!is.na(rule)){y$value <- eval(parse(text = rule))}
return(y)
}
# rename variables
bhps_match <- bhps_codebook %>%
filter(category == "match") %>%
select(-wave, -waveletter) %>%
group_by(name) %>%
nest() %>%
ungroup() %>%
mutate(data = map(data, ~(.) %>% left_join(bhps_long)))
yrBrth <- bhps_codebook %>%
filter(name == "yearBrth") %>%
left_join(bhps_long) %>%
filter(!is.na(value)) %>%
group_by(SID) %>%
summarize(yearBrth = Mode(value)) %>%
ungroup()
bhps_match <- bhps_match %>%
filter(name != "yearBrth") %>%
mutate(data = map(data, ~(.) %>% full_join(yrBrth)))
# recode
bhps_match <- bhps_match %>%
mutate(data = map(data, ~(.) %>%
select(SID, year, recode, itemname, value, yearBrth) %>%
distinct() %>%
group_by(recode, year, itemname) %>%
nest() %>%
ungroup() %>%
mutate(data = pmap(list(recode, data, year), recode_fun)) %>%
unnest(data)))
bhps_htwt <- bhps_match %>%
filter(name %in% c("height", "weight")) %>%
unnest(data) %>%
filter(!is.na(value)) %>%
select(year, name, itemname, value, SID) %>%
group_by(SID, itemname, year) %>%
summarize(value = mean(value, na.rm = T)) %>%
ungroup() %>%
spread(itemname, value) %>%
# filter(weightStn != 0) %>%
mutate(height = rowSums(cbind(heightft, heightin), na.rm = T),
weight = ifelse(is.na(weightStn), weightkg, rowSums(cbind(weightlbs, weightStn), na.rm = T)),
height = height/100) %>%
filter(height > .5 & height < 2.5 & weight > 20 & weight < 150) %>%
mutate(BMI = weight / (height)^2,
BMI = ifelse(is.infinite(BMI), NA, BMI)) %>%
select(-heightft, -heightin, -weightlbs, -weightStn, -weightkg, -year) %>%
gather(key = name, value = value, height, weight, BMI) %>%
group_by(SID, name) %>%
summarize(value = mean(value, na.rm = T)) %>%
ungroup() %>%
spread(name, value) %>%
mutate_all(~ifelse(is.infinite(.) | is.nan(.), NA, .))
bhps_relig <- bhps_codebook %>%
filter(name == "religion") %>%
left_join(bhps_long) %>%
group_by(year, recode) %>%
nest() %>%
mutate(data = pmap(list(recode, data, year), recode_fun)) %>%
unnest(data) %>%
ungroup() %>%
filter(!is.na(value)) %>%
group_by(SID, name) %>%
summarize(value = Mode(value))
# reverse code
bhps_match <- bhps_match %>%
mutate(data = map(data, ~(.) %>%
select(-recode) %>%
left_join(bhps_codebook %>% select(year, itemname, reverse_code, mini:comp_rule)) %>%
mutate(value = ifelse(reverse_code == "no" | is.na(reverse_code), value, reverse.code(-1, value, mini = mini, maxi = maxi)))))
parIDs <- bhps_match %>%
filter(name %in% c("momID", "dadID")) %>%
unnest(data) %>%
select(SID, year, parID = name, value) %>%
distinct() %>%
filter(value != 0) %>%
group_by(SID, parID) %>%
filter(year == min(year)) %>%
ungroup() %>%
spread(parID, value) %>%
mutate_at(vars(momID, dadID),
~mapvalues(., bhps_xwaveid$pid, bhps_xwaveid$pidp, warn_missing = F))
parWages <- bhps_match %>%
filter(name == "grsWages") %>%
unnest(data) %>%
rename(momID = SID) %>%
right_join(parIDs %>% select(momID, SID)) %>%
filter(!is.na(value)) %>%
mutate(itemname = "momGrsWages") %>%
select(-(reverse_code:maxi), -momID) %>%
full_join(
bhps_match %>%
filter(name == "grsWages") %>%
unnest(data) %>%
rename(dadID = SID) %>%
right_join(parIDs %>% select(dadID, SID)) %>%
filter(!is.na(value)) %>%
mutate(itemname = "dadGrsWages") %>%
select(-(reverse_code:maxi), -dadID)
) %>%
mutate(name = "parGrsWages") %>%
group_by(year, name, SID) %>%
summarize(value = mean(value, na.rm = T)) %>%
group_by(SID) %>%
summarize(parGrsWages = mean(value, na.rm = T)) %>%
ungroup()
fun_call <- function(x, rule){
switch(rule,
average = mean(x, na.rm = T),
mode = Mode(x)[1],
sum = sum(x, na.rm = T),
skip = unique(x)[1],
max = max(x, na.rm = T),
min = min(x, na.rm = T))
}
# compositing within years
year_comp_fun <- function(df, rule){
rule <- ifelse(is.na(rule), "skip", rule)
df %>%
group_by(SID, yearBrth, year) %>% # group by person and item (collapse across age)
summarize(value = fun_call(value, rule)) %>%
ungroup() %>%
mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
}
bhps_waves <- p_waves %>% filter(Study == "BHPS") %>% select(Used) %>% distinct()
bhps_match <- bhps_match %>%
mutate(data = map(data, ~(.) %>%
left_join(yrBrth) %>%
filter(year <= max(bhps_waves$Used)) %>%
group_by(comp_rule) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(data, comp_rule, year_comp_fun)) %>%
unnest(data))) %>%
unnest(data)
comp_fun <- function(rule, p_year){
rule <- ifelse(is.na(rule), "skip", rule)
bhps_match %>%
filter(year <= p_year & comp_rule == rule) %>%
group_by(SID, yearBrth, name) %>%
summarize(value = fun_call(value, rule)) %>%
ungroup() %>%
mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
}
bhps_match <- crossing(
p_year = bhps_waves$Used,
comp_rule = unique(bhps_match$comp_rule)
) %>%
mutate(data = map2(comp_rule, p_year, comp_fun)) %>%
unnest(data) %>%
select(-comp_rule) %>%
filter(name != "HHID") %>%
pivot_wider(names_from = name, values_from = value, values_fn = list(value = ~max(., na.rm = T))) %>%
mutate_all(~ifelse(is.infinite(.), NA, .)) %>%
full_join(bhps_htwt) %>%
full_join(parWages) %>%
filter(!is.na(p_year))
## # A tibble: 95,699 x 67
## SID p_year A C DEP E LOC N O OP SE SS SWB SWL yearBrth drVisits genderRoles grsWages nbhdQual SRhealth
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 9527 1996 NA NA 1.83 NA NA NA NA NA 1 NA 10 4 1961 2.33 4.11 13186. 0.833 3.17
## 2 12251 1996 NA NA 1.83 NA NA NA NA NA 1 NA 10 5 1971 3.67 3.89 10485. 1 3.17
## 3 12935 1996 NA NA 1 NA NA NA NA NA 1 NA 0 4 1973 2.4 4 13109. 0.8 4.8
## 4 14287 1996 NA NA 1.92 NA NA NA NA NA 2 NA 11 5 1961 3.5 3.22 10948. 0.167 2.5
## 5 14971 1996 NA NA 2.75 NA NA NA NA NA 3 NA 21 2 1962 1.83 3.22 10948. 0.167 3.17
## 6 15645 1996 NA NA 2.5 NA NA NA NA NA 3 NA 18 2 1956 2 2.67 14686. 1 4
## 7 16339 1996 NA NA NA NA NA NA NA NA NA NA NA NA 1972 2.6 4.11 15111. 1 4.83
## 8 17687 1996 NA NA 1.75 NA NA NA NA NA 1 NA 9 6 1969 2.5 4.89 11266. 0.833 4.17
## 9 21087 1996 NA NA 1.92 NA NA NA NA NA 1 NA 11 3 1964 2.33 3.44 28184. 1 4.17
## 10 21767 1996 NA NA 2.08 NA NA NA NA NA 1 NA 13 3 1945 3.17 4.89 29317. 1 3.33
## # … with 95,689 more rows, and 47 more variables: religiosity <dbl>, exercise <dbl>, satFam <dbl>, satHH <dbl>, accidents <dbl>, disability <dbl>,
## # education <dbl>, employed <dbl>, homeHelp <dbl>, married <dbl>, smokes <dbl>, unempBen <dbl>, welfare <dbl>, mvdBigSlry <dbl>, mvdForWork <dbl>,
## # numKids <dbl>, debt <dbl>, investments <dbl>, savings <dbl>, PhysFunc <dbl>, dadID <dbl>, dentalIns <dbl>, dentist <dbl>, domesticDuties <dbl>,
## # eyeDr <dbl>, eyeDrIns <dbl>, gender <dbl>, momID <dbl>, prvntvHlthChcks <dbl>, neighbors <dbl>, satSchl <dbl>, ageMarried <dbl>, weight <dbl>,
## # height <dbl>, momAgeAtBrth <dbl>, numSiblng <dbl>, BMI <dbl>, parGrsWages <dbl>, physhlthevnt <dbl>, age <dbl>, dadEdu <dbl>, dadOccPrstg <dbl>,
## # momEdu <dbl>, momOccPrstg <dbl>, parDivorce <dbl>, race <dbl>, religion <dbl>
2.3.3 Personality Variables
# keep correct personality waves
bhps_pers <- bhps_codebook %>% mutate(reverse_code = tolower(reverse_code)) %>%
select(-new_itemname) %>%
filter(category == "pers") %>%
left_join(bhps_long) %>%
left_join(p_waves %>% filter(Study == "BHPS") %>% select(name = p_item, Used)) %>%
filter(year %in% bhps_waves$Used) %>%
distinct()
recode_fun <- function(rule, y){
x <- y$value
if(!is.na(rule)){y$value <- eval(parse(text = rule))}
return(y)
}
# recode
bhps_pers <- bhps_pers %>%
select(name, itemname, year, reverse_code:comp_rule, SID:value) %>%
group_by(recode) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(recode, data, recode_fun)) %>%
unnest(data)
# reverse code
bhps_pers <- bhps_pers %>% distinct() %>%
mutate(value = ifelse(reverse_code == "no" | is.na(reverse_code), value, reverse.code(-1, value, mini = mini, maxi = maxi)))
# alpha's
bhps_alpha <- bhps_pers %>%
select(name, itemname, year, SID, value) %>%
group_by(name, year) %>%
nest() %>%
mutate(data = map(data, ~(.) %>% distinct() %>% pivot_wider(names_from = itemname, values_from = value)),
alpha = map(data, possibly(~psych::alpha((.) %>% select(-SID)), NA_real_)))
comp_fun <- function(df, rule){
df %>%
group_by(SID, HHID) %>%
summarize(value = fun_call(value, rule)) %>%
ungroup() %>%
mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
}
# create composites
bhps_pers <- bhps_pers %>%
group_by(name, comp_rule, year) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(data, comp_rule, comp_fun)) %>%
unnest(data) %>%
select(-comp_rule, -HHID)
## # A tibble: 291,507 x 4
## name year SID value
## <chr> <dbl> <dbl> <dbl>
## 1 A 2005 14971 4
## 2 A 2005 15645 6
## 3 A 2005 16339 NA
## 4 A 2005 17015 5.67
## 5 A 2005 22445 5.33
## 6 A 2005 23807 5.33
## 7 A 2005 27284 4
## 8 A 2005 28575 3.67
## 9 A 2005 30615 5.67
## 10 A 2005 46927 4.67
## # … with 291,497 more rows
2.3.4 Outcome Variables
bhps_subs <- unique(bhps_pers$SID)
us_out <- us_out %>% filter(PID %in% bhps_subs)
exit <- sprintf("%s/data/bhps/xwlsten.sav", wd) %>% haven::read_sav(.) %>%
haven::zap_labels(.) %>%
select(SID = pidp, lrio, lewave) %>%
mutate(lrio = ifelse(lrio == 99, 1990 + lewave, NA)) %>%
full_join(crossing(SID = unique((.)$SID), p_year = bhps_waves$Used)) %>%
# filter(!value < 0)
mutate(year = ifelse(is.na(lrio) | lrio < 0, NA, lrio),
value = ifelse(year <= p_year, NA, ifelse(is.na(year), 0, ifelse(year > p_year, 1, NA)))) %>%
select(-year, -lrio, -lewave) %>% mutate(name = "mortality")
bhps_out <- bhps_codebook %>%
select(-new_itemname) %>%
filter(category == "out") %>%
left_join(bhps_long)
recode_fun <- function(rule, y, p_year){
print(rule)
x <- y$value
if(!is.na(rule)){y$value <- eval(parse(text = rule))}
return(y)
}
# recode
bhps_out <- bhps_out %>%
select(name, itemname, year, reverse_code:comp_rule, SID:value) %>%
full_join(crossing(name = unique((.)$name), p_year = bhps_waves$Used)) %>%
group_by(recode, p_year) %>%
nest() %>%
ungroup() %>%
mutate(data = pmap(list(recode, data, p_year), recode_fun)) %>%
unnest(data)
# composite within years
# compositing within years
bhps_out <- bhps_out %>%
select(-(reverse_code:maxi), -HHID, -recode) %>%
full_join(us_out %>% select(p_year:year, comp_rule, SID = PID, value)) %>%
group_by(SID, name, year, p_year) %>%
summarize(value = max(value, na.rm = T)) %>%
ungroup() %>%
mutate(value = ifelse(is.nan(value) | is.infinite(value), NA, value))
# composite across years
comp_fun <- function(p_year){
bhps_out %>%
mutate(group = ifelse(year > p_year, "future", "past")) %>%
group_by(SID, name, group,) %>%
summarize(value = max(value, na.rm = T)) %>%
ungroup() %>%
mutate(value = ifelse(is.infinite(value), NA, value)) %>%
pivot_wider(names_from = group, values_from = value) %>%
group_by(SID, name) %>%
mutate(value = ifelse(is.na(past) | (past == 0 & !is.na(future)), future,
ifelse(past == 0 & is.na(future), past,
ifelse(past == 1, NA, NA))))
}
bhps_out <- bhps_out %>%
mutate(group = ifelse(year > p_year, "future", "past")) %>%
group_by(SID, name, group, p_year) %>%
summarize(value = max(value, na.rm = T)) %>%
ungroup() %>%
mutate(value = ifelse(is.infinite(value), NA, value)) %>%
pivot_wider(names_from = group, values_from = value) %>%
group_by(SID, name, p_year) %>%
mutate(value = ifelse(is.na(past) | (past == 0 & !is.na(future)), future,
ifelse(past == 0 & is.na(future), past,
ifelse(past == 1, NA, NA)))) %>%
ungroup()
bhps_out <- bhps_out %>%
filter(name != "mortality") %>%
full_join(
bhps_out %>%
filter(name == "mortality") %>%
full_join(exit) %>%
distinct() %>%
group_by(SID, name, p_year) %>%
summarize(value = max(value, na.rm = T)) %>%
ungroup() %>%
mutate(value = ifelse(is.infinite(value), NA, value))
)
## # A tibble: 1,154,787 x 6
## SID name p_year past future value
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 687 divorced 1996 0 NA 0
## 2 687 divorced 2001 0 NA 0
## 3 687 divorced 2005 0 NA 0
## 4 687 edu 1996 0 NA 0
## 5 687 edu 2001 0 NA 0
## 6 687 edu 2005 0 NA 0
## 7 687 frstjob 1996 1 NA NA
## 8 687 frstjob 2001 1 NA NA
## 9 687 frstjob 2005 1 NA NA
## 10 687 mntlhlthevnt 1996 0 NA 0
## # … with 1,154,777 more rows
2.3.5 Covariates
bhps_match <- bhps_pers %>%
spread(name, value) %>%
rename(p_year = year) %>%
full_join(bhps_match) %>%
# physical health event
left_join(bhps_out %>% filter(name == "physhlthevnt") %>% select(p_year, SID, physhlthevnt = past) %>% distinct() %>% group_by(SID, p_year) %>% summarize(physhlthevnt = max(physhlthevnt)) %>% ungroup()) %>%
distinct()
# parental divorce
bhps_match <- bhps_out %>%
filter(name == "divorced") %>%
rename(momID = SID) %>%
select(-name) %>%
right_join(parIDs %>% select(momID, SID)) %>%
filter(!is.na(past)) %>%
mutate(itemname = "momDivorced", name = "parDivorce") %>%
select(p_year, SID, name, value = past, itemname) %>%
full_join(
bhps_out %>%
filter(name == "divorced") %>%
rename(dadID = SID) %>%
right_join(parIDs %>% select(dadID, SID)) %>%
filter(!is.na(past)) %>%
mutate(itemname = "dadDivorced", name = "parDivorce") %>%
select(p_year, SID, name, value = past, itemname)
) %>%
group_by(p_year, SID, name) %>%
summarize(parDivorce = max(value, na.rm = T)) %>%
ungroup() %>%
select(-name) %>%
right_join(bhps_match)
bhps_match <- bhps_codebook %>%
filter(grepl("OccPrstg", name)) %>%
group_by(name) %>%
nest() %>%
ungroup() %>%
mutate(data = map(data, ~(.) %>% left_join(bhps_long))) %>%
unnest(data) %>%
mutate(value = ifelse(value < 0, NA, value)) %>%
filter(!is.na(value)) %>%
group_by(SID, name) %>%
filter(year == min(year)) %>%
select(name, SID, value) %>%
distinct() %>%
spread(name, value) %>%
right_join(bhps_match) %>%
ungroup() %>%
filter(!is.na(p_year)) %>%
# select(-comp_rule, -HHID) %>%
mutate_all(~ifelse(is.infinite(.), NA, .))
bhps_out <- bhps_out %>%
select(-past, -future) %>%
distinct()
bhps_dem <- bhps_match %>%
select(SID, dadOccPrstg, momOccPrstg, parDivorce,
race = ethnicity, momEdu, dadEdu)%>%
gather(key = item, value = value, -SID, na.rm = T) %>%
group_by(SID, item) %>%
summarize(value = max(value, na.rm = T)) %>%
ungroup() %>%
spread(item,value) %>%
full_join(bhps_relig %>% spread(name, value))
# age
bhps_match <- bhps_match %>%
mutate(age = p_year - yearBrth) %>%
select(-dadOccPrstg, -momOccPrstg, -parDivorce,
-ethnicity, -momEdu, -dadEdu, -religion) %>%
full_join(bhps_dem)
bhps_SCA <- bhps_match %>%
select(SID, p_year, one_of(c(unique(specifications$name), "momOccPrstg", "dadOccPrstg", "momEdu", "dadEdu"))) %>%
group_by(SID) %>%
mutate(parEdu = max(cbind(momEdu, dadEdu), na.rm = T),
parOccPrstg = max(cbind(momOccPrstg, dadOccPrstg), na.rm = T)) %>%
mutate_at(vars(parEdu, parOccPrstg), ~ifelse(is.infinite(.), NA, .)) %>%
ungroup() %>%
select(-momEdu, -dadEdu, -momOccPrstg, -dadOccPrstg)
## # A tibble: 95,699 x 19
## SID p_year age education gender grsWages race physhlthevnt SRhealth smokes exercise BMI married numKids parDivorce PhysFunc religion parEdu
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 9527 1996 35 0 1 13186. 0 1 3.17 1 3.5 NA 0 1 NA NA NA NA
## 2 12251 1996 25 0 1 10485. 2 1 3.17 0 1 NA 0 NA 1 NA NA 0
## 3 12935 1996 23 0 1 13109. 1 0 4.8 1 3 NA 0 NA 1 NA NA NA
## 4 14287 1996 35 0 0 10948. 0 1 2.5 1 1 NA 1 1 NA NA NA NA
## 5 14971 1996 34 0 1 10948. 1 1 3.17 1 2 NA 1 1 NA NA NA 1
## 6 15645 1996 40 1 1 14686. 0 1 4 0 3 NA 1 3 NA NA NA 0
## 7 16339 1996 24 0 1 15111. 0 0 4.83 0 NA NA 0 NA 0 NA NA NA
## 8 17687 1996 27 0 1 11266. 0 0 4.17 1 3 NA 0 1 NA NA NA NA
## 9 21087 1996 32 0 1 28184. 1 1 4.17 1 3.5 NA 0 NA NA NA NA NA
## 10 21767 1996 51 2 1 29317. 0 0 3.33 0 5 NA 0 NA NA NA NA NA
## # … with 95,689 more rows, and 1 more variable: parOccPrstg <dbl>
2.3.6 Imputation
# short helper functions to (1) identify and create factors, save col names of factors, remove columns with all missing values, and setup amelia
factor_fun <- function(x){if(is.numeric(x)){diff(range(x, na.rm = T)) %in% 1:2} else{F}}
not_all_na <- function(x) any(!is.na(x))
not_all_id <- function(x) if(is.numeric(x)) sd(x, na.rm = T) != 0
mice_fun <- function(df){
df <- df %>% select(-homeHelp, -dentalIns, -eyeDrIns)
mice(df, m = 5, maxit=5, printFlag=TRUE, method = "cart")
}
start <- Sys.time()
bhps_match_imp <- bhps_match %>%
group_by(SID, p_year) %>%
mutate(parOccPrstg = max(cbind(momOccPrstg, dadOccPrstg), na.rm = T)) %>%
ungroup() %>%
# filter(p_year==2001) %>%
select(everything(), -dadID, -momID, -momOccPrstg,-dadOccPrstg) %>%
filter(!is.na(p_year) & !is.na(SID) & !is.na(yearBrth)) %>%
mutate(parOccPrstg = ifelse(is.infinite(parOccPrstg), NA, parOccPrstg)) %>%
group_by(p_year) %>%
nest() %>%
ungroup() %>%
mutate(data = map(data, ~(.) %>% select_if(not_all_na) %>% select_if(not_all_id)),
data = map(data, ~(.) %>% mutate_if(factor_fun, as.factor)),
imp = map(data, mice_fun))
beepr::beep(sound = 8)
print(Sys.time() - start)
bhps_match_imp <- bhps_match_imp %>%
filter(year != 1991) %>%
mutate(imp_df = map(imp, ~mice::complete(., "long")),
imp_df = map(imp_df, ~(.) %>%
group_by(SID) %>%
mutate(parEdu = max(cbind(momEdu, dadEdu), na.rm = T)) %>%
select(-momEdu, -dadEdu) %>%
ungroup()),
imp_sca = map(imp_df, ~(.) %>% filter(.imp == 1) %>% select(SID, one_of(unique(specifications$name)))))
bhps_match_imp_long <- bhps_match_imp %>%
select(p_year, imp_df) %>%
unnest(imp_df) %>%
select(-SWB)
bhps_SCA_imp <- bhps_match_imp %>%
select(p_year, imp_sca) %>%
unnest(imp_sca) %>%
full_join(bhps_SCA %>% select(p_year = year, SID, BMI))
unique(specifications$name)[!unique(specifications$name) %in% colnames(bhps_SCA_imp)]
save(bhps_match_imp_long, bhps_SCA_imp,
file = sprintf("%s/data/imputed/bhps_imputed_small.RData", wd))
save(bhps_match_imp,
file = sprintf("%s/data/imputed/bhps_imputed.RData", wd))
save(bhps_alpha, bhps_pers, bhps_out, bhps_match, bhps_SCA,
file = sprintf("%s/data/clean/bhps_cleaned.RData", wd))
save(bhps_long, file = sprintf("%s/data/clean/bhps_raw_long.RData", wd))
rm(list =ls()[grepl("bhps", ls())])
rm(list = ls()[grepl("us", ls())])
2.4 GSOEP
The German Socioeconomic Panel Study (GSOEP; Socio-Economic Panel, 2017) is an ongoing longitudinal study of German collected by the German Institute of Economic Research (DIW Berlin). The data are freely available at https://www.diw.de/soep by application.
Data have been collected annually since 1984 (the latest data release includes data up to 2017). Participants have been recruited from more than 11,000 households, which are nationally representative of private German households. 20,000 individuals are sampled each year, on average. It is critical to note that the GSOEP samples households, not individuals, and the households consist of individuals living in both the “old” and “new” federal states (the former West and East Germany), foreigners, and recent immigrants to Germany.
Sample size varies by year, ranging from approximately 10,000 (1989) to 31,000 (2013). This provides 99% power to detect a zero-order correlation effect size of ~.06, two-tailed at alpha < .05.
2.4.1 Load Data
gsoep_read_fun <- function(Year, WL){
old.names <- (gsoep_codebook %>% filter(year == Year | category == "proc"))$orig_itemname
p <- sprintf("%s/data/gsoep/%sp.sav", wd, WL) %>% haven::read_sav(.) %>%
full_join(sprintf("%s/data/gsoep/%skind.sav", wd, WL) %>% haven::read_sav(.)) %>%
full_join(sprintf("%s/data/gsoep/%spequiv.sav", wd, WL) %>% haven::read_sav(.)) %>%
full_join(sprintf("%s/data/gsoep/%spgen.sav", wd, WL) %>% haven::read_sav(.)) %>%
full_join(sprintf("%s/data/gsoep/%spkal.sav", wd, WL) %>% haven::read_sav(.)) %>%
select(one_of(old.names)) %>%
gather(key = orig_itemname, value = value, -persnr, -hhnr, na.rm = T)
sprintf("%s/data/gsoep/%shbrutto.sav", wd, WL) %>% haven::read_sav(.) %>%
full_join(sprintf("%s/data/gsoep/%sh.sav", wd, WL) %>% haven::read_sav(.)) %>%
select(one_of(old.names)) %>%
gather(key = orig_itemname, value = value, -hhnr, na.rm = T) %>%
full_join(p %>% select(persnr, hhnr) %>% distinct()) %>%
full_join(p)
}
gsoep_codebook <- (codebook %>% filter(study == "GSOEP"))$codebook[[1]] %>%
mutate(orig_itemname = str_to_lower(orig_itemname))
gsoep_codebook
## # A tibble: 1,843 x 20
## study dataset category name itemname wave waveletter year new_itemname orig_itemname description scale reverse_code recode mini maxi comp_rule
## <chr> <chr> <chr> <chr> <chr> <dbl> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <chr>
## 1 gsoep apequiv match age age 1 a 1984 gsoep__matc… d1110184 Age of Ind… <NA> <NA> ifels… NA NA skip
## 2 gsoep bpequiv match age age 2 b 1985 gsoep__matc… d1110185 Age of Ind… <NA> <NA> ifels… NA NA skip
## 3 gsoep cpequiv match age age 3 c 1986 gsoep__matc… d1110186 Age of Ind… <NA> <NA> ifels… NA NA skip
## 4 gsoep dpequiv match age age 4 d 1987 gsoep__matc… d1110187 Age of Ind… <NA> <NA> ifels… NA NA skip
## 5 gsoep epequiv match age age 5 e 1988 gsoep__matc… d1110188 Age of Ind… <NA> <NA> ifels… NA NA skip
## 6 gsoep fpequiv match age age 6 f 1989 gsoep__matc… d1110189 Age of Ind… <NA> <NA> ifels… NA NA skip
## 7 gsoep gpequiv match age age 7 g 1990 gsoep__matc… d1110190 Age of Ind… <NA> <NA> ifels… NA NA skip
## 8 gsoep hpequiv match age age 8 h 1991 gsoep__matc… d1110191 Age of Ind… <NA> <NA> ifels… NA NA skip
## 9 gsoep ipequiv match age age 9 i 1992 gsoep__matc… d1110192 Age of Ind… <NA> <NA> ifels… NA NA skip
## 10 gsoep jpequiv match age age 10 j 1993 gsoep__matc… d1110193 Age of Ind… <NA> <NA> ifels… NA NA skip
## # … with 1,833 more rows, and 3 more variables: ...18 <dbl>, ...19 <lgl>, ...20 <lgl>
gsoep <- gsoep_codebook %>%
select(wave, waveletter, year) %>%
filter(complete.cases(.)) %>%
distinct() %>%
mutate(data = map2(year, waveletter, gsoep_read_fun))
old.names <- unique(gsoep_codebook$orig_itemname)
gsoep_post <- sprintf("%s/data/gsoep/gpost.sav", wd) %>% haven::read_sav(.) %>%
full_join(sprintf("%s/data/gsoep/hpost.sav", wd) %>% haven::read_sav(.)) %>%
select(persnr, hhnr, one_of(old.names)) %>%
haven::zap_labels(.) %>%
select(-hhnr) %>%
gather(key = orig_itemname, value = value, -persnr, na.rm = T)
gsoep_long <- gsoep %>% unnest(data) %>%
select(-hhnr) %>%
full_join(gsoep_post)
save(gsoep, file = sprintf("%s/data/clean/gsoep_raw.RData", wd))
rm(gsoep)
2.4.2 Matching Variables
recode_fun <- function(rule, y, year){
yearBrth <- y$yearBrth
x <- y$value
if(!is.na(rule)){y$value <- eval(parse(text = rule))}
return(y)
}
# rename variables
gsoep_match <- gsoep_codebook %>%
filter(name == "yearBrth") %>%
filter(category == "match") %>%
group_by(name) %>%
nest() %>%
ungroup() %>%
mutate(data = map(data, ~(.) %>% left_join(gsoep_long)))
yrBrth <- gsoep_match %>%
filter(name == "yearBrth") %>%
unnest(data) %>%
group_by(persnr) %>%
summarize(yearBrth = max(value, na.rm = T)) %>%
ungroup() %>%
mutate(yearBrth = ifelse(is.infinite(yearBrth), NA, yearBrth))
# recode
gsoep_match <- gsoep_match %>%
filter(name != "yearBrth") %>%
mutate(data = map(data, ~(.) %>%
left_join(yrBrth) %>%
group_by(recode, year) %>%
nest() %>%
ungroup() %>%
mutate(data = pmap(list(recode, data, year), recode_fun)) %>%
unnest(data)))
# reverse code
gsoep_match <- gsoep_match %>%
mutate(data = map(data, ~(.) %>%
mutate(value = ifelse(reverse_code == "no" | is.na(reverse_code), value,
reverse.code(-1, value, mini = mini, maxi = maxi)))))
fun_call <- function(x, rule){
switch(rule,
average = mean(x, na.rm = T),
mode = Mode(x)[1],
sum = sum(x, na.rm = T),
skip = unique(x)[1],
max = max(x, na.rm = T),
min = min(x, na.rm = T))
}
# compositing within years
year_comp_fun <- function(df, rule){
df %>%
group_by(persnr, yearBrth, year) %>% # group by person and item (collapse across age)
summarize(value = fun_call(value, rule)) %>%
ungroup()
}
gsoep_waves <- p_waves %>% filter(Study == "GSOEP") %>% select(Used) %>% distinct()
gsoep_match <- gsoep_match %>%
mutate(data = map(data, ~(.) %>%
filter(year <= max(gsoep_waves$Used)) %>%
group_by(comp_rule) %>%
nest() %>%
ungroup() %>%
mutate(comp_rule = ifelse(is.na(comp_rule), "skip", comp_rule),
data = map2(data, comp_rule, year_comp_fun)) %>%
unnest(data))) %>%
unnest(data)
comp_fun <- function(rule, p_year){
gsoep_match %>%
filter(year <= p_year & comp_rule == rule) %>%
group_by(persnr, yearBrth, name) %>%
summarize(value = fun_call(value, rule)) %>%
ungroup()
}
gsoep_match <- crossing(
p_year = gsoep_waves$Used,
comp_rule = unique(gsoep_match$comp_rule)
) %>%
mutate(data = map2(comp_rule, p_year, comp_fun)) %>%
unnest(data) %>%
mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value)) %>%
select(-comp_rule, -yearBrth) %>%
pivot_wider(names_from = name, values_from = value)
## # A tibble: 346,921 x 62
## year SID A C DEP E LOC N `NA` O OP PA SE SS SWL drVisits exercise grsWages satHealth satHH satIncome social
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1994 201 NA NA NA NA 2.62 NA NA NA NA NA NA NA 5 20 1 2979. 6.5 3.5 5.36 2.71
## 2 1994 203 NA NA NA NA 3.25 NA NA NA NA NA NA NA 6 3.5 4 5054. 9.5 3.5 6.5 1.42
## 3 1994 204 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 2599. NA 3.88 NA NA
## 4 1994 601 NA NA NA NA 3.5 NA NA NA NA NA NA NA 9 9.33 3.86 31259. 7.4 2.83 7.18 1.93
## 5 1994 602 NA NA NA NA 3 NA NA NA NA NA NA NA 4 9.33 2.86 35436. 6.6 2.83 7.27 1.79
## 6 1994 603 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 66717. NA 3.08 NA NA
## 7 1994 604 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 69202. NA 3.62 NA NA
## 8 1994 901 NA NA NA NA 3.12 NA NA NA NA NA NA NA 6 18.2 3.71 13774. 6.6 3.33 5.91 2.07
## 9 1994 1101 NA NA NA NA 2.25 NA NA NA NA NA NA NA 10 29.8 1 1940. 3.7 4 7.18 2.29
## 10 1994 1202 NA NA NA NA 2.62 NA NA NA NA NA NA NA 9 21.8 1 2989. 5.1 3.6 8 2.71
## # … with 346,911 more rows, and 40 more variables: SRhealth <dbl>, nbhdQual <dbl>, religiosity <dbl>, disability <dbl>, education <dbl>, HHsize <dbl>,
## # hlthcare <dbl>, married <dbl>, numKids <dbl>, PhysFunc <dbl>, unempBen <dbl>, urban <dbl>, broPres <dbl>, dadPres <dbl>, momPres <dbl>,
## # religion <dbl>, sisPres <dbl>, ageMarried <dbl>, gender <dbl>, age <dbl>, occPrestige <dbl>, height <dbl>, smokes <dbl>, weight <dbl>,
## # welfare <dbl>, eatingHabits <dbl>, dadAlive <dbl>, momAlive <dbl>, alcohol <dbl>, satFam <dbl>, auntPres <dbl>, gmaPres <dbl>, gpaPres <dbl>,
## # unclPres <dbl>, dadEdu <dbl>, dadOccPrstg <dbl>, momEdu <dbl>, momOccPrstg <dbl>, physhlthevnt <dbl>, BMI <dbl>
2.4.3 Personality Variables
# keep correct personality waves
gsoep_pers <- gsoep_codebook %>%
select(-new_itemname) %>%
filter(category == "pers") %>%
left_join(gsoep_long) %>%
left_join(p_waves %>% filter(Study == "GSOEP") %>% select(name = p_item, Used)) %>%
filter(year %in% Used)
recode_fun <- function(rule, y){
x <- y$value
if(!is.na(rule)){y$value <- eval(parse(text = rule))}
return(y)
}
# recode
gsoep_pers <- gsoep_pers %>%
select(name, itemname, year, reverse_code:comp_rule, persnr, value) %>%
group_by(recode) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(recode, data, recode_fun)) %>%
unnest(data)
# reverse code
# reverse code
gsoep_pers <- gsoep_pers %>% distinct() %>%
mutate(value = ifelse(reverse_code == "no" | is.na(reverse_code), value, reverse.code(-1, value, mini = mini, maxi = maxi)))
# alpha's
gsoep_alpha <- gsoep_pers %>%
select(name, itemname, year, persnr, value) %>%
group_by(name, year) %>%
nest() %>%
mutate(data = map(data, ~(.) %>% pivot_wider(names_from = itemname, values_from = value)),
alpha = map(data, possibly(~psych::alpha((.) %>% select(-persnr)), NA_real_)))
comp_fun <- function(df, rule){
df %>%
group_by(persnr) %>%
summarize(value = fun_call(value, rule)) %>%
ungroup()
}
# create composites
gsoep_pers <- gsoep_pers %>%
group_by(name, comp_rule, year) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(data, comp_rule, comp_fun)) %>%
unnest(data) %>%
select(-comp_rule) %>%
mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
## # A tibble: 504,605 x 4
## name year SID value
## <chr> <dbl> <dbl> <dbl>
## 1 A 2005 201 7
## 2 A 2005 203 4.67
## 3 A 2005 602 4.33
## 4 A 2005 901 4.67
## 5 A 2005 1202 7
## 6 A 2005 1501 4.67
## 7 A 2005 1601 4
## 8 A 2005 1602 6.67
## 9 A 2005 1603 4.33
## 10 A 2005 1701 7
## # … with 504,595 more rows
2.4.4 Outcome Variables
gsoep_pers_subs <- unique(gsoep_pers$persnr)
gsoep_waves <- p_waves %>% filter(Study == "GSOEP") %>% select(Used) %>% distinct()
exit <- sprintf("%s/data/gsoep/ppfad.sav", wd) %>% haven::read_sav(.) %>%
select(persnr, todjahr) %>%
gather(key = orig_itemname, value = value, -persnr) %>%
full_join(crossing(orig_itemname = "todjahr", p_year = gsoep_waves$Used)) %>%
# filter(!value < 0)
mutate(year = ifelse(is.na(value) | value < 0, NA, value),
value = ifelse(year <= p_year | is.na(year), 0, ifelse(year > p_year, 1, NA))) %>%
left_join(gsoep_codebook %>% select(orig_itemname, name)) %>%
select(-year, -orig_itemname) %>%
filter(persnr %in% gsoep_pers_subs)
gsoep_out <- gsoep_codebook %>%
select(-new_itemname) %>%
filter(category == "out") %>%
left_join(gsoep_long %>% filter(persnr %in% gsoep_pers_subs))
# recode
gsoep_out <- gsoep_out %>%
select(name, itemname, year, reverse_code:comp_rule, persnr, recode, value) %>%
group_by(recode) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(recode, data, recode_fun)) %>%
unnest(data)
# composite within years
# compositing within years
year_comp_fun <- function(df, rule){
df %>%
group_by(persnr, name, year) %>% # group by person and item (collapse across age)
summarize(value = fun_call(value, rule)) %>%
ungroup()
}
gsoep_out <- gsoep_out %>%
# filter(year <= max(gsoep_waves$Used)) %>%
group_by(comp_rule) %>%
nest() %>%
ungroup() %>%
mutate(comp_rule = ifelse(comp_rule == "select", "skip", comp_rule),
data = map2(data, comp_rule, year_comp_fun)) %>%
unnest(data)
# composite across years
comp_fun <- function(p_year){
gsoep_out %>%
mutate(group = ifelse(year > p_year, "future", "past")) %>%
group_by(persnr, name, group) %>%
summarize(value = max(value, na.rm = T)) %>%
ungroup() %>%
mutate(value = ifelse(is.infinite(value), NA, value)) %>%
pivot_wider(names_from = group, values_from = value) %>%
group_by(persnr, name) %>%
mutate(value = ifelse(is.na(past) | (past == 0 & !is.na(future)), future,
ifelse(past == 0 & is.na(future), past,
ifelse(past == 1, NA, NA))))
}
gsoep_out <- tibble(p_year = gsoep_waves$Used) %>%
mutate(data = map(p_year, comp_fun)) %>%
unnest(data) %>%
filter(name != "mortality") %>%
full_join(exit)
## # A tibble: 4,008,294 x 6
## p_year SID name future past value
## <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 2005 201 chldbrth 0 0 0
## 2 2005 201 chldmvout 0 0 0
## 3 2005 201 divorced 0 0 0
## 4 2005 201 edu NA NA NA
## 5 2005 201 frstJob 0 0 0
## 6 2005 201 married 0 0 0
## 7 2005 201 mntlhlthevnt 0 0 0
## 8 2005 201 mvInPrtnr 0 0 0
## 9 2005 201 physhlthevnt 0 1 NA
## 10 2005 201 retired 1 1 NA
## # … with 4,008,284 more rows
2.4.5 Covariates
recode_fun <- function(rule, y){
x <- y$value
if(!is.na(rule)){y$value <- eval(parse(text = rule))}
return(y)
}
old.names <- unique(gsoep_codebook$orig_itemname)
gsoep_bp <- sprintf("%s/data/gsoep/bioparen.sav", wd) %>%
haven::read_sav(.) %>%
select(one_of(old.names), -hhnr) %>%
gather(key = orig_itemname, value = value, -persnr, -fnr, -mnr, na.rm = T) %>%
left_join(gsoep_codebook) %>%
group_by(recode) %>%
nest() %>%
ungroup() %>%
mutate(data = pmap(list(recode, data), recode_fun)) %>%
unnest(data) %>%
filter(!is.na(value)) %>%
group_by(persnr, name, fnr, mnr) %>%
summarize(value = fun_call(value, comp_rule)) %>%
ungroup() %>%
spread(name, value) %>%
mutate_at(vars(fnr, mnr), ~ifelse((.) < 0, NA, .)) %>%
rename(dadID = fnr, momID = mnr)
# parental divorce
gsoep_match <- gsoep_out %>%
filter(name == "divorced") %>%
rename(momID = persnr) %>%
right_join(gsoep_bp %>% select(momID, persnr)) %>%
filter(!is.na(past)) %>%
mutate(itemname = "momDivorced", name = "parDivorce") %>%
select(p_year, persnr, name, value = past, itemname) %>%
full_join(
gsoep_out %>%
filter(name == "divorced") %>%
rename(dadID = persnr) %>%
right_join(gsoep_bp %>% select(dadID, persnr)) %>%
filter(!is.na(past)) %>%
mutate(itemname = "dadDivorced", name = "parDivorce") %>%
select(p_year, persnr, name, value = past, itemname)
) %>%
group_by(p_year, persnr, name) %>%
summarize(parDivorce = max(value, na.rm = T)) %>%
ungroup() %>%
select(-name) %>%
right_join(gsoep_match)
gsoep_match <- gsoep_pers %>%
spread(name, value) %>%
full_join(gsoep_match %>% rename(year = p_year)) %>%
select(-momOccPrstg, -dadOccPrstg) %>%
full_join(gsoep_bp %>% select(-momID, -dadID)) %>%
# physical health event
left_join(gsoep_out %>% filter(name == "physhlthevnt") %>% select(year = p_year, persnr, physhlthevnt = past) %>% distinct() %>% group_by(persnr, year) %>% summarize(physhlthevnt = max(physhlthevnt)) %>% ungroup()) %>%
distinct()
gsoep_out <- gsoep_out %>%
group_by(persnr, p_year, name) %>%
summarize(value = max(value, na.rm = T)) %>%
ungroup() %>%
distinct() %>%
mutate(value = ifelse(is.infinite(value), NA, value))
gsoep_match <- gsoep_match %>%
group_by(persnr, year) %>%
mutate(BMI = ifelse(!is.na(weight) & !is.na(height), weight/(height/100)^2, NA)) %>%
ungroup()
gsoep_SCA <- gsoep_match %>%
select(persnr, p_year = year, one_of(c(unique(specifications$name), "momOccPrstg", "dadOccPrstg", "momEdu", "dadEdu"))) %>%
group_by(persnr) %>%
mutate(parEdu = max(cbind(momEdu, dadEdu), na.rm = T),
parOccPrstg = max(cbind(momOccPrstg, dadOccPrstg), na.rm = T)) %>%
mutate_at(vars(parEdu, parOccPrstg), ~ifelse(is.infinite(.), NA, .)) %>%
ungroup() %>%
select(-momEdu, -dadEdu, -momOccPrstg, -dadOccPrstg)
unique(specifications$name)[!unique(specifications$name) %in% colnames(gsoep_match)]
## # A tibble: 346,921 x 18
## SID p_year age education gender grsWages physhlthevnt SRhealth smokes alcohol exercise BMI married numKids PhysFunc religion parEdu parOccPrstg
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 201 1994 58 NA 1 2979. 1 2.5 NA NA 1 NA 0 0 0 1 0 30
## 2 203 1994 24 2 NA 5054. 1 4.5 NA NA 4 NA 0 0 0 0 0 49
## 3 204 1994 NA NA NA 2599. 0 NA NA NA NA NA 0 0 NA NA NA NA
## 4 601 1994 30 NA 0 31259. 1 4 NA NA 3.86 NA 1 2 0 1 0 50
## 5 602 1994 26 1 1 35436. 1 4 NA NA 2.86 NA 1 1 0 1 0 46
## 6 603 1994 43 NA NA 66717. 0 NA NA NA NA NA 0 1 NA NA NA NA
## 7 604 1994 1 NA NA 69202. 0 NA NA NA NA NA 0 1 NA NA 1 38
## 8 901 1994 33 NA 1 13774. 1 3.5 NA NA 3.71 NA 0 0 0 1 0 34
## 9 1101 1994 78 NA 1 1940. 1 3 NA NA 1 NA 0 0 0 1 0 38
## 10 1202 1994 71 NA 1 2989. 1 3 NA NA 1 NA 1 0 0 1 0 29
## # … with 346,911 more rows
2.4.6 Imputation
# short helper functions to (1) identify and create factors, save col names of factors, remove columns with all missing values, and setup amelia
factor_fun <- function(x){if(is.numeric(x)){diff(range(x, na.rm = T)) %in% 1:2} else{F}}
not_all_na <- function(x) any(!is.na(x))
not_all_id <- function(x) if(is.numeric(x)) sd(x, na.rm = T) != 0
mice_fun <- function(df){
mice(df, m = 5, maxit=5, printFlag=TRUE, method = "cart")
}
gsoep_match_imp <- gsoep_match %>%
filter(!is.na(year) & !is.na(SID)) %>%
group_by(SID) %>%
mutate(parEdu = max(cbind(momEdu, dadEdu), na.rm = T),
parOccPrstg = max(cbind(momOccPrstg, dadOccPrstg), na.rm = T)) %>%
mutate_at(vars(parEdu, parOccPrstg), ~ifelse(is.infinite(.), NA, .)) %>%
ungroup() %>%
select(-momEdu, -dadEdu, -momOccPrstg, -dadOccPrstg) %>%
group_by(year) %>%
nest() %>%
ungroup() %>%
mutate(data = map(data, ~(.) %>% select_if(not_all_na) %>% select_if(not_all_id)),
data = map(data, ~(.) %>% mutate_if(factor_fun, as.factor)),
imp = map(data, mice_fun))
beepr::beep(sound = 8)
load_fun <- function(year){
load(sprintf("~/Downloads/GSOEP_%s.RData", year))
m
}
gsoep_match_imp <- tibble(year = c(1994, 2002, 2005, 2006, 2007),
imp = map(year, load_fun))
gsoep_match_imp <- gsoep_match_imp %>%
mutate(imp_df = map(imp, ~mice::complete(., "long")),
imp_df = map(imp_df, ~(.) %>% mutate(nbhdQual = ifelse(is.factor(nbhdQual), as.numeric(as.character(nbhdQual)), nbhdQual))),
imp_sca = map(imp_df, ~(.) %>% filter(.imp == 1) %>% select(SID, one_of(unique(specifications$name)))))
gsoep_match_imp_long <- gsoep_match_imp %>%
select(year, imp_df) %>%
unnest(imp_df) %>%
mutate(alcohol = factor(ifelse(alcohol > 3, 0, 1)))
gsoep_SCA_imp <- gsoep_match_imp %>%
select(p_year = year, imp_sca) %>%
unnest(imp_sca) %>%
mutate(alcohol = ifelse(alcohol > 3, 0, 1)) %>%
full_join(gsoep_SCA %>% select(p_year, SID, BMI))
unique(specifications$name)[!unique(specifications$name) %in% colnames(gsoep_SCA_imp)]
gsoep_match_imp_long <- gsoep_match_imp_long %>%
select(SID, age) %>%
distinct() %>%
group_by(SID) %>%
summarize(age = Mode(age)) %>%
full_join(gsoep_match_imp_long %>% select(-age))
gsoep_SCA_imp <- gsoep_SCA_imp %>%
select(SID, smokes, alcohol, BMI) %>%
mutate_at(vars(smokes), ~as.numeric(as.character(.))) %>%
distinct() %>%
gather(item, value, -SID, na.rm = T) %>%
group_by(SID, item) %>%
summarize(value = max(value)) %>%
ungroup() %>%
spread(item, value) %>%
mutate_at(vars(alcohol, smokes), factor) %>%
full_join(gsoep_SCA_imp %>% select(-smokes, -alcohol, -BMI)) %>%
filter(!is.na(age)) %>%
select(-age) %>%
full_join(gsoep_match_imp_long %>% select(SID, p_year = year, age) %>% distinct())
gsoep_match_imp_long <- gsoep_match_imp_long %>% rename(p_year = year)
gsoep_match <- gsoep_match %>% rename(SID = persnr)
gsoep_out <- gsoep_out %>% rename(SID = persnr)
gsoep_pers <- gsoep_pers %>% rename(SID = persnr)
gsoep_SCA <- gsoep_SCA %>% rename(SID = persnr)
save(gsoep_SCA_imp, gsoep_match_imp_long,
file = sprintf("%s/data/imputed/gsoep_imputed_small.RData", wd))
save(gsoep_match_imp,
file = sprintf("%s/data/imputed/gsoep_imputed.RData", wd))
save(gsoep_alpha, gsoep_pers, gsoep_out, gsoep_match, gsoep_SCA,
file = sprintf("%s/data/clean/gsoep_cleaned.RData", wd))
save(gsoep_long, file = sprintf("%s/data/clean/gsoep_raw_long.RData", wd))
rm(list =ls()[grepl("gsoep", ls())])
2.5 HILDA
The Household Income and Labour Dynamics in Australia (HILDA; Wilkins, Laß, Butterworth, & Vera-Toscano, 2019) study is an ongoing longitudinal study of Australian households. These data are available through application from https://melbourneinstitute.unimelb.edu.au/hilda/for-data-users.
Participants were recruited from more than 17,000 individuals. Data have been collected annually since 2001. The latest data release includes 17 waves of data from 2001 to 2017. More documentation can be found in the HILDA data dictionary at https://www.online.fbe.unimelb.edu.au/HILDAodd/srchSubjectAreas.aspx.
Sample sizes vary by year, ranging from 12,408 (2004) to 17,693 (2016). This provides 99% power to detect a zero-order correlation effect size of ~.03, two tailed at alpha .05.
2.5.1 Load Data
hilda_read_fun <- function(Year, WL){
old.names <- (hilda_codebook %>% filter(year == Year | year == 0))$orig_itemname
sprintf("%s/data/hilda/Combined %s180c.sav", wd, WL) %>% haven::read_sav(.) %>%
select(one_of(old.names))
}
## # A tibble: 1,692 x 17
## study dataset category name itemname wave_letter ...7 year new_itemname orig_itemname description scale reverse_code recode mini maxi comp_rule
## <chr> <lgl> <chr> <chr> <chr> <chr> <dbl> <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <chr>
## 1 hilda NA match alcoh… alcohol <NA> NA 2001 hilda__matc… alsdrink How often … 1=nev… <NA> ifels… NA NA max
## 2 hilda NA match alcoh… alcohol <NA> NA 2002 hilda__matc… blsdrkf Drink alco… 1=nev… <NA> ifels… NA NA max
## 3 hilda NA match alcoh… alcohol <NA> NA 2003 hilda__matc… clsdrkf Drink alco… 1=nev… <NA> ifels… NA NA max
## 4 hilda NA match alcoh… alcohol <NA> NA 2004 hilda__matc… dlsdrkf Drink alco… 1=nev… <NA> ifels… NA NA max
## 5 hilda NA match alcoh… alcohol <NA> NA 2005 hilda__matc… elsdrkf Drink alco… 1=nev… <NA> ifels… NA NA max
## 6 hilda NA match alcoh… alcohol <NA> NA 2006 hilda__matc… flsdrkf Drink alco… 1=nev… <NA> ifels… NA NA max
## 7 hilda NA match alcoh… alcohol <NA> NA 2007 hilda__matc… glsdrkf Drink alco… 1=nev… <NA> ifels… NA NA max
## 8 hilda NA match alcoh… alcohol <NA> NA 2008 hilda__matc… hlsdrkf Drink alco… 1=nev… <NA> ifels… NA NA max
## 9 hilda NA match alcoh… alcohol <NA> NA 2009 hilda__matc… ilsdrkf Drink alco… 1=nev… <NA> ifels… NA NA max
## 10 hilda NA match alcoh… alcohol <NA> NA 2010 hilda__matc… jlsdrkf Drink alco… 1=nev… <NA> ifels… NA NA max
## # … with 1,682 more rows
2.5.2 Matching Variables
rename_fun <- function(df, Year){
df <- df %>%
haven::zap_labels(.) %>%
gather(key = orig_itemname, value = value, -xwaveid, na.rm=T) %>%
mutate(value=as.numeric(value))
hilda_codebook %>% filter(category == "match" & year == Year) %>%
full_join(df)
}
# rename variables
hilda_match <- hilda %>%
mutate(data = map2(data, year, rename_fun)) %>%
select(-year, -wave_letter) %>%
unnest(data) %>%
distinct() %>%
rename(SID = xwaveid)
yrBrth <- hilda_match %>%
filter(name == "yearBrth") %>%
group_by(SID) %>%
summarize(yearBrth = max(value, na.rm = T)) %>%
ungroup() %>%
mutate(yearBrth = ifelse(is.infinite(yearBrth), NA, yearBrth))
# recode
recode_fun <- function(rule, y, year){
yearBrth <- y$yearBrth
x <- y$value
if(!is.na(rule)){y$value <- eval(parse(text = rule))}
return(y)
}
hilda_match <- hilda_match %>%
filter(name != "yearBrth") %>%
left_join(yrBrth) %>%
group_by(recode, year) %>%
nest() %>%
ungroup() %>%
mutate(data = pmap(list(recode, data, year), recode_fun)) %>%
unnest(data)
# reverse code
hilda_match <- hilda_match %>%
mutate(value = ifelse(reverse_code == "no" | is.na(reverse_code), value,
reverse.code(-1, value, mini = mini, maxi = maxi)))
fun_call <- function(x, rule){
switch(rule,
average = mean(x, na.rm = T),
mode = Mode(x)[1],
sum = sum(x, na.rm = T),
skip = unique(x)[1],
max = max(x, na.rm = T),
min = min(x, na.rm = T))
}
# compositing within years
year_comp_fun <- function(df, rule){
df %>%
group_by(SID, yearBrth, name, year) %>% # group by person and item (collapse across age)
summarize(value = fun_call(value, rule)) %>%
ungroup()
}
hilda_waves <- p_waves %>% filter(Study == "HILDA") %>% select(Used) %>% distinct()
hilda_match <- hilda_match %>%
filter(year <= max(hilda_waves$Used)) %>%
group_by(comp_rule) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(data, comp_rule, year_comp_fun)) %>%
unnest(data)
comp_fun <- function(rule, p_year){
hilda_match %>%
filter(year <= p_year & comp_rule == rule) %>%
group_by(SID, yearBrth, name) %>%
summarize(value = fun_call(value, rule)) %>%
ungroup()
}
hilda_match <- crossing(
p_year = hilda_waves$Used,
comp_rule = unique(hilda_match$comp_rule)
) %>%
mutate(data = map2(comp_rule, p_year, comp_fun)) %>%
unnest(data) %>%
mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value)) %>%
select(-comp_rule) %>%
pivot_wider(names_from = name, values_from = value, names_repair = "unique")
## # A tibble: 82,104 x 58
## p_year SID A C DEP E IQ LOC N `NA` O PA SE SS SWL yearBrth compareHealth grsWages HHsize hlthDecline
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2003 100001 NA NA 0 NA NA 6 NA 1.33 NA 5 NA 4.3 10 1951 3.67 27180 2 4.33
## 2 2003 100002 NA NA 1 NA NA 2.71 NA 2 NA 4 NA 5 7 1952 3.17 27180 2 1.67
## 3 2003 100003 NA NA 0 NA NA 5.71 NA 2 NA 4.5 NA 4.2 7 1952 3.5 14207. 5 2
## 4 2003 100004 NA NA 0 NA NA 4.43 NA 2.67 NA 4.5 NA 5.3 9 1962 3.67 14207. 5 2.67
## 5 2003 100005 NA NA 0 NA NA 7 NA 1 NA 5 NA 7 8 1985 4 14207. 5 1
## 6 2003 100006 NA NA 1 NA NA 5 NA 3 NA 5.5 NA 4.2 9 1987 4.25 14207. 5 3
## 7 2003 100008 NA NA NA NA NA NA NA NA NA NA NA NA 7 1925 3.5 0 2 3
## 8 2003 100009 NA NA NA NA NA NA NA NA NA NA NA NA 4 1925 2 0 2 5
## 9 2003 100010 NA NA 0 NA NA 5.57 NA 1.33 NA 5 NA 5.9 6 1964 4 25000 1 2
## 10 2003 100011 NA NA 1 NA NA 6.14 NA 2.33 NA 4 NA 6.5 6 1969 3.67 0 3 3
## # … with 82,094 more rows, and 38 more variables: loneliness <dbl>, nbhdQual <dbl>, PhysFunc <dbl>, satFam <dbl>, satFinances <dbl>, satHH <dbl>,
## # sickEasy <dbl>, SRhealth <dbl>, alcohol <dbl>, dadOccPrstg <dbl>, dadOcc <dbl>, education <dbl>, employed <dbl>, exercise <dbl>, married <dbl>,
## # numKids <dbl>, urban <dbl>, welfare <dbl>, momOccPrstg <dbl>, momOcc <dbl>, disability <dbl>, ageMarried <dbl>, gender <dbl>, hospital <dbl>,
## # visitDr <dbl>, weight <dbl>, height <dbl>, numSiblng <dbl>, dadAlive <dbl>, momAlive <dbl>, physhlthevnt <dbl>, age <dbl>, BMI <dbl>, dadEdu <dbl>,
## # momEdu <dbl>, parDivorce <dbl>, religion <dbl>, smokes <dbl>
2.5.3 Personality Variables
hilda <- reduce(hilda$data, full_join) %>% haven::zap_labels(.)
hilda_long <- hilda %>%
mutate_all(as.numeric) %>%
pivot_longer(-xwaveid, names_to = "orig_itemname", values_to = "value", values_drop_na = TRUE) %>%
rename(SID = xwaveid)
# keep correct personality waves
hilda_pers <- hilda_codebook %>%
select(-new_itemname) %>%
filter(category == "pers") %>%
left_join(hilda_long) %>%
left_join(p_waves %>% filter(Study == "HILDA") %>% select(name = p_item, Used)) %>%
filter(year %in% Used)
recode_fun <- function(rule, y){
x <- y$value
if(!is.na(rule)){y$value <- eval(parse(text = rule))}
return(y)
}
# recode
hilda_pers <- hilda_pers %>%
select(name, itemname, year, reverse_code:comp_rule, SID, value) %>%
group_by(recode) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(recode, data, recode_fun)) %>%
unnest(data)
# reverse code
hilda_pers <- hilda_pers %>% distinct() %>%
mutate(value = ifelse(reverse_code == "no" | is.na(reverse_code), value, reverse.code(-1, value, mini = mini, maxi = maxi)))
# alpha's
hilda_alpha <- hilda_pers %>%
select(name, itemname, year, SID, value) %>%
group_by(name, year) %>%
nest() %>%
mutate(data = map(data, ~(.) %>% pivot_wider(names_from = itemname, values_from = value)),
alpha = map(data, possibly(~psych::alpha((.) %>% select(-SID)), NA_real_)))
comp_fun <- function(df, rule){
df %>%
group_by(SID) %>%
summarize(value = fun_call(value, rule)) %>%
ungroup()
}
# create composites
hilda_pers <- hilda_pers %>%
group_by(name, comp_rule, year) %>%
nest() %>%
ungroup() %>%
mutate(comp_rule = ifelse(is.na(comp_rule), "average", comp_rule),
data = map2(data, comp_rule, comp_fun)) %>%
unnest(data) %>%
mutate(value = ifelse(is.nan(value) | is.infinite(value), NA, value))
2.5.4 Outcome Variables
hilda_out <- hilda_codebook %>%
select(-new_itemname) %>%
filter(category == "out" & year != "0") %>%
left_join(hilda_long)
old.names <- unique((hilda_codebook %>% filter(category == "out" & year == "0"))$orig_itemname)
hilda_out_stp <- sprintf("%s/data/hilda/Master r180c.sav", wd) %>%
read_sav() %>%
zap_labels() %>%
select(SID = xwaveid, one_of(old.names)) %>%
gather(orig_itemname, value, one_of(old.names)) %>%
full_join(crossing(orig_itemname = "yodeath", year = hilda_waves$Used)) %>%
left_join(hilda_codebook %>% select(-new_itemname, -year))
# recode
hilda_out <- hilda_out %>%
select(name, itemname, year, reverse_code:comp_rule, SID, value) %>%
group_by(recode) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(recode, data, recode_fun)) %>%
unnest(data)
recode_fun <- function(rule, y, p_year){
x <- y$value
if(!is.na(rule)){y$value <- eval(parse(text = rule))}
return(y)
}
hilda_out_stp <- hilda_out_stp %>%
group_by(recode, year) %>%
nest() %>%
ungroup() %>%
mutate(data = pmap(list(recode, data, year), recode_fun)) %>%
unnest(data)
# composite within years
# compositing within years
year_comp_fun <- function(df, rule){
df %>%
group_by(SID, name, year) %>% # group by person and item (collapse across age)
summarize(value = fun_call(value, rule)) %>%
ungroup()
}
hilda_waves <- p_waves %>% filter(Study == "HILDA") %>% select(Used) %>% distinct()
hilda_out <- hilda_out %>%
# filter(year <= max(hilda_waves$Used)) %>%
group_by(comp_rule) %>%
nest() %>%
ungroup() %>%
mutate(comp_rule = ifelse(is.na(comp_rule), "max", comp_rule),
data = map2(data, comp_rule, year_comp_fun)) %>%
unnest(data) %>%
mutate(value = ifelse(is.infinite(value), NA, value))
# composite across years
comp_fun <- function(p_year){
hilda_out %>%
mutate(group = ifelse(year > p_year, "future", "past")) %>%
group_by(SID, name, group) %>%
summarize(value = max(value, na.rm = T)) %>%
ungroup() %>%
mutate(value = ifelse(is.infinite(value), NA, value)) %>%
pivot_wider(names_from = group, values_from = value) %>%
group_by(SID, name) %>%
mutate(value = ifelse(is.na(past) | (past == 0 & !is.na(future)), future,
ifelse(past == 0 & is.na(future), past,
ifelse(past == 1, NA, NA))))
}
hilda_out <- tibble(p_year = hilda_waves$Used) %>%
mutate(data = map(p_year, comp_fun)) %>%
unnest(data) %>%
full_join(hilda_out_stp %>% select(p_year = year, SID, name, value) %>% mutate(SID = as.numeric(SID)))
2.5.5 Covariates
hilda_match <- hilda_pers %>%
select(-comp_rule) %>%
spread(name, value) %>%
rename(p_year = year) %>%
full_join(hilda_match %>% mutate(SID = as.numeric(SID))) %>%
# physical health event
left_join(hilda_out %>% filter(name == "physhlthevnt") %>% select(p_year, SID, physhlthevnt = past) %>% distinct() %>% group_by(SID, p_year) %>% summarize(physhlthevnt = max(physhlthevnt)) %>% ungroup()) %>%
distinct()
hilda_match <- hilda_match %>%
group_by(SID, p_year) %>%
mutate(age = p_year - yearBrth) %>%
ungroup() %>%
rename(momOccPrstg = momJobPrstg, dadOccPrstg = dadJobPrstg)
hilda_dem <- hilda_match %>%
select(SID, religion, smokes, parDivorce, BMI, momEdu, dadEdu) %>%
gather(key= item, value = value, -SID, na.rm = T) %>%
group_by(SID, item) %>%
summarize(value = max(value, na.rm = T)) %>%
ungroup() %>%
spread(item, value) %>%
mutate(parDivorce = ifelse(is.na(parDivorce), 0, parDivorce))
hilda_match <- hilda_match %>%
select(-religion, -smokes, -parDivorce, -BMI, -momEdu, -dadEdu) %>%
full_join(hilda_dem)
hilda_out <- hilda_out %>%
select(-past, -future) %>%
distinct()
hilda_SCA <- hilda_match %>%
select(SID, p_year, one_of(c(unique(specifications$name), "momOccPrstg", "dadOccPrstg", "momEdu", "dadEdu"))) %>%
group_by(SID) %>%
mutate(parEdu = max(cbind(momEdu, dadEdu), na.rm = T),
parOccPrstg = max(cbind(momOccPrstg, dadOccPrstg), na.rm = T)) %>%
mutate_at(vars(parEdu, parOccPrstg), ~ifelse(is.infinite(.), NA, .)) %>%
ungroup() %>%
select(-momEdu, -dadEdu, -momOccPrstg, -dadOccPrstg)
unique(specifications$name)[!unique(specifications$name) %in% colnames(hilda_SCA_imp)]
2.5.6 Imputation
# short helper functions to (1) identify and create factors, save col names of factors, remove columns with all missing values, and setup amelia
factor_fun <- function(x){if(is.numeric(x)){diff(range(x, na.rm = T)) %in% 1:2 & length(unique(x)) <= 4} else{F}}
not_all_id <- function(x) if(is.numeric(x)) sd(x, na.rm = T) != 0
mice_fun <- function(df){
mice(df, m = 5, maxit=5, printFlag=TRUE)
}
hilda_match_imp <- hilda_match %>%
rename(NegAff = `NA`) %>%
group_by(SID, year) %>%
mutate(parOccPrstg = max(cbind(momOccPrstg, dadOccPrstg), na.rm = T)) %>%
select(-momOccPrstg, -dadOccPrstg) %>%
ungroup() %>%
mutate_all(~ifelse(is.infinite(.), NA, .)) %>%
filter(!is.na(year) & !is.na(SID)) %>%
group_by(year) %>%
nest() %>%
ungroup() %>%
mutate(data = map(data, ~(.) %>% select_if(not_all_na) %>% select_if(not_all_id)),
data = map(data, ~(.) %>% mutate_if(factor_fun, as.factor)),
imp = map(data, mice_fun))
beepr::beep(sound = 8)
hilda_match_imp <- hilda_match_imp %>%
mutate(imp_df = map(imp, ~mice::complete(., "long")),
imp_df = map(imp_df, ~(.) %>% mutate(PhysFunc = factor(ifelse(PhysFunc == 1, 0, 1)))),
imp_df = map(imp_df, ~(.) %>%
group_by(SID) %>%
mutate(parEdu = max(cbind(momEdu, dadEdu), na.rm = T)) %>%
select(-momEdu, -dadEdu) %>%
ungroup()),
imp_sca = map(imp_df, ~(.) %>% filter(.imp == 1) %>% select(SID, one_of(unique(specifications$name)))))
hilda_match_imp_long <- hilda_match_imp %>%
select(p_year, imp_df) %>%
unnest(imp_df)
hilda_SCA_imp <- hilda_match_imp %>%
select(p_year, imp_sca) %>%
unnest(imp_sca) %>%
left_join(hilda_SCA %>% select(p_year, SID,
colnames(hilda_SCA)[!colnames(hilda_SCA) %in% colnames(.)])) %>%
mutate(race = factor(0))
unique(specifications$name)[!unique(specifications$name) %in% colnames(hilda_SCA_imp)]
save(hilda_match_imp_long, hilda_SCA_imp,
file = sprintf("%s/data/imputed/hilda_imputed_small.RData", wd))
save(hilda_match_imp,
file = sprintf("%s/data/imputed/hilda_imputed.RData", wd))
save(hilda_alpha, hilda_pers, hilda_out, hilda_match, hilda_SCA,
file = sprintf("%s/data/clean/hilda_cleaned.RData", wd))
save(hilda_long, file = sprintf("%s/data/clean/hilda_raw_long.RData", wd))
rm(list =ls()[grepl("hilda", ls())])
2.6 HRS
The Health and Retirement Study (HRS; Juster & Suzman, 1995) is an ongoing longitudinal study of households in the United States. These data are available at https://hrs.isr.umich.edu by creating a free account.
Participants were recruited from more than 35,000 individuals from the financial households of individuals born between 1931 and 1941 in the US. Data have been collected biannually since 1992. The latest data release includes data up to 2016. On average, 10,000 individuals are sampled each wave More information on the HRS can be found at https://hrs.isr.umich.edu/documentation/survey-design, but, in short, the HRS is a nationally representative sample of adults over 50 in the US. It is critical to note that the HRS samples households of the original cohort and follows individuals and their spouses or partners until their death.
Sample size varies by year, ranging from approximately 7,500 (2014) to 15,500 (1992). (https://hrs.isr.umich.edu/sites/default/files/biblio/ResponseRates_2017.pdf). This provides 99/% power to detect a zero-order correlation effect size of ~.04, two-tailed at alpha .05.
2.6.1 Load Data
hrs_read_fun <- function(year) {
read_da <- function(da, dct, Year){
print(paste(da, dct, year, sep = " "))
data.file <- sprintf("%s/data/hrs/%s/%s", wd, Year, da)
# Set path to the dictionary file "*.DCT"
dict.file <- sprintf("%s/data/hrs/%s/%s", wd, Year, dct)
# Read the dictionary file
df.dict <- read.table(dict.file, skip = 1, fill = TRUE, stringsAsFactors = FALSE)
# Set column names for dictionary dataframe
colnames(df.dict) <- c("col.num","col.type","col.name","col.width","col.lbl")
# Remove last row which only contains a closing }
row <- which(df.dict$col.name == "HHID")
df.dict <- df.dict[-nrow(df.dict),]
if(row == 2){df.dict <- df.dict[-1,]}
# Extract numeric value from column width field
df.dict$col.width <- as.integer(sapply(df.dict$col.width, gsub, pattern = "[^0-9\\.]", replacement = ""))
# Convert column types to format to be used with read_fwf function
df.dict$col.type <- sapply(df.dict$col.type, function(x) ifelse(x %in% c("int","byte","long"), "i", ifelse(x == "float", "n", ifelse(x == "double", "d", "c"))))
# Read the data file into a dataframe
df <- read_fwf(file = data.file, fwf_widths(widths = df.dict$col.width, col_names = df.dict$col.name), col_types = paste(df.dict$col.type, collapse = ""))
# Add column labels to headers
attributes(df)$variable.labels <- df.dict$col.lbl
old.names <- (hrs_codebook %>% filter(year == Year))$orig_itemname
if(any(c("PN", "HHID") %in% colnames(df)) & any(old.names %in% colnames(df))){
# if(any(c("PN", "HHID") %in% colnames(df))){
df <- df %>%
mutate(hhidpn = 1000*as.numeric(HHID) + as.numeric(PN)) %>%
select(one_of(c("PN", "HHID")), one_of(old.names)) %>%
distinct()
# gather(key = item, value = value, -hhidpn)
} else {df <- NA}
return(df)
}
# Set path to the data file "*.DA"
files <- list.files(sprintf("%s/data/hrs/%s", wd, year))
df2 <- tibble(
da = files[grepl(".da", files) | grepl(".DA", files)],
dct = files[grepl(".dct", files) | grepl(".DCT", files)]
) %>%
mutate(data = map2(da, dct, possibly(~read_da(.x, .y, year), NA_real_))) %>%
filter(!is.na(data)) %>%
select(-da, -dct)
if(nrow(df2) != 0){df2$data %>% reduce(full_join) %>% distinct()} else {NA}
}
hrs_codebook <- (codebook %>% filter(study == "HRS"))$codebook[[1]] %>%
mutate(orig_itemname = str_to_upper(orig_itemname)) %>%
mutate_at(vars(orig_itemname, name, itemname), ~str_remove_all(., "[[:space:]]"))
hrs_codebook
## # A tibble: 2,293 x 19
## study dataset category name itemname wave waveletter year new_itemname orig_itemname description scale reverse_code recode mini maxi comp_rule
## <chr> <chr> <chr> <chr> <chr> <dbl> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <chr>
## 1 hrs <NA> match ageM… yearMar… 1 a 1992 hrs__match_… V241 Year First… "191… <NA> ifels… NA NA min
## 2 hrs <NA> match ageM… yearMar… 2 b 1993 hrs__match_… V156 Year First… "191… <NA> ifels… NA NA min
## 3 hrs <NA> match ageM… yearMar… 3 c 1994 hrs__match_… A3-V1 Year First… "191… <NA> ifels… NA NA min
## 4 hrs <NA> match ageM… yearMar… 4 d 1995 hrs__match_… D678 Year First… "191… <NA> ifels… NA NA min
## 5 hrs <NA> match ageM… yearMar… 5 e 1996 hrs__match_… E678 Year First… "191… <NA> ifels… NA NA min
## 6 hrs <NA> match ageM… yearMar… 6 f 1998 hrs__match_… F1073 Year First… "191… <NA> ifels… NA NA min
## 7 hrs <NA> match ageM… yearMar… 7 g 2000 hrs__match_… GB066_1 Year First… "191… <NA> ifels… NA NA min
## 8 hrs <NA> match ageM… yearMar… 8 h 2002 hrs__match_… HB066_1 Year First… "191… <NA> ifels… NA NA min
## 9 hrs <NA> match ageM… yearMar… 9 i 2004 hrs__match_… IB066_1 Year First… "191… <NA> ifels… NA NA min
## 10 hrs <NA> match ageM… yearMar… 10 j 2006 hrs__match_… JB066_1 Year First… "191… <NA> ifels… NA NA min
## # … with 2,283 more rows, and 2 more variables: ...18 <lgl>, ...19 <chr>
old.names <- unique(hrs_codebook$orig_itemname)
hrs.paq <- tibble(year = sprintf("%s/data/hrs", wd) %>% list.files(., pattern = "^[0-9]")) %>%
mutate(data = map(year, hrs_read_fun),
names = map(data, colnames)) %>%
filter(!is.na(data))
old.names <- unique((hrs_codebook %>% filter(dataset == "Rand"))$orig_itemname)
hrs.rand <- sprintf("%s/data/hrs/randhrs1992_2016v1.sav", wd) %>%
haven::read_sav(.) %>%
haven::zap_labels(.) %>%
select(SID = HHIDPN, one_of(old.names)) %>%
gather(key = orig_itemname, value = value, -SID, na.rm = T)
hrs_long <- hrs.paq %>%
mutate(data = map(data, ~(.) %>%
gather(key = orig_itemname, value = value, -HHID, -PN))) %>%
select(-names, -year) %>%
unnest(data) %>%
mutate(SID = 1000*as.numeric(HHID) + as.numeric(PN)) %>%
select(-PN, -HHID)
hrs.subs <- unique(hrs_long$SID)[unique(hrs_long$SID) %in% unique(hrs.rand$SID)]
hrs_long <- hrs_long %>%
bind_rows(hrs.rand %>% select(orig_itemname, value, SID)) %>%
filter(SID %in% hrs.subs)
save(hrs.rand, hrs.paq, file = sprintf("%s/data/clean/hrs_raw.RData", wd))
rm(list = c("hrs.paq", "hrs.rand"))
2.6.2 Matching Variables
hrs_waves <- p_waves %>% filter(Study == "HRS") %>% select(Used) %>% distinct()
# rename variables
hrs_match <- hrs_codebook %>%
filter(year <= max(hrs_waves$Used)) %>%
filter(grepl("match", category)) %>%
select(name, itemname, wave, year, orig_itemname, reverse_code:comp_rule) %>%
group_by(name) %>%
nest() %>%
ungroup() %>%
mutate(data = map(data, ~(.) %>% left_join(hrs_long)))
yrBrth <- hrs_match %>%
filter(name == "yearBrth") %>%
unnest(data) %>%
mutate(yearBrth = value) %>%
select(SID, yearBrth)
# recode
recode_fun <- function(rule, y, year){
yearBrth <- y$yearBrth
x <- y$value
if(!is.na(rule)){y$value <- eval(parse(text = rule))}
return(y)
}
hrs_match <- hrs_match %>%
filter(name != "yearBrth") %>%
mutate(data = map(data, ~(.) %>% left_join(yrBrth) %>%
group_by(recode, year) %>%
nest() %>%
ungroup() %>%
mutate(data = pmap(list(recode, data, year), recode_fun)) %>%
unnest(data) %>%
mutate(value = ifelse(value < 0 | is.nan(value) | is.infinite(value), NA, value))))
# reverse code
hrs_match <- hrs_match %>%
mutate(data = map(data, ~(.) %>%
mutate(value = ifelse(reverse_code == "no" | is.na(reverse_code), value,
reverse.code(-1, value, mini = mini, maxi = maxi)))))
fun_call <- function(x, rule){
switch(rule,
average = mean(x, na.rm = T),
mode = Mode(x)[1],
sum = sum(x, na.rm = T),
skip = unique(x)[1],
max = max(x, na.rm = T),
min = min(x, na.rm = T))
}
# compositing within years
year_comp_fun <- function(df, rule){
df %>%
# group by person and item (collapse across age)
group_by(SID, yearBrth, name, year) %>%
summarize(value = fun_call(value, rule)) %>%
ungroup() %>%
mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
}
hrs_match <- hrs_match %>%
unnest(data) %>%
mutate(comp_rule = ifelse(is.na(comp_rule), "skip", comp_rule)) %>%
group_by(comp_rule) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(data, comp_rule, year_comp_fun)) %>%
unnest(data)
comp_fun <- function(rule, p_year){
hrs_match %>%
filter(year <= p_year & comp_rule == rule) %>%
group_by(SID, yearBrth, name) %>%
summarize(value = fun_call(value, rule)) %>%
ungroup() %>%
distinct()
}
hrs_match <- crossing(
p_year = hrs_waves$Used,
comp_rule = unique(hrs_match$comp_rule)
) %>%
mutate(data = map2(comp_rule, p_year, comp_fun)) %>%
unnest(data) %>%
distinct() %>%
mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value)) %>%
select(-comp_rule) %>%
spread(name, value) %>%
filter(!is.na(SID))
2.6.3 Personality Variables
# keep correct personality waves
hrs_pers <- hrs_codebook %>% mutate(reverse_code = tolower(reverse_code)) %>%
select(-new_itemname) %>%
filter(category == "pers") %>%
left_join(hrs_long) %>%
mutate(year = mapvalues(year, seq(2006, 2016, 2), rep(c(2006, 2010, 2014), each = 2))) %>%
left_join(p_waves %>% filter(Study == "HRS") %>% select(name = p_item, Used)) %>%
filter(year %in% Used) %>%
distinct()
recode_fun <- function(rule, y){
x <- y$value
if(!is.na(rule)){y$value <- eval(parse(text = rule))}
return(y)
}
# recode
hrs_pers <- hrs_pers %>%
select(name, itemname, year, reverse_code:comp_rule, SID, Used, value) %>%
group_by(recode) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(recode, data, recode_fun)) %>%
unnest(data)
# reverse code
hrs_pers <- hrs_pers %>% distinct() %>%
mutate(value = ifelse(reverse_code == "no" | is.na(reverse_code), value, reverse.code(-1, value, mini = mini, maxi = maxi))) %>%
group_by(name, itemname, year, SID, comp_rule) %>%
summarize(value = mean(value, na.rm = T))
# alpha's
hrs_alpha <- hrs_pers %>%
filter(!is.na(value)) %>%
select(name, itemname, year, SID, value) %>%
group_by(name, year) %>%
nest() %>%
mutate(data = map(data, ~(.) %>% distinct() %>% pivot_wider(names_from = itemname, values_from = value, values_fn = list(mean))),
alpha = map(data, possibly(~psych::alpha((.) %>% select(-SID)), NA_real_)))
comp_fun <- function(df, rule){
df %>%
group_by(SID) %>%
summarize(value = fun_call(value, rule)) %>%
ungroup() %>%
mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
}
# create composites
hrs_pers <- hrs_pers %>%
group_by(name, comp_rule, year) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(data, comp_rule, comp_fun)) %>%
unnest()
## # A tibble: 314,336 x 5
## name year comp_rule SID value
## <chr> <dbl> <chr> <dbl> <dbl>
## 1 A 2006 average 3010 3
## 2 A 2006 average 3020 3
## 3 A 2006 average 10001010 4
## 4 A 2006 average 10003030 3.4
## 5 A 2006 average 10004010 3.4
## 6 A 2006 average 10004040 3.6
## 7 A 2006 average 10013010 2
## 8 A 2006 average 10013040 2.6
## 9 A 2006 average 10038010 3.4
## 10 A 2006 average 10038040 2.6
## # … with 314,326 more rows
2.6.4 Outcome Variables
hrs_out <- hrs_codebook %>%
select(-new_itemname) %>%
filter(category == "out" & year != 0) %>%
left_join(hrs_long) %>%
distinct() %>%
full_join(crossing(p_year = hrs_waves$Used, name = unique((.)$name)))
hrs_out_stp <- hrs_codebook %>%
select(-new_itemname, -description, scale) %>%
filter(category == "out" & year == 0) %>%
left_join(hrs_long) %>%
full_join(tibble(name = "mortality", SID = unique(hrs_long$SID),
recode = unique(.$recode))) %>%
full_join(crossing(p_year = hrs_waves$Used, name = unique((.)$name)))
# recode
recode_fun <- function(rule, y, p_year){
x <- y$value
if(!is.na(rule)){y$value <- eval(parse(text = rule))}
return(y)
}
hrs_out <- hrs_out %>%
select(name, itemname, year, reverse_code:comp_rule, value, SID, p_year) %>%
group_by(recode, p_year) %>%
nest() %>%
ungroup() %>%
mutate(data = pmap(list(recode, data, p_year), recode_fun)) %>%
unnest(data)
hrs_out_stp <- hrs_out_stp %>%
select(name, itemname, reverse_code:comp_rule, value, SID, p_year) %>%
mutate(value = ifelse(value > 2050 | value < 0 | is.na(value), 0, value)) %>%
group_by(recode, p_year) %>%
nest() %>%
ungroup() %>%
mutate(data = pmap(list(recode, data, p_year), recode_fun)) %>%
unnest(data)
# composite within years
# compositing within years
hrs_out <- hrs_out %>%
group_by(SID, name, year, p_year) %>%
summarize(value = max(value, na.rm = T)) %>%
ungroup() %>%
mutate(value = ifelse(is.nan(value) | is.infinite(value), NA, value))
# composite across years
comp_fun <- function(p_year){
hrs_out %>%
mutate(group = ifelse(year > p_year, "future", "past")) %>%
group_by(SID, name, group) %>%
summarize(value = max(value, na.rm = T)) %>%
ungroup() %>%
mutate(value = ifelse(is.infinite(value), NA, value)) %>%
pivot_wider(names_from = group, values_from = value) %>%
group_by(SID, name) %>%
mutate(value = ifelse(is.na(past) | (past == 0 & !is.na(future)), future,
ifelse(past == 0 & is.na(future), past,
ifelse(past == 1, NA, NA))))
}
hrs_out <- tibble(p_year = hrs_waves$Used) %>%
mutate(data = map(p_year, comp_fun)) %>%
unnest(data) %>%
full_join(hrs_out_stp %>% select(name, p_year, value, SID)) %>%
filter((!name %in% c("married", "edu")) & !is.na(SID))
## # A tibble: 774,366 x 6
## p_year SID name future past value
## <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 2006 1010 divorced 1 1 NA
## 2 2006 1010 mntlhlthevnt 1 1 NA
## 3 2006 1010 physhlthevnt 1 1 NA
## 4 2006 1010 retired NA 0 0
## 5 2006 1010 unemployed NA 0 0
## 6 2006 2010 divorced 0 0 0
## 7 2006 2010 mntlhlthevnt 1 1 NA
## 8 2006 2010 physhlthevnt 1 1 NA
## 9 2006 2010 retired 1 1 NA
## 10 2006 2010 unemployed 0 0 0
## # … with 774,356 more rows
2.6.5 Covariates
hrs_match <- hrs_pers %>%
select(-comp_rule) %>%
spread(name, value) %>%
full_join(hrs_match %>% rename(year = p_year)) %>%
# left_join(gsoep_bp %>% select(-momID, -dadID)) %>%
# physical health event
left_join(hrs_out %>% filter(name == "physhlthevnt") %>% select(year = p_year, SID, physhlthevnt = past) %>% distinct() %>% group_by(SID, year) %>% summarize(physhlthevnt = max(physhlthevnt)) %>% ungroup()) %>%
distinct()
hrs_match <- hrs_match %>%
group_by(SID, year) %>%
mutate(age = year - yearBrth) %>%
ungroup()
npb_fun <- function(l, u){
lb <- sort(c(l,u))[1]
ub <- sort(c(l,u))[2]
mean((tibble(occ00 = seq(lb,ub),
npb = mapvalues(occ00, npb$OCC00, npb$NPB, warn_missing = F)) %>%
mutate(npb = ifelse(npb > 100, NA, npb)))$npb, na.rm = T)[1]
}
hrs_npb_groups <-
tribble(
~hrsCat, ~occ80l, ~occ80u, ~occ90l, ~occ90u,
1, 003, 037, 001, 037,
2, 043, 235, 043, 235,
3, 243, 285, 243, 274,
4, 303, 389, 303, 389,
5, 403, 407, 403, 407,
6, 413, 427, 417, 427,
7, 433, 444, 433, 444,
8, 445, 447, 445, 447,
9, 448, 469, 448, 469,
10, 473, 499, 473, 498,
11, 503, 549, 503, 542,
12, 553, 617, 553, 617,
13, 633, 699, 628, 699,
14, 703, 799, 703, 799,
15, 803, 859, 803, 859,
16, 863, 889, 863, 889,
17, 900, 900, 900, 900
) %>%
mutate(occ00l = mapvalues(occ90l, occ90to00$OCC90, occ90to00$OCC00),
occ00u = mapvalues(occ90u, occ90to00$OCC90, occ90to00$OCC00)) %>%
group_by(hrsCat) %>%
mutate(npb = npb_fun(occ00l, occ00u))
hrs_match <- hrs_match %>%
select(SID, year, dadOcc) %>%
filter(!is.na(dadOcc)) %>%
mutate(dadOccPrstg = ifelse(dadOcc == 98, NA, dadOcc),
dadOccPrstg = mapvalues(dadOccPrstg, hrs_npb_groups$hrsCat, hrs_npb_groups$npb)) %>%
select(-dadOcc, -year) %>%
full_join(hrs_match)
hrs_out <- hrs_out %>%
select(-past, -future) %>%
distinct()
hrs_SCA <- hrs_match %>%
select(SID, p_year = year, one_of(c(unique(specifications$name), "momOccPrstg", "dadOccPrstg", "momEdu", "dadEdu"))) %>%
group_by(SID) %>%
mutate(parEdu = max(cbind(momEdu, dadEdu), na.rm = T)) %>%
mutate_at(vars(parEdu), ~ifelse(is.infinite(.), NA, .)) %>%
ungroup() %>%
rename(parOccPrstg = dadOccPrstg) %>%
select(-momEdu, -dadEdu)
unique(specifications$name)[!unique(specifications$name) %in% colnames(hrs_match2)]
## # A tibble: 128,334 x 20
## SID p_year age education gender grsWages race physhlthevnt SRhealth smokes alcohol exercise BMI married numKids parDivorce PhysFunc religion
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1.01e7 1992 51 0 1 34932 2 1 2 1 NA 0 48.4 1 4 0 0 1
## 2 1.01e7 1996 55 0 1 24137. 2 1 2 1 0 0.667 47.6 1 4 0 0 1
## 3 1.01e7 2006 65 0 1 20276. 2 1 1.75 1 0 1.75 44.1 1 4 0 1 1
## 4 1.03e7 1996 63 0 0 31800 0 1 4 0 2 0 37.2 1 3 0 0 2
## 5 1.03e7 1992 59 0 0 NA 0 1 NA NA NA NA NA NA 3 0 NA 2
## 6 1.03e7 2006 73 0 0 70108. 0 1 3.8 0 1.8 1.2 35.0 1 3 0 0 2
## 7 1.04e7 1992 54 0 1 139700 0 1 5 1 NA 0 27.4 1 2 0 0 1
## 8 1.04e7 1996 58 0 1 154033. 0 1 4.33 1 7 0.667 27.4 1 2 0 0 1
## 9 1.04e7 2006 68 0 1 306668. 0 1 3.5 1 7 1.88 27.7 1 2 0 0 1
## 10 1.04e7 1992 49 1 1 116000 0 1 4 1 NA 1 26.2 1 3 0 0 2
## # … with 128,324 more rows, and 2 more variables: parOccPrstg <dbl>, parEdu <dbl>
2.6.6 Imputation
# short helper functions to (1) identify and create factors, save col names of factors, remove columns with all missing values, and setup amelia
factor_fun <- function(x){if(is.numeric(x)){diff(range(x, na.rm = T)) %in% 1:2 & length(unique(x)) <= 4} else{F}}
not_all_id <- function(x) if(is.numeric(x)) sd(x, na.rm = T) != 0
sum_na <- function(x) sum(is.na(x))/length(x)*100
mice_fun <- function(df){
s <- unique(hrs_pers$SID)
df <- df %>% select(-ageMarried, -CurMarYears, -spouseEmployed, -weight, -numHospStays, -SRhealthChng) %>%
filter(SID %in% s)
pos <- apply(df, 1, sum_na) < 60
df <- df %>% filter(pos)
mice(df, m = 5, maxit=5, printFlag=TRUE, method = "cart")
}
hrs_match_imp <- hrs_match %>%
rename(NegAff = `NA`, parOccPrstg = dadOccPrstg) %>%
mutate_at(vars(eduMom, eduDad), ~ifelse(. >= 17 & !is.na(.), 2, ifelse(. >= 15, 1, 0))) %>%
group_by(SID, year) %>%
mutate(momEdu = max(cbind(momEdu, eduMom), na.rm = T),
dadEdu = max(cbind(dadEdu, eduDad), na.rm = T)) %>%
ungroup() %>%
select(-eduMom, -eduDad) %>%
mutate_all(~ifelse(is.infinite(.), NA, .)) %>%
filter(!is.na(year) & !is.na(SID)) %>%
group_by(year) %>%
nest() %>%
ungroup() %>%
mutate(data = map(data, ~(.) %>% select_if(not_all_na) %>% select_if(not_all_id)),
data = map(data, ~(.) %>% mutate_if(factor_fun, as.factor)),
imp = map(data, mice_fun))
hrs_match_imp <- hrs_match_imp %>%
mutate(imp_df = map(imp, ~mice::complete(., "long")),
imp_df = map(imp_df, ~(.) %>%
mutate_at(vars(exercise, PhysFunc), ~ifelse(as.numeric(as.character(.)) > 0, 1, 0)) %>%
mutate(PhysFunc = factor(PhysFunc),
satRetire = ifelse(is.factor(satRetire), as.numeric(as.character(satRetire)), satRetire))),
imp_df = map(imp_df, ~(.) %>%
group_by(SID) %>%
mutate(parEdu = max(cbind(momEdu, dadEdu), na.rm = T)) %>%
select(-momEdu, -dadEdu) %>%
ungroup()),
imp_sca = map(imp_df, ~(.) %>% filter(.imp == 1) %>% select(SID, one_of(unique(specifications$name)))))
hrs_match_imp_long <- hrs_match_imp %>%
select(p_year = year, imp_df) %>%
unnest(imp_df) %>%
mutate(alcohol = factor(ifelse(alcohol > 0, 1, 0)))
hrs_SCA_imp <- hrs_match_imp %>%
select(p_year = year, imp_sca) %>%
unnest(imp_sca) %>%
full_join(hrs_SCA %>% select(p_year = year, SID)) %>%
distinct() %>%
mutate(alcohol = factor(ifelse(alcohol > 0, 1, 0)))
hrs_SCA_imp <- hrs_SCA_imp %>%
filter(!is.na(alcohol)) %>%
select(SID, alcohol) %>%
distinct() %>%
full_join(hrs_SCA_imp %>% select(-alcohol)) %>%
filter(!is.na(age))
unique(specifications$name)[!unique(specifications$name) %in% colnames(hrs_SCA_imp)]
save(hrs_match_imp_long, hrs_SCA_imp,
file = sprintf("%s/data/imputed/hrs_imputed_small.RData", wd))
save(hrs_match_imp,
file = sprintf("%s/data/imputed/hrs_imputed.RData", wd))
save(hrs_alpha, hrs_pers, hrs_out, hrs_match, hrs_SCA,
file = sprintf("%s/data/clean/hrs_cleaned.RData", wd))
save(hrs_long, file = sprintf("%s/data/clean/hrs_raw_long.RData", wd))
rm(list =ls()[grepl("hrs", ls())])
2.7 LISS
The Longitudinal Studies for the Social sciences (LISS; Scherpenzeel, Das, Ester, & Kaczmirek, 2010) is an ongoing longitudinal study of households in the Netherlands. These data are online, through application, from https://statements.centerdata.nl/liss-panel-data-statement.
Participants were approximately 8,000 Dutch-speaking individuals permanently residing in the Netherlands from 5,000 households. Data have been collected annually since 2007. The latest data release includes 11 waves of data from 2008 to 2018. More documentation are available at https://www.dataarchive.lissdata.nl/study_units/view/1.
Sample sizes vary by year, ranging from 5,021 (2018) to 6808 (2008). This provides 99/% power to detect a correlation effect size of ~.04, two-tailed at alpha .05.
2.7.1 Load Data
liss_read_fun <- function(x){
sprintf("%s/data/liss/%s", wd, x) %>% haven::read_sav(.) %>% select(one_of(old.names))
}
## # A tibble: 3,688 x 18
## study dataset category name itemname wave wave_letter year new_itemname orig_itemname description scale reverse_code recode mini maxi comp_rule
## <chr> <chr> <chr> <chr> <chr> <dbl> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <chr>
## 1 liss avars_… match age YOB1 1 a 2008 <NA> gebjaar Year of Bi… nume… <NA> year … NA NA mode
## 2 liss avars_… match age YOB1 2 b 2009 <NA> gebjaar Year of Bi… nume… <NA> year … NA NA mode
## 3 liss avars_… match age YOB1 3 c 2010 <NA> gebjaar Year of Bi… nume… <NA> year … NA NA mode
## 4 liss avars_… match age YOB1 4 d 2011 <NA> gebjaar Year of Bi… nume… <NA> year … NA NA mode
## 5 liss avars_… match age YOB1 5 e 2012 <NA> gebjaar Year of Bi… nume… <NA> year … NA NA mode
## 6 liss avars_… match age YOB1 6 f 2013 <NA> gebjaar Year of Bi… nume… <NA> year … NA NA mode
## 7 liss avars_… match age YOB1 7 g 2014 <NA> gebjaar Year of Bi… nume… <NA> year … NA NA mode
## 8 liss avars_… match age YOB1 8 h 2015 <NA> gebjaar Year of Bi… nume… <NA> year … NA NA mode
## 9 liss avars_… match age YOB1 9 i 2016 <NA> gebjaar Year of Bi… nume… <NA> year … NA NA mode
## 10 liss avars_… match age YOB1 10 j 2017 <NA> gebjaar Year of Bi… nume… <NA> year … NA NA mode
## # … with 3,678 more rows, and 1 more variable: LHQ <chr>
old.names <- unique(liss_codebook$orig_itemname) %>% str_to_lower
datasets <- sprintf("%s/data/liss", wd) %>% list.files()
liss <- tibble(datasets = datasets) %>%
mutate(data = map(datasets, liss_read_fun))
liss <- reduce(liss$data, full_join) %>% haven::zap_labels(.)
save(liss, file = sprintf("%s/data/clean/liss_raw.RData", wd))
avars <- tibble(ds = datasets[grepl("avar", datasets)]) %>%
mutate(data = map(ds, ~sprintf("%s/data/liss/%s", wd, .) %>% haven::read_sav(.) %>% select(one_of(old.names)) %>% haven::zap_labels(.))) %>%
separate(ds, c("ds", "year", "scrap1", "scrap2"), sep = "_") %>%
separate(year, c("year", "month"), -2) %>%
select(year, month, data) %>%
unnest(data)
2.7.2 Matching Variables
rename_fun <- function(cb, var){
old.names <- unique((liss_codebook %>% filter(name == var))$orig_itemname)
df <- liss %>%
select(SID = nomem_encr, HHID = nohouse_encr, one_of(old.names)) %>%
gather(key = orig_itemname, value = value,
-SID, -HHID, na.rm=T)
if(length(old.names) > 1){
df %>% left_join(cb %>% select(itemname, year, orig_itemname, reverse_code:comp_rule))
} else {
df %>% left_join(cb %>% select(-(itemname:year), -new_itemname, -dataset) %>% distinct()) %>% mutate(year = 0)
}
}
# rename variables
liss_match <- liss_codebook %>%
filter(category == "match") %>%
group_by(name) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(data, name, rename_fun)) %>%
unnest(data) %>%
distinct()
# bring in year or birth for cleaning
liss_match <- liss_match %>%
filter(name != "age") %>%
left_join(
liss_match %>% filter(name == "age") %>%
mutate(yearBrth = value) %>%
select(SID, yearBrth)
) %>% distinct()
# recode
recode_fun <- function(rule, y, year){
yearBrth <- y$yearBrth
x <- y$value
if(!is.na(rule)){y$value <- eval(parse(text = rule))}
return(y)
}
liss_match <- liss_match %>%
group_by(recode, year) %>%
nest() %>%
ungroup() %>%
mutate(data = pmap(list(recode, data, year), recode_fun)) %>%
unnest(data)
# reverse code
liss_match <- liss_match %>%
mutate(value = ifelse(tolower(reverse_code) == "no" | is.na(reverse_code), value,
reverse.code(-1, value, mini = mini, maxi = maxi)))
fun_call <- function(x, rule){
switch(rule,
average = mean(x, na.rm = T),
mode = Mode(x)[1],
sum = sum(x, na.rm = T),
skip = unique(x)[1],
max = max(x, na.rm = T),
min = min(x, na.rm = T))
}
# compositing within years
year_comp_fun <- function(df, rule){
df %>%
# group by person and item (collapse across age)
group_by(SID, HHID, yearBrth, name, year) %>%
summarize(value = fun_call(value, rule)) %>%
ungroup() %>%
mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
}
liss_waves <- p_waves %>% filter(Study == "LISS") %>% select(Used) %>% distinct()
liss_match <- liss_match %>%
filter(year <= max(liss_waves$Used) |
name %in% c("dadEdu", "momEdu", "momOccPrstg", "dadOccPrstg")) %>%
group_by(comp_rule) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(data, comp_rule, year_comp_fun)) %>%
unnest(data)
comp_fun <- function(rule, p_year){
liss_match %>%
filter(year <= p_year & comp_rule == rule |
name %in% c("dadEdu", "momEdu", "momOccPrstg", "dadOccPrstg")) %>%
group_by(SID, HHID, yearBrth, name) %>%
summarize(value = fun_call(value, rule)) %>%
ungroup()
}
liss_match <- crossing(
p_year = liss_waves$Used,
comp_rule = unique(liss_match$comp_rule)
) %>%
mutate(data = map2(comp_rule, p_year, comp_fun)) %>%
unnest(data) %>%
mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value)) %>%
select(-comp_rule) %>%
pivot_wider(names_from = name, values_from = value, values_fn = list(value = max))
## # A tibble: 27,382 x 62
## SID HHID year A C DEP E IQ N `NA` O PA SE SS SWL yearBrth HHsize dadEdu dadOccPrstg momEdu momOccPrstg
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 800033 583404 2008 3 2 NA 2.2 NA 3.5 3.3 3.29 4.2 5.38 2 6 1991 4.5 NA NA NA NA
## 2 800042 500277 2008 3.5 4 2.67 3.2 NA 3 2.1 3.43 5.7 4.88 2.08 6.17 1975 4 0 7 0 8
## 3 800045 548654 2008 NA NA 1 NA NA NA NA NA NA NA NA NA 1942 2 NA NA NA NA
## 4 800057 580532 2008 2.83 2.67 1 4.2 NA 4.5 1.4 4.57 4.5 6.75 2 6.17 1975 4 0 6 1 NA
## 5 800076 578048 2008 3.17 3.5 2.33 3 4 3 2.5 3.14 5.6 4.5 1.3 6.5 1985 3 NA NA NA NA
## 6 800119 537783 2008 2.83 3.33 1.33 3.4 NA 3 NA 3.14 NA 5.75 NA 4.67 1950 4 0 9 NA NA
## 7 800125 582101 2008 4 4 4.33 2.4 NA 2 3.3 4.29 4.3 4.5 0 3.83 1958 1.5 NA NA NA NA
## 8 800134 549826 2008 4.67 2.67 1.67 4.8 NA 4 1.8 3.43 5.4 5.62 NA 5.83 1930 2 NA NA NA NA
## 9 800155 545016 2008 NA NA NA NA NA NA NA NA NA NA 1.2 NA 1955 1 NA NA NA NA
## 10 800158 519049 2008 3.67 3.67 1 2.4 NA 4 1.3 3 3.2 5.5 2 5.83 1969 7 NA NA NA NA
## # … with 27,372 more rows, and 41 more variables: eatingHabits <dbl>, loneliness <dbl>, physAct <dbl>, PhysFunc <dbl>, satFnc <dbl>, satLeisure <dbl>,
## # satSchl <dbl>, satWgs <dbl>, SRhealth <dbl>, survAtt <dbl>, exercise <dbl>, satFam <dbl>, satHH <dbl>, alcohol <dbl>, alcoholType <dbl>,
## # disability <dbl>, drugs <dbl>, drVisits <dbl>, employed <dbl>, ethnicity <dbl>, numSibling <dbl>, religion <dbl>, smokes <dbl>, unempBen <dbl>,
## # welfare <dbl>, gender <dbl>, urban <dbl>, ageMarried <dbl>, married <dbl>, dadAlive <dbl>, diet <dbl>, height <dbl>, momAlive <dbl>,
## # parDivorce <dbl>, weight <dbl>, grsWages <dbl>, numKids <dbl>, education <dbl>, physhlthevnt <dbl>, age <dbl>, BMI <dbl>
2.7.3 Personality Variables
liss_pers <- liss_codebook %>%
filter(category == "pers") %>%
group_by(name) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(data, name, rename_fun)) %>%
unnest(data) %>%
distinct() %>%
left_join(p_waves %>% filter(Study == "LISS") %>% select(name = p_item, Used)) %>%
filter(year %in% Used)
recode_fun <- function(rule, y){
x <- y$value
if(!is.na(rule)){y$value <- eval(parse(text = rule))}
return(y)
}
# recode
liss_pers <- liss_pers %>%
select(name:HHID, itemname, year, reverse_code:comp_rule, value) %>%
group_by(recode) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(recode, data, recode_fun)) %>%
unnest(data)
# reverse code
liss_pers <- liss_pers %>%
mutate(value = ifelse(tolower(reverse_code) == "no" | is.na(reverse_code), value,
reverse.code(-1, value, mini = mini, maxi = maxi)))
# alpha's
liss_alpha <- liss_pers %>%
select(name, itemname, year, SID, value) %>%
group_by(name, year) %>%
nest() %>%
mutate(data = map(data, ~(.) %>% pivot_wider(names_from = itemname, values_from = value)),
alpha = map(data, possibly(~psych::alpha((.) %>% select(-SID)), NA_real_)))
# create composites
liss_pers <- liss_pers %>%
group_by(SID, HHID, name, year) %>%
summarize(value = mean(value, na.rm = T))
2.7.4 Outcome Variables
rename_fun <- function(cb, var){
old.names <- unique((liss_codebook %>% filter(name == var))$orig_itemname)
df <- liss %>%
select(SID = nomem_encr, HHID = nohouse_encr, one_of(old.names)) %>%
gather(key = orig_itemname, value = value,
-SID, -HHID) %>%
left_join(cb %>% select(itemname, year, orig_itemname, reverse_code:comp_rule))
}
liss_out <- liss_codebook %>%
filter(category == "out") %>%
group_by(name) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(data, name, rename_fun)) %>%
unnest(data) %>%
distinct() %>%
full_join(crossing(p_year = liss_waves$Used, name = unique((.)$name)))
liss_out <- avars %>%
select(year, itemname = month, SID = nomem_encr, divorced = burgstat, unemployed = belbezig) %>%
gather(key = name, value = value, divorced, unemployed) %>%
distinct() %>%
mutate(year = as.numeric(year), p_year = 2008) %>%
left_join(liss_codebook %>% filter(name %in% c("divorced", "unemployed")) %>%
select(name, recode, reverse_code, mini, maxi) %>% distinct()) %>%
full_join(liss_out %>% filter(!(orig_itemname %in% c("burgstat", "belbezig"))))
# recode
recode_fun <- function(rule, y, p_year){
x <- y$value
if(!is.na(rule)){y$value <- eval(parse(text = rule))}
return(y)
}
liss_out <- liss_out %>%
select(year:maxi, p_year) %>%
# select(name:HHID, itemname, year, reverse_code:comp_rule, value, p_year) %>%
# filter(name %in% c("married", "mvInPrtnr")) %>%
group_by(recode, p_year) %>%
nest() %>%
ungroup() %>%
mutate(data = pmap(list(recode, data, p_year), recode_fun)) %>%
unnest(data) %>% distinct()
liss_waves <- p_waves %>% filter(Study == "LISS") %>% select(Used) %>% distinct()
liss_out <- liss_out %>%
# filter(year <= max(gsoep_waves$Used)) %>%
group_by(SID, name, year) %>%
summarize(value = max(value, na.rm = T)) %>%
ungroup() %>%
mutate(value = ifelse(is.nan(value)|is.infinite(value), NA, value))
# composite across years
comp_fun <- function(p_year){
liss_out %>%
mutate(group = ifelse(year > p_year, "future", "past")) %>%
group_by(SID, name, group) %>%
summarize(value = max(value, na.rm = T)) %>%
ungroup() %>%
mutate(value = ifelse(is.infinite(value), NA, value)) %>%
pivot_wider(names_from = group, values_from = value) %>%
group_by(SID, name) %>%
mutate(value = ifelse(is.na(past) | (past == 0 & !is.na(future)), future,
ifelse(past == 0 & is.na(future), past,
ifelse(past == 1, NA, NA))))
}
liss_out <- tibble(p_year = liss_waves$Used) %>%
mutate(data = map(p_year, comp_fun)) %>%
unnest(data)
2.7.5 Covariates
liss_match <- liss_pers %>%
spread(name, value) %>%
full_join(liss_match %>% rename(year = p_year)) %>%
# left_join(gsoep_bp %>% select(-momID, -dadID)) %>%
# physical health event
left_join(liss_out %>% filter(name == "physhlthevnt") %>% select(year = p_year, SID, physhlthevnt = past) %>% distinct() %>% group_by(SID, year) %>% summarize(physhlthevnt = max(physhlthevnt)) %>% ungroup()) %>%
distinct() %>%
ungroup()
liss_match <- liss_match %>%
group_by(SID, HHID, year) %>%
mutate(age = year - yearBrth,
BMI = weight / ((height/100)^2)) %>%
ungroup()
liss_out <- liss_out %>%
select(-past, -future) %>%
distinct()
liss_SCA <- liss_match %>%
select(SID = SID, p_year = year, race = ethnicity, one_of(c(unique(specifications$name), "momOccPrstg", "dadOccPrstg", "momEdu", "dadEdu"))) %>%
group_by(SID) %>%
mutate(parEdu = max(cbind(momEdu, dadEdu), na.rm = T),
parOccPrstg = max(cbind(momOccPrstg, dadOccPrstg), na.rm = T)) %>%
mutate_at(vars(parEdu, parOccPrstg), ~ifelse(is.infinite(.), NA, .)) %>%
ungroup() %>%
select(-momEdu, -dadEdu, -momOccPrstg, -dadOccPrstg)
unique(specifications$name)[!unique(specifications$name) %in% colnames(liss_match2)]
2.7.6 Imputation
# short helper functions to (1) identify and create factors, save col names of factors, remove columns with all missing values, and setup amelia
factor_fun <- function(x){if(is.numeric(x)){diff(range(x, na.rm = T)) %in% 1:2 & length(unique(x)) <= 4} else{F}}
not_all_id <- function(x) if(is.numeric(x)) sd(x, na.rm = T) != 0
mice_fun <- function(df){
mice(df, m = 5, maxit=5, printFlag=TRUE, method = "cart")
}
liss_match_imp <- liss_match %>%
rename(NegAff = `NA`, race = ethnicity) %>%
group_by(SID, year) %>%
mutate(parOccPrstg = max(cbind(momOccPrstg, dadOccPrstg), na.rm = T)) %>%
select(-momOccPrstg, -dadOccPrstg, -HHID) %>%
ungroup() %>%
mutate_all(~ifelse(is.infinite(.), NA, .)) %>%
filter(!is.na(year) & !is.na(SID)) %>%
group_by(year) %>%
nest() %>%
ungroup() %>%
mutate(data = map(data, ~(.) %>% select_if(not_all_na) %>% select_if(not_all_id)),
data = map(data, ~(.) %>% mutate_if(factor_fun, as.factor)),
imp = map(data, mice_fun))
beepr::beep(sound = 8)
liss_match_imp <- liss_match_imp %>%
mutate(imp_df = map(imp, ~mice::complete(., "long")),
imp_df = map(imp_df, ~(.) %>% mutate(PhysFunc = factor(ceiling(PhysFunc)))),
imp_df = map(imp_df, ~(.) %>%
group_by(SID) %>%
mutate(parEdu = max(cbind(momEdu, dadEdu), na.rm = T)) %>%
select(-momEdu, -dadEdu) %>%
ungroup()),
imp_sca = map(imp_df, ~(.) %>% filter(.imp == 1) %>% select(SID, one_of(unique(specifications$name)))))
liss_match_imp_long <- liss_match_imp %>%
select(p_year = year, imp_df) %>%
unnest(imp_df)
liss_SCA_imp <- liss_match_imp %>%
select(p_year = year, imp_sca) %>%
unnest(imp_sca) %>%
left_join(liss_SCA %>% select(p_year=year, SID,
colnames(liss_SCA)[!colnames(liss_SCA) %in% colnames(.)]))
save(liss_match_imp_long, liss_SCA_imp,
file = sprintf("%s/data/imputed/liss_imputed_small.RData", wd))
save(liss_match_imp,
file = sprintf("%s/data/imputed/liss_imputed.RData", wd))
save(liss_alpha, liss_pers, liss_out, liss_match, avars, liss_SCA,
file = sprintf("%s/data/clean/liss_cleaned.RData", wd))
rm(list =ls()[grepl("liss", ls())])
2.8 MIDUS
The Midlife in the United States (MIDUS; Brim, Ryff, & Kessler, 2004; Ryff et al., 2012, 2016) study is an ongoing longitudinal study of adults in the United States. These data are available at http://www.icpsr.umich.edu by making a free account.
Participants included more than 10,000 individuals aged 25 or older from the United States. The present study uses data from MIDUS I, II, and III. MIDUS I was collected in 1995-1996. MIDUS II was the follow-up to MIDUS I and was collected from 2004-2006. MIDUS III was an additional follow-up conducted from 2013-2014. More information can be found at http://midus.wisc.edu/findings/Understanding_Data_Collection_in_MIDUS.pdf.
Sample size varies by wave, with 7,108 (MIDUS I), 4,963 (MIDUS II), 3,294 (MIDUS III). This provides 99/% power to detect a zero-order correlation effect size of ~.06, two-tailed at alpha .05.
2.8.1 Load Data
loadRData <- function(fileName){
#loads an RData file, and returns it
load(fileName)
get(ls()[ls() != "fileName"])
}
midus_read_fun <- function(x){
sprintf("%s/data/midus/%s", wd, x) %>% loadRData(.) %>%
select(one_of(old.names)) %>%
tbl_df
}
## # A tibble: 1,051 x 16
## study dataset category name itemname wave year new_itemname orig_itemname description scale reverse_code recode mini maxi comp_rule
## <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <chr>
## 1 midus M2P1 match achvm… hghStndr… M2P1 2004 midus__match… B1SE7FF Set very high… "1\tTRUE OF… no ifelse… 1 4 <NA>
## 2 midus M3P1 match achvm… hghStndr… M3P1 2013 midus__match… C1SE7FF Set very high… "-1\tRESPON… no ifelse… 1 4 <NA>
## 3 midus M2P1 match achvm… keepWrki… M2P1 2004 midus__match… B1SE7L Keep working … "1\tTRUE OF… no ifelse… 1 4 <NA>
## 4 midus M3P1 match achvm… keepWrki… M3P1 2013 midus__match… C1SE7L Keep working … "-1\tRESPON… no ifelse… 1 4 <NA>
## 5 midus M2P1 match achvm… likeDiff M2P1 2004 midus__match… B1SE7O I like to try… "1\tTRUE OF… no ifelse… 1 4 <NA>
## 6 midus M3P1 match achvm… likeDiff M3P1 2013 midus__match… C1SE7O I like to try… "-1\tRESPON… no ifelse… 1 4 <NA>
## 7 midus M2P1 match achvm… likeWrk M2P1 2004 midus__match… B1SE7R I like hard w… "1\tTRUE OF… no ifelse… 1 4 <NA>
## 8 midus M3P1 match achvm… likeWrk M3P1 2013 midus__match… C1SE7R I like hard w… "-1\tRESPON… no ifelse… 1 4 <NA>
## 9 midus M1P1 match age age M1P1 1994 midus__match… A1PAGE_M2 M1 age comput… "numeric" <NA> <NA> NA NA <NA>
## 10 midus M2P1 match age age M2P1 2004 midus__match… B1PAGE_M2 M1 age comput… "numeric" <NA> <NA> NA NA <NA>
## # … with 1,041 more rows
old.names <- unique(midus_codebook$orig_itemname) %>% str_to_upper
datasets <- sprintf("%s/data/midus", wd) %>% list.files(., pattern = ".rda")
midus <- tibble(datasets = datasets) %>%
mutate(data = map(datasets, midus_read_fun),
ncol = map_dbl(data, ncol)) %>%
filter(ncol != 0)
midus <- reduce(midus$data, full_join) %>% haven::zap_labels(.)
midus <- midus %>%
mutate_if(is.factor, ~as.numeric(sub("^\\(0*([0-9]+)\\).+$", "\\1", .)))
save(midus, file = sprintf("%s/data/clean/midus_raw.RData", wd))
2.8.2 Matching Variables
rename_fun <- function(cb, var){
old.names <- unique((midus_codebook %>% filter(name == var))$orig_itemname)
df <- midus %>%
select(SID = M2ID, one_of(old.names)) %>%
gather(key = orig_itemname, value = value, -SID) %>%
mutate(value = as.numeric(value))
if(length(old.names) > 1){
df %>% left_join(cb %>% select(itemname, year, orig_itemname, reverse_code:comp_rule))
} else {
df %>% left_join(cb %>% select(-(itemname:year), -new_itemname, -dataset) %>% distinct()) %>% mutate(year = 0)
}
}
# rename variables
midus_match <- midus_codebook %>%
filter(category == "match") %>%
group_by(name) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(data, name, rename_fun)) %>%
unnest(data) %>%
distinct()
# bring in year or birth for cleaning
yrBrth <- midus_match %>%
filter(name == "age") %>%
mutate(yearBrth = year - value) %>%
group_by(SID) %>%
summarize(yearBrth = max(yearBrth, na.rm = T)) %>%
ungroup() %>%
mutate(yearBrth = ifelse(is.infinite(yearBrth), NA, yearBrth))
# recode
recode_fun <- function(rule, y, year){
yearBrth <- y$yearBrth
x <- y$value
if(!is.na(rule)){y$value <- eval(parse(text = rule))}
return(y)
}
midus_match <- midus_match %>%
left_join(yrBrth) %>%
group_by(recode, year) %>%
nest() %>%
ungroup() %>%
mutate(data = pmap(list(recode, data, year), recode_fun)) %>%
unnest(data)
# reverse code
midus_match <- midus_match %>%
mutate(value = ifelse(reverse_code == "no" | is.na(reverse_code), value,
reverse.code(-1, value, mini = mini, maxi = maxi)))
fun_call <- function(x, rule){
switch(rule,
average = mean(x, na.rm = T),
mode = Mode(x)[1],
sum = sum(x, na.rm = T),
skip = unique(x)[1],
max = max(x, na.rm = T),
min = min(x, na.rm = T))
}
# compositing within years
year_comp_fun <- function(df, rule){
df %>%
# group by person and item (collapse across age)
group_by(SID, yearBrth, name, year) %>%
summarize(value = fun_call(value, rule)) %>%
ungroup() %>%
mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
}
midus_waves <- p_waves %>% filter(Study == "MIDUS") %>% select(Used) %>% distinct()
midus_match <- midus_match %>%
filter(year <= max(midus_waves$Used)) %>%
mutate(comp_rule = ifelse(is.na(comp_rule) | comp_rule == "none", "skip", comp_rule)) %>%
group_by(comp_rule) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(data, comp_rule, year_comp_fun)) %>%
unnest(data)
comp_fun <- function(rule, p_year){
midus_match %>%
filter(year <= p_year & comp_rule == rule) %>%
group_by(SID, yearBrth, name) %>%
summarize(value = fun_call(value, rule)) %>%
ungroup()
}
midus_match <- crossing(
p_year = midus_waves$Used,
comp_rule = unique(midus_match$comp_rule)
) %>%
mutate(data = map2(comp_rule, p_year, comp_fun)) %>%
unnest(data) %>%
mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value)) %>%
select(-comp_rule) %>%
pivot_wider(names_from = name, values_from = value, values_fn = list(value = max))
2.8.3 Personality Variables
midus_pers <- midus_codebook %>%
filter(category == "pers") %>%
group_by(name) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(data, name, rename_fun)) %>%
unnest(data) %>%
distinct() %>%
left_join(p_waves %>% filter(Study == "MIDUS") %>% select(name = p_item, Used)) %>%
filter(year %in% Used & !is.na(value))
recode_fun <- function(rule, y){
x <- y$value
if(!is.na(rule)){y$value <- eval(parse(text = rule))}
return(y)
}
# recode
midus_pers <- midus_pers %>%
select(name, SID, itemname, year, reverse_code:comp_rule, value) %>%
group_by(recode) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(recode, data, recode_fun)) %>%
unnest(data)
# reverse code
midus_pers <- midus_pers %>%
mutate(value = ifelse(tolower(reverse_code) == "no" | is.na(reverse_code), value,
reverse.code(-1, value, mini = mini, maxi = maxi)))
# alpha's
midus_alpha <- midus_pers %>%
select(name, itemname, year, SID, value) %>%
group_by(name, year) %>%
nest() %>%
mutate(data = map(data, ~(.) %>% pivot_wider(names_from = itemname, values_from = value)),
alpha = map(data, possibly(~psych::alpha((.) %>% select(-SID)), NA_real_)))
# create composites
midus_pers <- midus_pers %>%
group_by(SID, name, year) %>%
summarize(value = mean(value, na.rm = T))
## # A tibble: 118,712 x 4
## # Groups: SID, name [78,545]
## SID name year value
## <dbl> <chr> <dbl> <dbl>
## 1 10001 A 1994 3
## 2 10001 A 2004 2.6
## 3 10001 C 1994 2.75
## 4 10001 C 2004 3.25
## 5 10001 E 1994 2.8
## 6 10001 E 2004 2.8
## 7 10001 LOC 1994 5.5
## 8 10001 LOC 2004 4.75
## 9 10001 N 1994 2
## 10 10001 N 2004 2.25
## # … with 118,702 more rows
2.8.4 Outcome Variables
rename_fun <- function(cb, var){
old.names <- unique((midus_codebook %>% filter(name == var))$orig_itemname)
df <- midus %>%
select(SID = M2ID, one_of(old.names)) %>%
gather(key = orig_itemname, value = value, -SID) %>%
left_join(cb %>% select(itemname, year, orig_itemname, reverse_code:comp_rule))
}
midus_out <- midus_codebook %>%
filter(category == "out") %>%
group_by(name) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(data, name, rename_fun)) %>%
unnest(data) %>%
distinct() %>%
full_join(crossing(p_year = midus_waves$Used, name = unique((.)$name)))
# recode
recode_fun <- function(rule, y, p_year, year){
yearBrth <- y$yearBrth
x <- y$value
if(!is.na(rule)){y$value <- eval(parse(text = rule))}
return(y)
}
midus_out <- midus_out %>%
select(name, SID, value:year, recode, p_year) %>%
left_join(yrBrth) %>%
group_by(recode, p_year, year) %>%
nest() %>%
ungroup() %>%
mutate(data = pmap(list(recode, data, p_year, year), recode_fun)) %>%
unnest(data) %>% distinct()
midus_out <- midus_out %>% filter(name != "mvInPrtnr") %>%
select(-recode) %>%
full_join(
midus_out %>% filter(name %in% c("mvInPrtnr", "married")) %>%
group_by(p_year, name, SID, year) %>%
summarize(value = max(value, na.rm = T)) %>%
ungroup() %>%
mutate(value = ifelse(is.nan(value)|is.infinite(value), NA, value)) %>%
spread(name, value) %>%
mutate(name = "mvInPrtnr",
value = ifelse((married == 0 | is.na(married)) & mvInPrtnr == 1
& !is.na(mvInPrtnr), 1, 0)) %>%
select(-married, -mvInPrtnr)
)
midus_waves <- p_waves %>% filter(Study == "MIDUS") %>% select(Used) %>% distinct()
midus_out <- midus_out %>%
# filter(year <= max(gsoep_waves$Used)) %>%
group_by(SID, name, year) %>%
summarize(value = max(value, na.rm = T)) %>%
ungroup() %>%
mutate(value = ifelse(is.nan(value)|is.infinite(value), NA, value))
# composite across years
comp_fun <- function(p_year){
midus_out %>%
mutate(group = ifelse(year > p_year, "future", "past")) %>%
group_by(SID, name, group) %>%
summarize(value = max(value, na.rm = T)) %>%
ungroup() %>%
mutate(value = ifelse(is.infinite(value), NA, value)) %>%
pivot_wider(names_from = group, values_from = value) %>%
group_by(SID, name) %>%
mutate(value = ifelse(is.na(past) | (past == 0 & !is.na(future)), future,
ifelse(past == 0 & is.na(future), past,
ifelse(past == 1, NA, NA))))
}
midus_out <- tibble(p_year = midus_waves$Used) %>%
mutate(data = map(p_year, comp_fun)) %>%
unnest(data)
## # A tibble: 202,405 x 4
## p_year SID name value
## <dbl> <dbl> <chr> <dbl>
## 1 1994 10001 chldbrth NA
## 2 1994 10001 crim 0
## 3 1994 10001 divorced NA
## 4 1994 10001 edu NA
## 5 1994 10001 frstjob NA
## 6 1994 10001 married NA
## 7 1994 10001 mntlhlthevnt NA
## 8 1994 10001 mortality 0
## 9 1994 10001 mvInPrtnr 0
## 10 1994 10001 physhlthevnt 1
## # … with 202,395 more rows
2.8.5 Covariates
midus_match <- midus_pers %>%
spread(name, value) %>%
full_join(midus_match %>% rename(year = p_year)) %>%
# physical health event
left_join(midus_out %>% filter(name == "physhlthevnt") %>% select(year = p_year, SID, physhlthevnt = past) %>% distinct() %>% group_by(SID, year) %>% summarize(physhlthevnt = max(physhlthevnt)) %>% ungroup()) %>%
distinct() %>%
ungroup()
midus_out <- midus_out %>%
select(-past, -future) %>%
distinct()
midus_SCA <- midus_match %>%
select(SID, p_year = year, one_of(c(unique(specifications$name), "momOccPrstg", "dadOccPrstg", "momEdu", "dadEdu"))) %>%
group_by(SID) %>%
mutate(parEdu = max(cbind(momEdu, dadEdu), na.rm = T),
parOccPrstg = max(cbind(momOccPrstg, dadOccPrstg), na.rm = T)) %>%
mutate_at(vars(parEdu, parOccPrstg), ~ifelse(is.infinite(.), NA, .)) %>%
ungroup() %>%
select(-momEdu, -dadEdu, -momOccPrstg, -dadOccPrstg)
unique(specifications$name)[!unique(specifications$name) %in% colnames(midus_match)]
## # A tibble: 15,184 x 20
## SID p_year age education gender grsWages race physhlthevnt SRhealth smokes alcohol exercise BMI married numKids parDivorce PhysFunc religion
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <dbl>
## 1 10001 1994 53 1 0 300000 0 0 7 1 1 1 26.5 1 3 NA 0 2
## 2 10001 2004 53 1 0 300000 0 1 7 1 1 1 26.4 1 3 NA 0 2
## 3 10002 1994 60 2 0 NA NA 0 NA NA 1 NA NA 1 2 NA <NA> NA
## 4 10002 2004 60 2 0 235500 NA 1 8 NA 1 2.4 24.1 1 2 NA 0 1
## 5 10004 1994 69 2 0 54000 0 0 7 1 0 4.5 21.4 1 6 NA 0 1
## 6 10005 1994 70 0 1 27500 0 1 9 0 0 3.5 26.5 1 2 NA 1 0
## 7 10005 2004 70 0 1 13750 0 1 8.5 0 0 3.25 26.5 1 2 NA 1 0
## 8 10006 1994 51 1 1 34000 0 1 8 0 0 3.5 26.9 1 2 NA 1 1
## 9 10006 2004 51 1 1 34000 0 1 8 1 0 3.5 26.9 1 2 NA 1 1
## 10 10007 1994 35 0 1 46500 1 0 7 1 0 3.5 24.4 1 2 1 0 1
## # … with 15,174 more rows, and 2 more variables: parEdu <dbl>, parOccPrstg <dbl>
2.8.6 Imputation
# short helper functions to (1) identify and create factors, save col names of factors, remove columns with all missing values, and setup amelia
factor_fun <- function(x){if(is.numeric(x)){diff(range(x, na.rm = T)) %in% 1:2 & length(unique(x)) <= 4} else{F}}
not_all_id <- function(x) if(is.numeric(x)) sd(x, na.rm = T) != 0
mice_fun <- function(df){
mice(df, m = 5, maxit=5, printFlag=TRUE)
}
midus_match_imp <- midus_match %>%
rename(NegAff = `NA`) %>%
group_by(SID, year) %>%
mutate(parOccPrstg = max(cbind(momOccPrstg, dadOccPrstg), na.rm = T)) %>%
select(-momOccPrstg, -dadOccPrstg) %>%
ungroup() %>%
mutate_all(~ifelse(is.infinite(.), NA, .)) %>%
filter(!is.na(year) & !is.na(SID)) %>%
group_by(year) %>%
nest() %>%
ungroup() %>%
mutate(data = map(data, ~(.) %>% select_if(not_all_na) %>% select_if(not_all_id)),
data = map(data, ~(.) %>% mutate_if(factor_fun, as.factor)),
imp = map(data, mice_fun))
beepr::beep(sound = 8)
midus_match_imp <- midus_match_imp %>%
mutate(imp_df = map(imp, ~mice::complete(., "long")),
imp_df = map(imp_df, ~(.) %>% mutate(PhysFunc = factor(ifelse(PhysFunc < 2, 1, 0)))),
imp_df = map(imp_df, ~(.) %>%
group_by(SID) %>%
mutate(parEdu = max(cbind(momEdu, dadEdu), na.rm = T)) %>%
select(-momEdu, -dadEdu) %>%
ungroup()),
imp_sca = map(imp_df, ~(.) %>% filter(.imp == 1) %>% select(SID, one_of(unique(specifications$name)))))
midus_match_imp_long <- midus_match_imp %>%
select(p_year = year, imp_df) %>%
unnest(imp_df)
midus_SCA_imp <- midus_match_imp %>%
select(p_year = year, imp_sca) %>%
unnest(imp_sca) %>%
left_join(midus_SCA %>% select(p_year, SID,
colnames(midus_SCA)[!colnames(midus_SCA) %in% colnames(.)]))
unique(specifications$name)[!unique(specifications$name) %in% colnames(midus_SCA_imp)]
save(midus_match_imp_long, midus_SCA_imp,
file = sprintf("%s/data/imputed/midus_imputed_small.RData", wd))
save(midus_match_imp,
file = sprintf("%s/data/imputed/midus_imputed.RData", wd))
save(midus_alpha, midus_pers, midus_out, midus_match, midus_SCA,
file = sprintf("%s/data/clean/midus_cleaned.RData", wd))
rm(list =ls()[grepl("midus", ls())])
2.9 NLSY
The Children to Young Adults Study (CNLSY; Bureau of Labor Statistics, 2017) is an offshoot study of the National Longitudinal Study of Youth (NLSY79), which is an ongoing longitudinal, nationally representative study in the United States. These data are available on the National Bureau of Labour Statistics website dedicated to the NLSY studies by creating a free account (https://www.nlsinfo.org/investigator/pages/login).
Participants included more than 12,500 individuals in the United States that began in 1979. The CNLSY includes the biological children of the NLSY79 participants and began in 1986. Children (10 years and older) completed separate inventories from children (or “young adults”) aged 15 and above. Mothers of children 10 and below also completed surveys on the children prior to age 10. All participants were interviewed in addition to surveys.
Sample sizes vary by year, ranging from approximately 1,331 (1979) to 11,530 (2016). This provides 99/% power to detect a zero-order correlation effect size of ~.05.
2.9.1 Load Data
## # A tibble: 5,339 x 21
## study dataset category name itemname year new_itemname orig_itemname QNAME description scale reverse_code recode mini maxi comp_rule item_rule
## <chr> <lgl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <chr> <chr>
## 1 nlsy NA match acti… Activit… 2002 nlsy__match… Y1263100 Q4-3… DOES R BEL… 1 = … <NA> ifels… NA NA max max
## 2 nlsy NA match acti… Activit… 2004 nlsy__match… Y1496800 Q4-3… DOES R BEL… 1 = … <NA> ifels… NA NA max max
## 3 nlsy NA match acti… Activit… 2006 nlsy__match… Y1746700 Q4-3… DOES R BEL… 1 = … <NA> ifels… NA NA max max
## 4 nlsy NA match acti… Activit… 2008 nlsy__match… Y2027500 Q4-3… DOES R BEL… 1 = … <NA> ifels… NA NA max max
## 5 nlsy NA match acti… Activit… 2010 nlsy__match… Y2352600 Q4-3… DOES R BEL… 1 = … <NA> ifels… NA NA max max
## 6 nlsy NA match acti… Activit… 2012 nlsy__match… Y2682100 Q4-3… DOES R BEL… 1 = … <NA> ifels… NA NA max max
## 7 nlsy NA match acti… Activit… 2014 nlsy__match… Y3037500 Q4-3… DOES R BEL… 1 = … <NA> ifels… NA NA max max
## 8 nlsy NA match acti… Outside… 2002 nlsy__match… Y1263200 Q4-3… DOES R BEL… 1 = … <NA> ifels… NA NA max max
## 9 nlsy NA match acti… Outside… 2004 nlsy__match… Y1496900 Q4-3… DOES R BEL… 1 = … <NA> ifels… NA NA max max
## 10 nlsy NA match acti… Outside… 2006 nlsy__match… Y1746800 Q4-3… DOES R BEL… 1 = … <NA> ifels… NA NA max max
## # … with 5,329 more rows, and 4 more variables: Include <lgl>, ...19 <lgl>, ...20 <lgl>, ...21 <lgl>
## # A tibble: 88 x 18
## study dataset category name itemname year new_itemname orig_itemname QNAME description scale reverse_code recode mini maxi comp_rule item_rule
## <chr> <lgl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <lgl> <lgl> <chr> <chr>
## 1 nlsy NA match momO… momOcc 1982 <NA> R0828000 CPSO… OCCUPATION… <NA> no ifels… NA NA max max
## 2 nlsy NA match momO… momOcc 1983 <NA> R0945300 CPSO… OCCUPATION… <NA> <NA> ifels… NA NA max max
## 3 nlsy NA match momO… momOcc 1984 <NA> R1255700 CPSO… OCCUPATION… <NA> <NA> ifels… NA NA max max
## 4 nlsy NA match momO… momOcc 1985 <NA> R1650500 CPSO… OCCUPATION… <NA> <NA> ifels… NA NA max max
## 5 nlsy NA match momO… momOcc 1986 <NA> R1923100 CPSO… OCCUPATION… <NA> <NA> ifels… NA NA max max
## 6 nlsy NA match momO… momOcc 1987 <NA> R2317900 CPSO… OCCUPATION… <NA> <NA> ifels… NA NA max max
## 7 nlsy NA match momO… momOcc 1988 <NA> R2525700 CPSO… OCCUPATION… <NA> <NA> ifels… NA NA max max
## 8 nlsy NA match momO… momOcc 1989 <NA> R2924700 CPSO… OCCUPATION… <NA> <NA> ifels… NA NA max max
## 9 nlsy NA match momO… momOcc 1990 <NA> R3127400 CPSO… OCCUPATION… <NA> <NA> ifels… NA NA max max
## 10 nlsy NA match momO… momOcc 1991 <NA> R3523100 CPSO… OCCUPATION… <NA> <NA> ifels… NA NA max max
## # … with 78 more rows, and 1 more variable: Include <lgl>
# CNLSY codebook
cnlsy_codebook %>% select(QNAME, year) %>%
write_delim(path = sprintf("%s/codebooks/meta.CHILDYA", wd), delim = ",", col_names = F)
# NLSY codebook
nlsy_codebook %>% select(orig_itemname) %>%
write_delim(path = sprintf("%s/codebooks/meta.NLSY79", wd), delim = ",", col_names = F)
cnlsy_codebook <- cnlsy_codebook %>% full_join(nlsy_codebook)
nlsy <- sprintf("%s/data/nlsy/cnlsy.csv", wd) %>% read_csv
mnlsy <- sprintf("%s/data/nlsy/nlsy79.csv", wd) %>% read_csv
nlsy <- nlsy %>% left_join(mnlsy %>% rename(C0000200 = R0000100))
rm(mnlsy)
2.9.2 Matching Variables
rename_fun <- function(cb, var){
old.names <- unique((cnlsy_codebook %>% filter(name == var))$orig_itemname)
df <- nlsy %>%
select(SID = C0000100, yearBrth = C0005700, one_of(old.names)) %>%
gather(key = orig_itemname, value = value, -yearBrth,
-SID, na.rm=T)
if(length(old.names) > 1){
df %>% left_join(cb %>% select(itemname, year, orig_itemname, reverse_code:item_rule))
} else {
df %>% left_join(cb %>% select(-(itemname:year), -new_itemname, -dataset) %>% distinct()) %>% mutate(year = "0")
}
}
# rename variables
nlsy_match <- cnlsy_codebook %>%
filter(category == "match" & year != "XRND") %>%
group_by(name) %>%
nest() %>%
ungroup() %>%
filter(name != "yearBrth") %>%
mutate(data = map2(data, name, rename_fun)) %>%
unnest(data) %>%
distinct()
# single time point variables
old.names <- (cnlsy_codebook %>% filter(category == "match" & year == "XRND"))$orig_itemname
nlsy_match_stp <- nlsy %>%
select(SID = C0000100, yearBrth = C0005700, one_of(old.names)) %>%
gather(key = orig_itemname, value = value, -SID, -yearBrth, na.rm=T) %>%
left_join(cnlsy_codebook %>% select(name:year, orig_itemname, recode, comp_rule, item_rule))
# recode
recode_fun <- function(rule, y, p_year){
yearBrth <- y$yearBrth
x <- y$value
if(!is.na(rule)){y$value <- eval(parse(text = rule))}
return(y)
}
nlsy_match <- nlsy_match %>%
group_by(recode, year) %>%
nest() %>%
ungroup() %>%
mutate(data = pmap(list(recode, data, year), recode_fun)) %>%
unnest(data)
nlsy_waves <- p_waves %>% filter(Study == "NLSY") %>% select(Used) %>% distinct()
nlsy_match_stp <- nlsy_match_stp %>%
full_join(crossing(p_year = nlsy_waves$Used, year = "XRND")) %>%
group_by(recode, p_year, year) %>%
nest() %>%
ungroup() %>%
mutate(data = pmap(list(recode, data, p_year), recode_fun)) %>%
unnest(data)
# reverse code
nlsy_match <- nlsy_match %>%
mutate(value = ifelse(reverse_code == "no" | is.na(reverse_code), value,
reverse.code(-1, value, mini = mini, maxi = maxi)))
occ_recode_fun <- function(year, df){
if(year == 2000){df %>% mutate(value = mapvalues(value, npb$OCC00, npb$NPB))}
else if(year == 1990){
df %>% mutate(value = mapvalues(value, occ90to00$OCC90, occ90to00$OCC00),
value = mapvalues(value, npb$OCC00, npb$NPB),
value = ifelse(value > 100, NA, value))
} else{
df %>% mutate(value = mapvalues(value, occ70to90$OCC70, occ70to90$OCC90),
value = mapvalues(value, occ90to00$OCC90, occ90to00$OCC00),
value = mapvalues(value, npb$OCC00, npb$NPB),
value = ifelse(value > 100, NA, value))
}
}
nlsy_match_occ <- nlsy_match %>%
filter(grepl("Occ", name)) %>%
filter(!is.na(value) & year >= 1984) %>%
mutate(group = ifelse(year < 2000, 1970, ifelse(year >= 2002, 2000, 1990))) %>%
group_by(group) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(group, data, occ_recode_fun)) %>%
unnest(data) %>%
mutate(name = paste(name, "Prstg", sep = "")) %>%
select(-group)
fun_call <- function(x, rule){
switch(rule,
average = mean(x, na.rm = T),
mode = Mode(x)[1],
sum = sum(x, na.rm = T),
skip = unique(x)[1],
max = max(x, na.rm = T),
min = min(x, na.rm = T),
multiply = prod(x, na.rm = T))
}
# compositing within years
year_comp_fun <- function(df, rule){
df %>%
# group by person and item (collapse across age)
group_by(SID, yearBrth, name, year) %>%
summarize(value = fun_call(value, rule)) %>%
ungroup() %>%
mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
}
nlsy_match <- nlsy_match %>%
filter(year <= max(nlsy_waves$Used) & !grepl("Occ", name)) %>%
full_join(nlsy_match_occ) %>%
group_by(comp_rule, item_rule) %>%
nest() %>%
ungroup() %>%
mutate(item_rule = ifelse(is.na(item_rule), "skip", item_rule),
data = map2(data, item_rule, year_comp_fun)) %>%
unnest(data)
comp_fun <- function(rule, p_year){
nlsy_match %>%
filter(year <= p_year & comp_rule == rule) %>%
full_join(
nlsy_match_stp %>%
select(SID, yearBrth, p_year, name, value, comp_rule) %>%
filter(comp_rule == rule & p_year == p_year)
) %>%
group_by(SID, yearBrth, name) %>%
summarize(value = fun_call(value, rule)) %>%
ungroup()
}
nlsy_match <- crossing(
p_year = nlsy_waves$Used,
comp_rule = unique(nlsy_match$comp_rule)
) %>%
mutate(comp_rule = ifelse(is.na(comp_rule), "skip", comp_rule),
data = map2(comp_rule, p_year, comp_fun)) %>%
unnest(data) %>%
mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value)) %>%
select(-comp_rule) %>%
pivot_wider(names_from = name, values_from = value, values_fn = list(value = max))
## # A tibble: 34,590 x 120
## # Groups: SID [11,530]
## SID dadEdu momEdu yearBrth year A C DEP E IQ LOC N O SE SS activity AgeBegDate batheSelf childhoodSS chores
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 201 NA 1 1993 2000 NA NA NA NA 17.6 NA NA NA NA NA NA NA 1 112. 2.75
## 2 201 NA 1 1993 2006 NA NA NA NA NA NA NA NA NA NA NA NA 3 118. 3.55
## 3 201 NA 1 1993 2008 NA NA NA NA NA NA NA NA NA NA NA NA 3 118. 3.55
## 4 202 NA 1 1994 2000 NA NA NA NA 16.3 NA NA NA NA NA NA NA NA 103. NA
## 5 202 NA 1 1994 2006 NA NA NA NA NA NA NA NA NA NA NA NA 5 108. 3.9
## 6 202 NA 1 1994 2008 NA NA NA NA NA NA NA NA NA NA NA NA 5 108. 4.18
## 7 301 NA 1 1981 2008 2 3.5 0.571 2.5 NA 1.14 0.5 3.5 1.7 3.67 NA 12.5 3 112. 4.53
## 8 301 NA 1 1981 2000 NA NA NA NA NA NA NA NA NA NA NA 12.5 3 112. 4.53
## 9 301 NA 1 1981 2006 NA NA NA NA NA NA NA NA NA NA NA 12.5 3 112. 4.53
## 10 302 NA 1 1983 2000 NA NA NA NA NA NA NA NA NA 0 NA 11.5 5 97.5 4.75
## # … with 34,580 more rows, and 100 more variables: compliance <dbl>, difficulty <dbl>, discipline <dbl>, fearfulness <dbl>, foodChoice <dbl>,
## # FreqSmk <dbl>, genRoles <dbl>, grsWages <dbl>, hsGrades <dbl>, Impls <dbl>, insecureAttach <dbl>, manageTIme <dbl>, nbhdQual <dbl>,
## # parAffection <dbl>, parLimitations <dbl>, parTalkTV <dbl>, peerPressure <dbl>, positiveAffect <dbl>, praisedByPar <dbl>, reads <dbl>, relDad <dbl>,
## # relMom <dbl>, RelProb <dbl>, relSat <dbl>, satJob <dbl>, satSchl <dbl>, SensSeek <dbl>, SRhealth <dbl>, timeHW <dbl>, timeWOthrs <dbl>,
## # TVtime <dbl>, ADHD <dbl>, alcohol <dbl>, behProb <dbl>, BPI <dbl>, crprlPunish <dbl>, dadAlive <dbl>, dropOutSchl <dbl>, drugs <dbl>,
## # dyslexia <dbl>, edu <dbl>, education <dbl>, employed <dbl>, familyProb <dbl>, height <dbl>, hlthcare <dbl>, hlthProbs <dbl>, Jail <dbl>,
## # learnDis <dbl>, learningDis <dbl>, married <dbl>, momAge <dbl>, momAlcPreg <dbl>, momAlive <dbl>, momCocPreg <dbl>, momOccPrstg <dbl>,
## # momRedAlcPreg <dbl>, momRedCalPreg <dbl>, momRedSaltPreg <dbl>, momRedSmkPreg <dbl>, momSmkPreg <dbl>, momVitPreg <dbl>, momWeedPreg <dbl>,
## # musicInst <dbl>, nightmares <dbl>, numKids <dbl>, numSiblng <dbl>, parDivorce <dbl>, physAct <dbl>, probation <dbl>, race <dbl>, religion <dbl>,
## # remedialClass <dbl>, seeDad <dbl>, shynessProb <dbl>, sleepProb <dbl>, smokes <dbl>, unempEen <dbl>, weight <dbl>, welfare <dbl>, dadOccPrstg <dbl>,
## # HHsize <dbl>, liveDad <dbl>, ParEngmnt <dbl>, privateSchl <dbl>, urban <dbl>, readingMat <dbl>, tantrums <dbl>, ageMarried <dbl>, bornLate <dbl>,
## # durBreastFed <dbl>, gender <dbl>, dscplnFreq <dbl>, eatingHabits <dbl>, exercise <dbl>, activities <dbl>, Anger <dbl>, physhlthevnt <dbl>,
## # age <dbl>, BMI <dbl>
2.9.3 Personality Variables
nlsy_pers <- cnlsy_codebook %>%
filter(category == "pers") %>%
group_by(name) %>%
nest() %>%
ungroup() %>%
filter(name != "yearBrth") %>%
mutate(data = map2(data, name, rename_fun)) %>%
unnest(data) %>%
distinct() %>%
left_join(p_waves %>% filter(Study == "NLSY") %>% select(name = p_item, Used)) %>%
filter(year %in% Used)
recode_fun <- function(rule, y){
x <- y$value
if(!is.na(rule)){y$value <- eval(parse(text = rule))}
return(y)
}
# recode
nlsy_pers <- nlsy_pers %>%
select(-orig_itemname) %>%
group_by(recode) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(recode, data, recode_fun)) %>%
unnest(data)
# reverse code
nlsy_pers <- nlsy_pers %>%
mutate(value = ifelse(tolower(reverse_code) == "no" | is.na(reverse_code),
value, reverse.code(-1, value, mini = mini, maxi = maxi)))
# alpha's
nlsy_alpha <- nlsy_pers %>%
select(name, itemname, year, SID, value) %>%
group_by(name, year) %>%
nest() %>%
mutate(data = map(data, ~(.) %>% pivot_wider(names_from = itemname, values_from = value)),
alpha = map(data, possibly(~psych::alpha((.) %>% select(-SID)), NA_real_)))
# create composites
nlsy_pers <- nlsy_pers %>%
filter(!is.na(value)) %>%
group_by(SID, yearBrth, name, year) %>%
summarize(value = mean(value, na.rm = T))
## # A tibble: 77,697 x 5
## # Groups: SID, yearBrth, name [63,435]
## SID yearBrth name year value
## <dbl> <dbl> <chr> <chr> <dbl>
## 1 201 1993 IQ 2000 17.6
## 2 202 1994 IQ 2000 16.3
## 3 301 1981 A 2008 2
## 4 301 1981 C 2008 3.5
## 5 301 1981 DEP 2008 0.571
## 6 301 1981 E 2008 2.5
## 7 301 1981 LOC 2008 1.14
## 8 301 1981 N 2008 0.5
## 9 301 1981 O 2008 3.5
## 10 301 1981 SE 2008 1.7
## # … with 77,687 more rows
2.9.4 Outcome Variables
nlsy_out <- cnlsy_codebook %>%
filter(category == "out" & year != "XRND") %>%
group_by(name) %>%
nest() %>%
ungroup() %>%
filter(name != "yearBrth") %>%
mutate(data = map2(data, name, rename_fun)) %>%
unnest(data) %>%
distinct() %>%
full_join(crossing(p_year = nlsy_waves$Used, name = unique((.)$name)))
old.names <- (cnlsy_codebook %>% filter(category == "out" & year == "XRND"))$orig_itemname
nlsy_out_stp <- nlsy %>%
select(SID = C0000100, yearBrth = C0005700, one_of(old.names)) %>%
gather(key = orig_itemname, value = value, -SID, -yearBrth, na.rm=T) %>%
left_join(cnlsy_codebook %>% filter(category == "out") %>%
select(name:year, orig_itemname, recode, comp_rule, item_rule))
# recode
recode_fun <- function(rule, y, p_year){
yearBrth <- y$yearBrth
x <- y$value
if(!is.na(rule)){y$value <- eval(parse(text = rule))}
return(y)
}
# coding unemployment
jobs <- nlsy_out %>%
filter(name == "unemployed" & (grepl("start", itemname) | grepl("end", itemname))) %>%
select(name, SID, value, itemname, year, p_year) %>%
mutate(value = ifelse(is.na(value) | value < 0, NA, value),
timing = ifelse(grepl("start", itemname), "start", "end"),
itemname = str_remove(itemname, "start"), itemname = str_remove(itemname, "end")) %>%
filter(!is.na(value)) %>%
separate(itemname, c("time", "job"), -1) %>%
spread(time, value) %>%
mutate(centMnth = (Year - 1950)*12 + Month) %>%
select(-Year, -Month) %>%
spread(timing, centMnth) %>%
group_by(SID, year, p_year) %>%
mutate(lead_end = lead(end),
gap = start-lead_end) %>%
group_by(name, SID, year, p_year, job) %>%
summarize(value = ifelse(gap > 1, 1, 0)) %>%
group_by(name, SID, year, p_year) %>%
summarize(value = max(value, na.rm = T))
nlsy_out <- nlsy_out %>%
filter(!(name == "unemployed" & (grepl("start", itemname) | grepl("end", itemname)))) %>%
select(-orig_itemname) %>%
group_by(recode) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(recode, data, recode_fun)) %>%
unnest(data) %>%
distinct()
nlsy_waves <- p_waves %>% filter(Study == "NLSY") %>% select(Used) %>% distinct()
nlsy_out_stp <- nlsy_out_stp %>%
full_join(crossing(p_year = nlsy_waves$Used, year = "XRND")) %>%
group_by(recode, p_year, year) %>%
nest() %>%
ungroup() %>%
mutate(recode = str_replace(recode, "if\\(p_years", "ifelse\\(p_year"),
data = pmap(list(recode, data, p_year), recode_fun)) %>%
unnest(data) %>%
select(p_year, SID, name, value)
nlsy_out_stp <- nlsy_out_stp %>%
filter(name == "education") %>%
mutate(name = "edu") %>%
group_by(name, p_year, SID) %>%
summarize(value = max(value, na.rm = T))%>%
ungroup() %>%
full_join(nlsy_out_stp %>% filter(name != "education"))
# composite within years
nlsy_out <- nlsy_out %>%
select(name:year) %>%
# filter(year <= max(gsoep_waves$Used)) %>%
group_by(SID, name, year, yearBrth) %>%
summarize(value = max(value, na.rm = T)) %>%
ungroup() %>%
mutate(value = ifelse(is.nan(value)|is.infinite(value), NA, value))
# composite across years
comp_fun <- function(P_year){
nlsy_out %>%
left_join(jobs %>% filter(p_year == P_year) %>% select(-p_year)) %>%
mutate(group = ifelse(year > P_year, "future", "past")) %>%
group_by(SID, name, group) %>%
summarize(value = max(value, na.rm = T)) %>%
ungroup() %>%
mutate(value = ifelse(is.infinite(value), NA, value)) %>%
pivot_wider(names_from = group, values_from = value) %>%
group_by(SID, name) %>%
mutate(value = ifelse(is.na(past) | (past == 0 & !is.na(future)), future,
ifelse(past == 0 & is.na(future), past,
ifelse(past == 1, NA, NA))))
}
nlsy_out <- tibble(p_year = nlsy_waves$Used) %>%
mutate(data = map(p_year, comp_fun)) %>%
unnest(data) %>%
full_join(nlsy_out_stp)
## # A tibble: 553,440 x 6
## p_year SID name future past value
## <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 2006 201 chldbrth NA NA NA
## 2 2006 201 chldmvout NA NA NA
## 3 2006 201 crim 0 NA 0
## 4 2006 201 divorced NA NA NA
## 5 2006 201 frstjob NA NA NA
## 6 2006 201 mntlhlthevnt NA NA NA
## 7 2006 201 mortality 0 0 0
## 8 2006 201 physhlthevnt NA NA NA
## 9 2006 201 separated NA NA NA
## 10 2006 201 unemployed NA NA NA
## # … with 553,430 more rows
2.9.5 Covariates
nlsy_match <- nlsy_pers %>%
ungroup() %>%
mutate(year = as.numeric(year)) %>%
spread(name, value) %>%
full_join(nlsy_match %>% rename(year = p_year)) %>%
# physical health event
left_join(nlsy_out %>% filter(name == "physhlthevnt") %>% select(year = p_year, SID, physhlthevnt = past) %>% distinct() %>% group_by(SID, year) %>% summarize(physhlthevnt = max(physhlthevnt)) %>% ungroup()) %>%
distinct()
# age
nlsy_match <- nlsy_match %>%
mutate(age = year - yearBrth,
BMI = weight/ (height/100)^2)
nlsy_match <- nlsy_match %>%
group_by(year) %>%
mutate_at(vars(education, numKids, married, smokes, alcohol), ~ifelse(is.na(.) & age < 19, 0, .))
nlsy_match <- nlsy_match %>%
ungroup() %>%
select(SID, dadEdu, momEdu) %>%
distinct() %>%
gather(item,value, na.rm = T, -SID) %>%
group_by(SID, item) %>%
summarize(value = max(value, na.rm = T)) %>%
spread(item,value) %>%
full_join(nlsy_match %>% select(-momEdu, -dadEdu))
nlsy_out <- nlsy_out %>%
select(-past, -future) %>%
distinct()
nlsy_SCA <- nlsy_match %>% rename(PhysFunc = physAct) %>%
select(SID, p_year = year, one_of(c(unique(specifications$name), "momOccPrstg", "dadOccPrstg", "momEdu", "dadEdu"))) %>%
group_by(SID) %>%
mutate(parEdu = max(cbind(momEdu, dadEdu), na.rm = T),
parOccPrstg = max(cbind(momOccPrstg, dadOccPrstg), na.rm = T)) %>%
mutate_at(vars(parEdu, parOccPrstg), ~ifelse(is.infinite(.), NA, .)) %>%
ungroup() %>%
select(-momEdu, -dadEdu, -momOccPrstg, -dadOccPrstg)
unique(specifications$name)[!unique(specifications$name) %in% colnames(nlsy_match)]
## # A tibble: 34,590 x 20
## SID p_year age education gender grsWages race physhlthevnt SRhealth smokes alcohol exercise BMI married numKids parDivorce PhysFunc religion
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 201 2000 7 0 1 32964. NA NA 1 0 0 NA 22.5 0 0 0 0 NA
## 2 201 2006 13 0 1 34682. NA NA 1 0 0 NA 25.4 0 0 0 0 1
## 3 201 2008 15 0 1 32837. NA NA 1 0 0 NA 25.4 0 0 0 0 1
## 4 202 2000 6 0 1 32964. NA NA 1 0 0 NA 22.5 0 0 0 0 NA
## 5 202 2006 12 0 1 34682. NA NA 1 0 0 NA 33.8 0 0 0 0 NA
## 6 202 2008 14 0 1 32837. NA NA 1 0 0 NA 33.8 0 0 0 0 NA
## 7 301 2008 27 0 1 29323. 2 NA 2 0 NA 4 21.0 0 NA 0 0 1
## 8 301 2000 19 0 1 28167. 2 NA 1 0 NA NA 18.6 0 NA 0 0 1
## 9 301 2006 25 0 1 27278. 2 NA 1.67 0 NA NA 18.6 0 NA 0 0 1
## 10 302 2000 17 0 1 27311. 2 NA 2 1 1 NA 20.6 0 0 0 0 1
## # … with 34,580 more rows, and 2 more variables: parEdu <dbl>, parOccPrstg <dbl>
2.9.6 Imputation
# short helper functions to (1) identify and create factors, save col names of factors, remove columns with all missing values, and setup amelia
factor_fun <- function(x){if(is.numeric(x)){diff(range(x, na.rm = T)) %in% 1:2 & length(unique(x)) <= 4} else{F}}
not_all_na <- function(x) any(!is.na(x))
not_all_id <- function(x) if(is.numeric(x)) sd(x, na.rm = T) != 0
mice_fun <- function(df){
mice(df, m = 5, maxit=5, printFlag=TRUE, method = "cart")
}
nlsy_match_imp2 <- nlsy_match %>%
group_by(SID, year) %>%
mutate(parOccPrstg = max(cbind(momOccPrstg, dadOccPrstg), na.rm = T)) %>%
select(-momOccPrstg, -dadOccPrstg) %>%
ungroup() %>%
mutate_all(~ifelse(is.infinite(.), NA, .)) %>%
filter(!is.na(year) & !is.na(SID)) %>%
group_by(year) %>%
nest() %>%
ungroup() %>%
mutate(data = map(data, ~(.) %>% select_if(not_all_na) %>% select_if(not_all_id)),
data = map(data, ~(.) %>% mutate_if(factor_fun, as.factor)),
imp = map(data, mice_fun))
beepr::beep(sound = 8)
nlsy_match_imp <- nlsy_match_imp %>%
mutate(imp_df = map(imp, ~mice::complete(., "long")),
imp_df = map(imp_df, ~(.) %>% rename(PhysFunc = physAct)),
imp_df = map(imp_df, ~(.) %>%
group_by(SID) %>%
mutate(parEdu = max(cbind(momEdu, dadEdu), na.rm = T)) %>%
select(-momEdu, -dadEdu) %>%
ungroup()),
imp_sca = map(imp_df, ~(.) %>% filter(.imp == 1) %>% select(SID, one_of(unique(specifications$name)), hlthProbs) %>% group_by(SID) %>% mutate(physhlthevnt = max(cbind(physhlthevnt, hlthProbs))-1) %>% ungroup() %>% select(-hlthProbs)))
nlsy_match_imp_long <- nlsy_match_imp %>%
mutate(imp_df = map(imp_df, ~(.) %>% mutate_if(is.factor, ~as.numeric(as.character(.))))) %>%
select(p_year = year, imp_df) %>%
mutate(imp_df = map(imp_df, ~if(any(colnames(.) == "relSat")){(.) %>% mutate(relSat = as.numeric(as.character(relSat)))})) %>%
unnest(imp_df)
nlsy_SCA_imp <- nlsy_match_imp %>%
select(p_year = year, imp_sca) %>%
unnest(imp_sca) %>%
left_join(nlsy_SCA %>% select(p_year, SID,
colnames(nlsy_SCA)[!colnames(nlsy_SCA) %in% colnames(.)])) %>%
left_join(nlsy_match %>% select(SID, p_year = year)) %>%
mutate(physhlthevnt = factor(physhlthevnt))
unique(specifications$name)[!unique(specifications$name) %in% colnames(nlsy_SCA_imp)]
save(nlsy_match_imp_long, nlsy_SCA_imp,
file = sprintf("%s/data/imputed/nlsy_imputed_small.RData", wd))
save(nlsy_match_imp,
file = sprintf("%s/data/imputed/nlsy_imputed.RData", wd))
save(nlsy_alpha, nlsy_pers, nlsy_out, nlsy_match, nlsy_SCA,
file = sprintf("%s/data/clean/nlsy_cleaned.RData", wd))
save(nlsy, file = sprintf("%s/data/clean/nlsy_raw.RData", wd))
rm(list =ls()[grepl("nlsy", ls())])
2.10 SHP
The Swiss Household Panel Study (SHP; Voorpostel et al., 2016) “Living in Switzerland” is an ongoing longitudinal study of households in Switzerland. These data are available online, through application from https://forsbase.unil.ch/project/study-public-overview/15632/0/.
Participants were recruited from more than 10,000 individuals from the households whose members represent the non-institutional resident population of Switzerland. Data have been collected annually since 1999. The latest data release includes data up to 2018. On average, about 5,000 individuals are sampled at each wave. More documentation can be found at LINK, but, in short, the SHP is a nationally representative sample of Swiss citizens.
Sample sizes vary by year, ranging from 5,220 (2003) to 13,295 (2013). This provides 99/% power to detect a zero-order correlation effect size of ~.06, two tailed at alpha .05.
2.10.1 Load Data
shp_read_fun <- function(x){
if(grepl("SHP0_BV", x)){old.names <- str_to_lower(old.names)}
y <- sprintf("%s/data/shp/%s", wd, x) %>% haven::read_sav(.) %>%
haven::zap_labels(.) %>% select(one_of(old.names))
if(grepl("SHP0_BV", x)){colnames(y) <- str_to_upper(colnames(y))}
return(y)
}
shp_codebook <- (codebook %>% filter(study == "SHP"))$codebook[[1]] %>%
filter(year <= 2017 | is.na(year) | year == 0)
shp_codebook
## # A tibble: 1,493 x 18
## study dataset category name itemname wave year new_itemname orig_itemname stem description scale reverse_code recode mini maxi comp_rule ...18
## <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <chr> <lgl>
## 1 shp P match dadAl… dadalive 15 2013 shp__match_… P13N82 N82 "Father: a… "-8\… <NA> ifels… NA NA max NA
## 2 shp P match dadAl… dadalive 18 2016 shp__match_… P16N82 N82 "Father: a… "-8\… <NA> ifels… NA NA max NA
## 3 shp P match dadEdu dadhighe… 0 0 shp__match_… P$$O17 O17 "Highest l… "-8 … <NA> ifels… NA NA max NA
## 4 shp P match degPh… hlthimpe… 1 1999 shp__match_… P99C08 C08 "Health im… "-8\… <NA> ifels… NA NA max NA
## 5 shp P match degPh… hlthimpe… 2 2000 shp__match_… P00C08 C08 "Health im… "-8\… <NA> ifels… NA NA max NA
## 6 shp P match degPh… hlthimpe… 3 2001 shp__match_… P01C08 C08 "Health im… "-8\… <NA> ifels… NA NA max NA
## 7 shp P match degPh… hlthimpe… 4 2002 shp__match_… P02C08 C08 "Health im… "-8\… <NA> ifels… NA NA max NA
## 8 shp P match degPh… hlthimpe… 5 2003 shp__match_… P03C08 C08 "Health im… "-8\… <NA> ifels… NA NA max NA
## 9 shp P match degPh… hlthimpe… 6 2004 shp__match_… P04C08 C08 "Health im… "-8\… <NA> ifels… NA NA max NA
## 10 shp P match degPh… hlthimpe… 7 2005 shp__match_… P05C08 C08 "Health im… "-8\… <NA> ifels… NA NA max NA
## # … with 1,483 more rows
old.names <- unique(shp_codebook$orig_itemname) %>% str_to_upper
datasets <- sprintf("%s/data/shp", wd) %>% list.files()
shp <- tibble(datasets = datasets) %>%
mutate(data = map(datasets, shp_read_fun),
ncol = map(data, ncol)) %>%
unnest(ncol) %>%
filter(ncol > 1)
shp_grid <- shp %>% filter(datasets == "SHP_MH.sav")
shp <- shp %>% filter(datasets != "SHP_MH.sav")
shp <- reduce(shp$data, full_join) %>% haven::zap_labels(.)
save(shp, file = sprintf("%s/data/clean/shp_raw.RData", wd))
2.10.2 Matching Variables
rename_fun <- function(cb, var){
print(var)
old.names <- unique((shp_codebook %>% filter(name == var))$orig_itemname)
df <- shp %>%
select(SID = IDPERS, one_of(old.names)) %>%
gather(key = orig_itemname, value = value, -SID, na.rm=T)
if(length(old.names) > 1){
df %>% left_join(cb %>% select(itemname, year, orig_itemname, reverse_code:comp_rule))
} else {
df %>% left_join(cb %>% select(-(itemname:year), -new_itemname, -dataset) %>% distinct()) %>% mutate(year = "0")
}
}
# rename variables
shp_match <- shp_codebook %>%
filter(category == "match" & year != 0) %>%
group_by(name) %>%
nest() %>%
ungroup() %>%
# filter(name != "yearBrth") %>%
mutate(data = map2(data, name, rename_fun)) %>%
unnest(data) %>%
distinct()
# single time point variables
old.names <- (shp_codebook %>% filter(category == "match" & year == "0"))$orig_itemname
shp_match_stp <- shp %>%
select(SID = IDPERS, one_of(old.names)) %>%
gather(key = orig_itemname, value = value, -SID) %>%
left_join(shp_codebook %>% select(name:year, orig_itemname, recode, comp_rule))
# recoding
recode_fun <- function(rule, y, p_year){
yearBrth <- y$yearBrth
x <- y$value
if(!is.na(rule)){y$value <- eval(parse(text = rule))}
return(y)
}
yrBrth <- shp_match_stp %>% filter(name == "yearBrth") %>%
mutate(yearBrth = ifelse(value < 0, NA, value)) %>%
select(SID, yearBrth)
shp_match <- shp_match %>%
left_join(yrBrth) %>%
group_by(recode, year) %>%
nest() %>%
ungroup() %>%
mutate(recode = mapvalues(recode, "ifelse(is.na(x) | x < 0), NA, ifelse(x > 0, 1, 0))", "ifelse(is.na(x) | x < 0, NA, ifelse(x > 0, 1, 0))"),
recode = mapvalues(recode, "ifelse(is.na(x) | x < 0, ifelse(x %in% 7:14, 0, 1))", "ifelse(is.na(x) | x < 0, NA, ifelse(x %in% 7:14, 0, 1))"),
data = pmap(list(recode, data, year), recode_fun)) %>%
unnest(data)
shp_waves <- p_waves %>% filter(Study == "SHP") %>% select(Used) %>% distinct()
shp_match_stp <- shp_match_stp %>%
full_join(crossing(p_year = shp_waves$Used, year = 0)) %>%
left_join(yrBrth) %>%
group_by(recode, p_year, year) %>%
nest() %>%
ungroup() %>%
mutate(data = pmap(list(recode, data, p_year), recode_fun)) %>%
unnest(data)
# reverse code
shp_match <- shp_match %>%
mutate(value = ifelse(reverse_code == "no" | is.na(reverse_code), value,
reverse.code(-1, value, mini = mini, maxi = maxi)))
fun_call <- function(x, rule){
switch(rule,
average = mean(x, na.rm = T),
mode = Mode(x)[1],
sum = sum(x, na.rm = T),
skip = unique(x)[1],
max = max(x, na.rm = T),
min = min(x, na.rm = T),
multiply = prod(x, na.rm = T))
}
# compositing within years
year_comp_fun <- function(df, rule){
df %>%
# group by person and item (collapse across age)
group_by(SID, yearBrth, name, year) %>%
summarize(value = fun_call(value, rule)) %>%
ungroup() %>%
mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
}
shp_match <- shp_match %>%
filter(year <= max(shp_waves$Used)) %>%
group_by(comp_rule) %>%
nest() %>%
ungroup() %>%
mutate(comp_rule = ifelse(is.na(comp_rule), "skip", comp_rule),
data = map2(data, comp_rule, year_comp_fun)) %>%
unnest(data)
comp_fun <- function(rule, p_year){
shp_match %>%
filter(year <= p_year & comp_rule == rule) %>%
left_join(
shp_match_stp %>%
select(SID, yearBrth, p_year, name, value, comp_rule) %>%
filter(comp_rule == rule & p_year == p_year)
) %>%
group_by(SID, yearBrth, name) %>%
summarize(value = fun_call(value, rule)) %>%
ungroup()
}
shp_match <- crossing(
p_year = shp_waves$Used,
comp_rule = unique(shp_match$comp_rule)
) %>%
mutate(comp_rule = ifelse(is.na(comp_rule), "skip", comp_rule),
data = map2(comp_rule, p_year, comp_fun)) %>%
unnest(data) %>%
mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value)) %>%
select(-comp_rule) %>%
full_join(shp_match_stp %>% select(SID, yearBrth, name, value, p_year)) %>%
filter(!is.na(value)) %>%
distinct() %>%
spread(name,value)
## # A tibble: 139,867 x 56
## year SID A C DEP E LOC N `NA` O OP PA SE SS SWL ageMarried dadEdu dadOccPrstg degPhysFunc disability
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2000 4101 NA NA 0 NA NA NA NA NA 9 NA NA 9 10 24 0 73.9 0 0
## 2 2000 4102 NA NA 5 NA NA NA NA NA 7 NA NA 10 10 21 0 36.3 8 1
## 3 2000 5101 NA NA 1 NA NA NA NA NA 10 NA NA 6.5 8 38 0 39.6 1 0
## 4 2000 6101 NA NA 8 NA NA NA NA NA 0 NA NA 4.75 5 NA 0 38.0 10 1
## 5 2000 13102 NA NA 3 NA NA NA NA NA 7 NA NA 8.7 9 15 0 51.6 0 0
## 6 2000 14101 NA NA 0 NA NA NA NA NA 8 NA NA 5.88 8 24 0 52.1 3 1
## 7 2000 26101 NA NA 2 NA NA NA NA NA 0 NA NA 8.12 5 30 0 43.4 6 1
## 8 2000 26102 NA NA 0 NA NA NA NA NA 9 NA NA 7.5 10 25 0 44.2 0 1
## 9 2000 27101 NA NA 0 NA NA NA NA NA 9 NA NA 7.5 8 17 0 NA 6 0
## 10 2000 27102 NA NA 1 NA NA NA NA NA 0 NA NA 7 8 24 0 52.5 5 0
## # … with 139,857 more rows, and 36 more variables: drVisits <dbl>, education <dbl>, employed <dbl>, exercise <dbl>, gender <dbl>, grsWages <dbl>,
## # height <dbl>, HHID <dbl>, HHsize <dbl>, hlthcare <dbl>, imprvHealth <dbl>, loneliness <dbl>, married <dbl>, meaning <dbl>, momEdu <dbl>,
## # momOccPrstg <dbl>, numKids <dbl>, parDivorce <dbl>, physAct <dbl>, PhysFunc <dbl>, race <dbl>, religion <dbl>, religiosity <dbl>, satFinances <dbl>,
## # satFncChng <dbl>, satHealth <dbl>, satHH <dbl>, satSchl <dbl>, SRhealth <dbl>, unemployBen <dbl>, weight <dbl>, welfare <dbl>, yearBrth <dbl>,
## # physhlthevnt <dbl>, age <dbl>, BMI <dbl>
2.10.3 Personality Variables
shp_pers <- shp_codebook %>%
filter(category == "pers") %>%
group_by(name) %>%
nest() %>%
ungroup() %>%
filter(name != "yearBrth") %>%
mutate(data = map2(data, name, rename_fun)) %>%
unnest(data) %>%
distinct() %>%
left_join(p_waves %>% filter(Study == "SHP") %>% select(name = p_item, Used)) %>%
filter(year %in% Used)
recode_fun <- function(rule, y){
x <- y$value
if(!is.na(rule)){y$value <- eval(parse(text = rule))}
return(y)
}
# recode
shp_pers <- shp_pers %>%
select(-orig_itemname) %>%
group_by(recode) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(recode, data, recode_fun)) %>%
unnest(data)
# reverse code
shp_pers <- shp_pers %>%
mutate(value = ifelse(tolower(reverse_code) == "no" | is.na(reverse_code),
value, reverse.code(-1, value, mini = mini, maxi = maxi)))
# alpha's
shp_alpha <- shp_pers %>%
select(name, itemname, year, SID, value, comp_rule) %>%
group_by(name, year, SID, comp_rule, itemname) %>%
summarize(value = mean(value, na.rm = T)) %>%
group_by(name, year, comp_rule) %>%
nest() %>%
mutate(data = map(data, ~(.) %>% pivot_wider(names_from = itemname, values_from = value, values_fn = list(value = mean))),
alpha = map(data, possibly(~psych::alpha((.) %>% select(-SID)), NA_real_)))
# create composites
comp_fun <- function(df, rule){
df %>%
group_by(SID) %>%
summarize(value = fun_call(value, rule)) %>%
ungroup()
}
# create composites
shp_pers <- shp_pers %>%
group_by(name, comp_rule, year) %>%
nest() %>%
ungroup() %>%
mutate(comp_rule = ifelse(is.na(comp_rule), "average", comp_rule),
data = map2(data, comp_rule, comp_fun)) %>%
unnest()
## # A tibble: 187,730 x 5
## name year comp_rule SID value
## <chr> <dbl> <chr> <dbl> <dbl>
## 1 A 2009 average 4101 7
## 2 A 2009 average 4102 5
## 3 A 2009 average 4103 1.5
## 4 A 2009 average 4104 5
## 5 A 2009 average 5101 5.5
## 6 A 2009 average 5104 5
## 7 A 2009 average 21101 7.5
## 8 A 2009 average 26101 10
## 9 A 2009 average 27101 6
## 10 A 2009 average 27102 5.5
## # … with 187,720 more rows
2.10.4 Outcome Variables
shp_out <- shp_codebook %>%
filter(category == "out" & year != "0") %>%
group_by(name) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(data, name, rename_fun)) %>%
unnest(data) %>%
distinct() %>%
full_join(crossing(p_year = shp_waves$Used, name = unique((.)$name)))
old.names <- (shp_codebook %>% filter(category == "out" & year == "0"))$orig_itemname
shp_out_stp <- shp %>%
select(SID = IDPERS, one_of(old.names)) %>%
gather(key = orig_itemname, value = value, -SID) %>%
left_join(shp_codebook %>% select(name:year, orig_itemname, recode, comp_rule))
# recode
recode_fun <- function(rule, y, p_year, year){
yearBrth <- y$yearBrth
x <- y$value
if(!is.na(rule)){y$value <- eval(parse(text = rule))}
return(y)
}
shp_out <- shp_out %>%
left_join(yrBrth) %>%
select(-orig_itemname) %>%
group_by(recode, year, p_year) %>%
nest() %>%
ungroup() %>%
mutate(year = as.numeric(year),
data = pmap(list(recode, data, p_year, year), recode_fun)) %>%
unnest(data) %>%
distinct()
shp_waves <- p_waves %>% filter(Study == "SHP") %>% select(Used) %>% distinct()
shp_out_stp <- shp_out_stp %>%
full_join(crossing(p_year = shp_waves$Used, year = 0)) %>%
group_by(recode, p_year, year) %>%
nest() %>%
ungroup() %>%
mutate(data = pmap(list(recode, data, p_year), recode_fun)) %>%
unnest(data) %>%
left_join(yrBrth)
# composite within years
shp_out <- shp_out %>%
select(year, p_year:itemname, yearBrth) %>%
distinct() %>%
# filter(year <= max(gsoep_waves$Used)) %>%
group_by(SID, name, year, yearBrth, p_year) %>%
summarize(value = max(value, na.rm = T)) %>%
ungroup() %>%
mutate(value = ifelse(is.nan(value)|is.infinite(value), NA, value))
shp_out <- shp_out %>%
filter(name == "chldmvout") %>%
group_by(SID, p_year) %>%
mutate(max = max(value, na.rm = T)) %>%
ungroup() %>%
mutate(value = ifelse(value < max, 1, 0)) %>%
select(-max) %>%
full_join(shp_out %>% filter(name != "chldmvout"))
# composite across years
comp_fun <- function(P_year){
shp_out %>%
mutate(group = ifelse(year > P_year, "future", "past")) %>%
group_by(SID, name, group) %>%
summarize(value = max(value, na.rm = T)) %>%
ungroup() %>%
mutate(value = ifelse(is.infinite(value), NA, value)) %>%
pivot_wider(names_from = group, values_from = value) %>%
group_by(SID, name) %>%
mutate(value = ifelse(is.na(past) | (past == 0 & !is.na(future)), future,
ifelse(past == 0 & is.na(future), past,
ifelse(past == 1, NA, NA))))
}
shp_out <- tibble(p_year = shp_waves$Used) %>%
mutate(data = map(p_year, comp_fun)) %>%
unnest(data) %>%
distinct() %>%
full_join(shp_out_stp %>% select(p_year, name, SID, value)) %>%
distinct() %>% left_join(yrBrth)
## # A tibble: 1,323,120 x 7
## p_year SID name future past value yearBrth
## <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 2009 4101 chldbrth 1 1 NA 1965
## 2 2009 4101 chldmvout 1 1 NA 1965
## 3 2009 4101 divorced 0 0 0 1965
## 4 2009 4101 edu 1 0 1 1965
## 5 2009 4101 frstjob 0 0 0 1965
## 6 2009 4101 married 0 0 0 1965
## 7 2009 4101 mortality 0 0 0 1965
## 8 2009 4101 mvInPrtnr 1 1 NA 1965
## 9 2009 4101 physhlthevnt 0 0 0 1965
## 10 2009 4101 vlntred 0 1 NA 1965
## # … with 1,323,110 more rows
2.10.5 Covariates
shp_match <- shp_pers %>%
select(-comp_rule) %>%
spread(name, value) %>%
full_join(shp_match %>% rename(year = p_year)) %>%
# physical health event
left_join(shp_out %>% filter(name == "physhlthevnt") %>% select(year = p_year, SID, physhlthevnt = past) %>% distinct() %>% group_by(SID, year) %>% summarize(physhlthevnt = max(physhlthevnt)) %>% ungroup()) %>%
distinct()
# age
shp_match <- shp_match %>%
mutate(age = year - yearBrth,
BMI = weight / (height/100)^2,
parDivorce = ifelse(is.na(parDivorce), 0, parDivorce))
shp_out <- shp_out %>%
select(-past, -future) %>%
distinct()
shp_SCA <- shp_match %>%
select(SID, p_year = year, one_of(c(unique(specifications$name), "momOccPrstg", "dadOccPrstg", "momEdu", "dadEdu"))) %>%
group_by(SID) %>%
mutate(parEdu = max(cbind(momEdu, dadEdu), na.rm = T),
parOccPrstg = max(cbind(momOccPrstg, dadOccPrstg), na.rm = T)) %>%
mutate_at(vars(parEdu, parOccPrstg), ~ifelse(is.infinite(.), NA, .)) %>%
ungroup() %>%
select(-momEdu, -dadEdu, -momOccPrstg, -dadOccPrstg)
unique(specifications$name)[!unique(specifications$name) %in% colnames(shp_match2)]
## # A tibble: 139,867 x 18
## SID p_year age education gender grsWages race physhlthevnt SRhealth exercise BMI married numKids parDivorce PhysFunc religion parEdu
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 4101 2000 35 0 0 57750 0 0 1 1 NA 1 3 0 0 1 0
## 2 4102 2000 32 0 1 57750 0 1 3 2 NA 1 3 0 1 1 0
## 3 5101 2000 39 1 0 86700 0 1 2 2 NA 1 2 0 1 0 0
## 4 6101 2000 36 0 1 49400 0 0 3.5 0 NA 0 0 0 1 1 0
## 5 13102 2000 27 0 1 34270 0 0 2 2 NA 0 2 0 1 1 0
## 6 14101 2000 66 0 0 72130 0 0 2 2 NA 1 0 0 1 1 0
## 7 26101 2000 78 0 1 57400 0 0 3 4 NA 1 0 0 1 1 0
## 8 26102 2000 75 0 0 57400 0 0 2 7 NA 1 0 0 1 1 0
## 9 27101 2000 32 0 1 37050 0 0 3 7 NA 1 3 0 1 1 0
## 10 27102 2000 32 0 0 37050 0 1 1 3 NA 1 3 0 1 1 0
## # … with 139,857 more rows, and 1 more variable: parOccPrstg <dbl>
2.10.6 Imputation
# short helper functions to (1) identify and create factors, save col names of factors, remove columns with all missing values, and setup amelia
factor_fun <- function(x){if(is.numeric(x)){diff(range(x, na.rm = T)) %in% 1:2 & length(unique(x)) <= 4} else{F}}
not_all_id <- function(x) if(is.numeric(x)) sd(x, na.rm = T) != 0
mice_fun <- function(df){
mice(df, m = 5, maxit=5, method = "cart", printFlag=TRUE)
}
shp_match_imp <- shp_match %>%
rename(NegAff = `NA`) %>%
group_by(SID, year) %>%
mutate(parOccPrstg = max(cbind(momOccPrstg, dadOccPrstg), na.rm = T)) %>%
select(-momOccPrstg, -dadOccPrstg) %>%
ungroup() %>%
mutate_all(~ifelse(is.infinite(.), NA, .)) %>%
filter(!is.na(year) & !is.na(SID)) %>%
group_by(year) %>%
nest() %>%
ungroup() %>%
mutate(data = map(data, ~(.) %>% select_if(not_all_na) %>% select_if(not_all_id)),
data = map(data, ~(.) %>% mutate_if(factor_fun, as.factor)),
imp = map(data, mice_fun))
beepr::beep(sound = 8)
shp_match_imp <- shp_match_imp %>%
mutate(imp_df = map(imp, ~mice::complete(., "long")),
imp_df = map(imp_df, ~(.) %>%
group_by(SID) %>%
mutate(parEdu = max(cbind(momEdu, dadEdu), na.rm = T)) %>%
select(-momEdu, -dadEdu) %>%
ungroup()),
imp_sca = map(imp_df, ~(.) %>% filter(.imp == 1) %>% select(SID, one_of(unique(specifications$name)))))
shp_match_imp_long <- shp_match_imp %>%
mutate(imp_df = map(imp_df, ~(.) %>% mutate_if(is.factor, ~as.numeric(as.factor(.))))) %>%
select(p_year = year, imp_df) %>%
mutate(imp_df = map(imp_df, ~if(any(colnames(.) == "relSat")){(.) %>% mutate(relSat = as.numeric(as.character(relSat))); (.)}else{(.)})) %>%
unnest(imp_df)
shp_SCA_imp <- shp_match_imp %>%
select(p_year = year, imp_sca) %>%
unnest(imp_sca) %>%
left_join(shp_SCA %>% select(p_year=year, SID,
colnames(shp_SCA)[!colnames(shp_SCA) %in% colnames(.)]))
shp_SCA_imp <- shp_SCA_imp %>%
select(-BMI) %>%
left_join(shp_SCA_imp %>%
select(SID, BMI) %>%
distinct() %>%
gather(item, value, -SID, na.rm = T) %>%
group_by(SID,item) %>%
summarize(value = mean(value, na.rm = T)) %>%
spread(item,value))
unique(specifications$name)[!unique(specifications$name) %in% colnames(shp_SCA_imp)]
save(shp_match_imp_long, shp_SCA_imp,
file = sprintf("%s/data/imputed/shp_imputed_small.RData", wd))
save(shp_match_imp,
file = sprintf("%s/data/imputed/shp_imputed.RData", wd))
save(shp_alpha, shp_pers, shp_out, shp_match, shp_SCA,
file = sprintf("%s/data/clean/shp_cleaned.RData", wd))
rm(list =ls()[grepl("shp", ls())])
2.11 WLS
The Wisconsin Longitudinal Study (WLS) is an ongoing longitudinal study of individuals who graduated from Wisconsin high schools in 1957 and were born between 1937 and 1940 as well as their siblings.
Graduates were randomly recruited from Wisconsin high schools in 1957 and born between 1937 and 1940. In 1977, at least one sibling of the original graduates from 2,100 families were also invited to participate in the study. As such, the study is representative of older, white Americans who have at least a high school education. Graduate data have been collected in in 1957, 1964, 1975, 1992, 2004, and 2011, and sibling data have been collected in 1977, 1994, 2005, and 2011. Personality data were initially collected in 1992 for graduates and 1994 for siblings. More documentation can be found at https://www.ssc.wisc.edu/wlsresearch/.
Sample sizes vary by wave, from 9,681 (2011) to 10,317 (1957). This provides 99/% power to detect zero-order correlation effect sizes of ~.06, two-tailed at alpha .05.
2.11.1 Load Data
wls_codebook <- (codebook %>% filter(study == "WLSS"))$codebook[[1]] %>%
mutate(orig_itemname = str_to_lower(orig_itemname),
sib = ifelse(grepl("rad", dataset), "grad", ifelse(grepl("ibl", dataset), "sib", NA)))
wls_codebook
## # A tibble: 834 x 19
## study dataset category name itemname year new_itemname orig_itemname description scale reverse_code recode mini maxi comp_rule ...16 ...17 ...18
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <chr> <chr> <chr> <chr>
## 1 wlss 1992 -… match ageF… ageFrst… 1992 wlss__match… ru020re What age w… ".\t… <NA> ifels… NA NA skip <NA> <NA> <NA>
## 2 wlss 1993 -… match ageF… ageFrst… 1993 wlss__match… su020re What age w… ".\t… <NA> ifels… NA NA skip <NA> <NA> <NA>
## 3 wlss 1992 -… match alco… alcohol 1992 wlss__match… ru026re During the… <NA> <NA> ifels… NA NA skip <NA> <NA> <NA>
## 4 wlss 1993 -… match alco… alcohol 1993 wlss__match… su026re During the… <NA> <NA> ifels… NA NA skip <NA> <NA> <NA>
## 5 wlss 1992 -… match anem… anemia 1992 wlss__match… mx083rer Has a medi… ".\t… <NA> ifels… NA NA skip <NA> <NA> <NA>
## 6 wlss 1993 -… match anem… anemia 1992 wlss__match… nx103rer Has a medi… "\".… <NA> ifels… NA NA skip <NA> <NA> <NA>
## 7 wlss 1992 -… match appr… apprncR… 1992 wlss__match… mx004rer Compared w… ".\t… <NA> ifels… NA NA skip <NA> <NA> <NA>
## 8 wlss 1993 -… match appr… apprncR… 1993 wlss__match… nx004rer Compared w… ".\t… <NA> ifels… NA NA skip <NA> <NA> <NA>
## 9 wlss 1992-1… match auth… afraidV… 1992 wlss__match… mn007rer “I am not … "1 a… yes ifels… 1 6 average <NA> <NA> <NA>
## 10 wlss 1992-1… match auth… afraidV… 1992 wlss__match… np007rer “I am not … "1 a… yes ifels… 1 6 average <NA> <NA> <NA>
## # … with 824 more rows, and 1 more variable: sib <chr>
2.11.2 Matching Variables
rename_fun <- function(cb, var){
old.names <- unique((wls_codebook %>% filter(name == var))$orig_itemname)
df <- wls %>%
select(SID = idpub, HHID = familypub, one_of(old.names)) %>%
gather(key = orig_itemname, value = value, -SID, -HHID, na.rm=T)
if(length(old.names) > 1){
df %>% left_join(cb %>% select(itemname, year, orig_itemname, reverse_code:comp_rule, sib))
} else {
df %>% left_join(cb %>% select(-(itemname:year), -new_itemname, -dataset) %>% distinct()) %>% mutate(year = "0")
}
}
# rename variables
wls_match <- wls_codebook %>%
filter(category == "match" & year != 0) %>%
group_by(name) %>%
nest() %>%
ungroup() %>%
# filter(name != "yearBrth") %>%
mutate(data = map2(data, name, rename_fun)) %>%
unnest(data) %>%
distinct()
# single time point variables
old.names <- (wls_codebook %>% filter(category == "match" & year == "0"))$orig_itemname
wls_match_stp <- wls %>%
select(SID = idpub, HHID = familypub, one_of(old.names)) %>%
gather(key = orig_itemname, value = value, -SID, -HHID, na.rm=T) %>%
left_join(wls_codebook %>% select(name:year, orig_itemname, recode, comp_rule, sib)) %>%
unite(SID, SID, sib, sep = "")
# recoding
recode_fun <- function(rule, y, p_year){
yearBrth <- y$yearBrth
x <- y$value
if(!is.na(rule)){y$value <- eval(parse(text = rule))}
return(y)
}
yrBrth <- wls_match_stp %>% filter(name == "yearBrth") %>%
mutate(yearBrth = ifelse(value < 0, NA, value +1900)) %>%
select(SID, HHID, yearBrth)
wls_match <- wls_match %>%
unite(SID, SID, sib, sep = "") %>%
left_join(yrBrth) %>%
group_by(recode, year) %>%
nest() %>%
ungroup() %>%
mutate(data = pmap(list(recode, data, year), recode_fun)) %>%
unnest(data)
wls_match_par <- wls_match %>% filter(grepl("mom", name) |
grepl("dad", name) | name == "parGrsWages") %>%
group_by(year, HHID, name, comp_rule) %>%
summarize(value = max(value, na.rm = T)) %>%
ungroup() %>%
mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value),
comp_rule = ifelse(is.na(comp_rule), "skip", comp_rule)) %>%
filter(!is.na(value)) %>%
group_by(HHID, name) %>%
summarize(value = fun_call(value, comp_rule[1])) %>%
ungroup() %>%
spread(name, value) %>%
mutate(parGrsWages = parGrsWages*100)
wls_waves <- p_waves %>% filter(Study == "WLS") %>% select(Used) %>% distinct()
wls_match_stp <- wls_match_stp %>%
full_join(crossing(p_year = wls_waves$Used, year = "0")) %>%
left_join(yrBrth) %>%
group_by(recode, p_year, year) %>%
nest() %>%
ungroup() %>%
mutate(data = pmap(list(recode, data, p_year), recode_fun)) %>%
unnest(data) %>%
filter(!is.na(value)) %>%
mutate(comp_rule = ifelse(is.na(comp_rule), "skip", comp_rule)) %>%
group_by(p_year, SID, HHID, name, yearBrth) %>%
summarize(value = fun_call(value, comp_rule[1])) %>%
ungroup() %>%
mutate(value = ifelse(value < 0, NA, value)) %>%
spread(name, value)
# reverse code
wls_match <- wls_match %>%
mutate(value = ifelse(reverse_code == "no" | is.na(reverse_code), value,
reverse.code(-1, value, mini = mini, maxi = maxi)))
fun_call <- function(x, rule){
switch(rule,
average = mean(x, na.rm = T),
mode = Mode(x)[1],
sum = sum(x, na.rm = T),
skip = unique(x)[1],
max = max(x, na.rm = T),
min = min(x, na.rm = T),
multiply = prod(x, na.rm = T))
}
# compositing within years
year_comp_fun <- function(df, rule){
df %>%
# group by person and item (collapse across age)
group_by(SID, HHID, yearBrth, name, year) %>%
summarize(value = fun_call(value, rule)) %>%
ungroup() %>%
mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
}
wls_match <- wls_match %>%
filter(!(grepl("mom", name) | grepl("dad", name) | name == "parGrsWages")) %>%
filter(year <= max(wls_waves$Used)) %>%
group_by(comp_rule) %>%
nest() %>%
ungroup() %>%
mutate(comp_rule = ifelse(is.na(comp_rule), "skip", comp_rule),
data = map2(data, comp_rule, year_comp_fun)) %>%
unnest(data)
comp_fun <- function(rule, p_year){
wls_match %>%
filter(year <= p_year & comp_rule == rule) %>%
group_by(SID, HHID, yearBrth, name) %>%
summarize(value = fun_call(value, rule)) %>%
ungroup()
}
wls_match <- crossing(
p_year = wls_waves$Used,
comp_rule = unique(wls_match$comp_rule)
) %>%
mutate(comp_rule = ifelse(is.na(comp_rule), "skip", comp_rule),
data = map2(comp_rule, p_year, comp_fun)) %>%
unnest(data) %>%
mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value)) %>%
select(-comp_rule) %>%
pivot_wider(names_from = name, values_from = value, values_fn = list(value = max)) %>% full_join(wls_match_par) %>%
full_join(wls_match_stp)
## # A tibble: 44,342 x 69
## year SID A C DEP E IQ LOC N `NA` O OP PA SE HHID yearBrth authonomy EnvMastery exercise grsWages
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1992 9000… 4.2 5 1 4.20 NA 5.33 4.36 1.33 4.6 NA NA 4.67 1.21e6 1938 4.67 4.56 1.5 66000
## 2 1992 9000… 6.4 6.4 0.55 6.8 NA 5.67 1.72 0 5.2 NA NA 5.44 1.20e6 1939 5.31 5.31 3.5 38500
## 3 1992 9000… 6.2 4.8 0.7 3.8 NA 6 2.2 0.375 2.4 NA 6.75 3.86 1.20e6 1933 NA NA 2.5 NA
## 4 1992 9000… NA NA NA NA NA NaN NA NA NA NA NA NaN 1.20e6 1938 NA NA NA 12500
## 5 1992 9000… 4.4 5.8 0.25 4.6 NA 5.67 1.96 0 3.2 NA NA 6.09 1.21e6 1939 5.09 5.64 2.5 48800
## 6 1992 9000… NA NA NA NA 9 NA NA NA NA NA NA 5.5 1.21e6 1944 NA NA NA NA
## 7 1992 9000… NA NA NA NA NA 1 NA NA NA NA NA 4.55 1.21e6 1938 4.27 4 NA 19451.
## 8 1992 9000… 5.4 6 1.05 4 4 3.5 2.92 0 4.4 NA 7 4.33 1.21e6 1941 NA NA 4 NA
## 9 1992 9000… 5.40 6.4 0.6 5.2 NA 5 3.16 0 5.2 NA NA 5.78 1.20e6 1939 5.22 5.44 3 65350
## 10 1992 9000… 5.2 6.8 0.6 5.4 NA 6.33 3.4 0.333 5.8 NA NA 6.22 1.21e6 1939 5.33 6 3 44250
## # … with 44,332 more rows, and 49 more variables: hospitalized <dbl>, PersGrowth <dbl>, Purpose <dbl>, hlthcare <dbl>, PhysFunc <dbl>, welfare <dbl>,
## # ageMarried <dbl>, disability <dbl>, urban <dbl>, ageFrstDep <dbl>, alcohol <dbl>, anemia <dbl>, apprncRel10yrs <dbl>, backPain <dbl>, BMI <dbl>,
## # depIntUnit <dbl>, depLngthUnit <dbl>, dizziness <dbl>, education <dbl>, employed <dbl>, everSmoked <dbl>, fatigue <dbl>, headaches <dbl>,
## # height <dbl>, HHsize <dbl>, hlthRel10yrs <dbl>, hlthRelOthrs <dbl>, married <dbl>, numKids <dbl>, parDivorce <dbl>, religion <dbl>, satFam <dbl>,
## # satJob <dbl>, sleepProb <dbl>, smokePacks <dbl>, smokes <dbl>, smokeYears <dbl>, SRhealth <dbl>, weight <dbl>, dadAlive <dbl>, dadEdu <dbl>,
## # dadJob <dbl>, dadOccPrstg <dbl>, momAlive <dbl>, momEdu <dbl>, momOccPrstg <dbl>, parGrsWages <dbl>, gender <dbl>, physhlthevnt <dbl>
2.11.3 Personality Variables
wls_pers <- wls_codebook %>%
filter(category == "pers") %>%
group_by(name) %>%
nest() %>%
ungroup() %>%
filter(name != "yearBrth") %>%
mutate(data = map2(data, name, rename_fun)) %>%
unnest(data) %>%
distinct() %>%
mutate(year = ifelse(sib == "sib", as.numeric(year) - 1, year)) %>%
left_join(p_waves %>% filter(Study == "WLS") %>% select(name = p_item, Used)) %>%
filter(year %in% Used) %>%
unite(SID, SID, sib, sep = "")
recode_fun <- function(rule, y){
x <- y$value
if(!is.na(rule)){y$value <- eval(parse(text = rule))}
return(y)
}
# recode
wls_pers <- wls_pers %>%
select(-orig_itemname) %>%
group_by(recode) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(recode, data, recode_fun)) %>%
unnest(data)
# reverse code
wls_pers <- wls_pers %>%
mutate(value = ifelse(tolower(reverse_code) == "no" | is.na(reverse_code),
value, reverse.code(-1, value, mini = mini, maxi = maxi)))
# alpha's
wls_alpha <- wls_pers %>%
select(name, itemname, year, SID, value, comp_rule) %>%
group_by(name, year, SID, comp_rule, itemname) %>%
summarize(value = mean(value, na.rm = T)) %>%
group_by(name, year, comp_rule) %>%
nest() %>%
mutate(data = map(data, ~(.) %>% pivot_wider(names_from = itemname, values_from = value, values_fn = list(value = mean))),
alpha = map(data, possibly(~psych::alpha((.) %>% select(-SID)), NA_real_)))
# create composites
comp_fun <- function(df, rule){
df %>%
group_by(SID) %>%
summarize(value = fun_call(value, rule)) %>%
ungroup()
}
# create composites
wls_pers <- wls_pers %>%
group_by(name, comp_rule, year) %>%
nest() %>%
ungroup() %>%
mutate(comp_rule = ifelse(is.na(comp_rule), "average", comp_rule),
data = map2(data, comp_rule, comp_fun)) %>%
unnest()
## # A tibble: 243,434 x 5
## name year comp_rule SID value
## <chr> <chr> <chr> <chr> <dbl>
## 1 A 1992 average 900021grad 4.2
## 2 A 1992 average 900026grad 6.4
## 3 A 1992 average 900026sib 6.2
## 4 A 1992 average 900042grad 4.4
## 5 A 1992 average 900043sib 5.4
## 6 A 1992 average 900054grad 5.40
## 7 A 1992 average 900069grad 5.2
## 8 A 1992 average 900074sib 6.8
## 9 A 1992 average 900075grad 5.4
## 10 A 1992 average 900078grad 6.6
## # … with 243,424 more rows
2.11.4 Outcome Variables
wls_out <- wls_codebook %>%
filter(category == "out" & year != "0") %>%
group_by(name) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(data, name, rename_fun)) %>%
unnest(data) %>%
distinct() %>%
full_join(crossing(p_year = wls_waves$Used, name = unique((.)$name))) %>%
unite(SID, SID, sib, sep = "")
old.names <- (wls_codebook %>% filter(category == "out" & year == "0"))$orig_itemname
wls_out_stp <- wls %>%
select(SID = idpub, HHID = familypub, one_of(old.names)) %>%
gather(key = orig_itemname, value = value, -SID, -HHID, na.rm=T) %>%
left_join(wls_codebook %>% select(name:year, orig_itemname, recode, comp_rule, sib)) %>%
unite(SID, SID, sib, sep = "")
# recode
recode_fun <- function(rule, y, p_year, year){
yearBrth <- y$yearBrth
x <- y$value
if(!is.na(rule)){y$value <- eval(parse(text = rule))}
return(y)
}
wls_out <- wls_out %>%
left_join(yrBrth) %>%
select(-orig_itemname) %>%
group_by(recode, year, p_year) %>%
nest() %>%
ungroup() %>%
mutate(recode = str_replace_all(recode, "ifesle", "ifelse"),
year = as.numeric(year),
data = pmap(list(recode, data, p_year, year), recode_fun)) %>%
unnest(data) %>%
distinct()
wls_waves <- p_waves %>% filter(Study == "WLS") %>% select(Used) %>% distinct()
wls_out_stp <- wls_out_stp %>%
full_join(crossing(p_year = wls_waves$Used, year = "0")) %>%
group_by(recode, p_year, year) %>%
nest() %>%
ungroup() %>%
mutate(data = pmap(list(recode, data, p_year), recode_fun)) %>%
unnest(data) %>%
left_join(yrBrth)
# composite within years
wls_out <- wls_out %>%
select(year, p_year:itemname, yearBrth) %>%
distinct() %>%
# filter(year <= max(gsoep_waves$Used)) %>%
group_by(SID, name, HHID, year, yearBrth) %>%
summarize(value = max(value, na.rm = T)) %>%
ungroup() %>%
mutate(value = ifelse(is.nan(value)|is.infinite(value), NA, value))
# composite across years
comp_fun <- function(P_year){
wls_out %>%
mutate(group = ifelse(year > P_year, "future", "past")) %>%
group_by(SID, HHID, name, group) %>%
summarize(value = max(value, na.rm = T)) %>%
ungroup() %>%
mutate(value = ifelse(is.infinite(value), NA, value)) %>%
pivot_wider(names_from = group, values_from = value) %>%
group_by(SID, name) %>%
mutate(value = ifelse(is.na(past) | (past == 0 & !is.na(future)), future,
ifelse(past == 0 & is.na(future), past,
ifelse(past == 1, NA, NA))))
}
wls_out <- tibble(p_year = wls_waves$Used) %>%
mutate(data = map(p_year, comp_fun)) %>%
unnest(data) %>%
full_join(wls_out_stp %>% select(p_year,name, SID, HHID, value, yearBrth)) %>%
filter(name != "chldbrth")
## # A tibble: 347,140 x 8
## p_year SID HHID name past future value yearBrth
## <dbl> <chr> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1992 900018grad 1201627 divorced 0 NA 0 NA
## 2 1992 900018grad 1201627 married NA NA NA NA
## 3 1992 900018grad 1201627 unemployed NA NA NA NA
## 4 1992 900018grad 1201627 vlntred NA NA NA NA
## 5 1992 900018sib 1201627 divorced 0 NA 0 NA
## 6 1992 900018sib 1201627 married NA NA NA NA
## 7 1992 900018sib 1201627 unemployed NA NA NA NA
## 8 1992 900018sib 1201627 vlntred NA NA NA NA
## 9 1992 900021grad 1205260 crim NA 0 0 NA
## 10 1992 900021grad 1205260 divorced 0 1 1 NA
## # … with 347,130 more rows
2.11.5 Covariates
wls_match <- wls_pers %>%
select(-comp_rule) %>%
mutate(year = as.numeric(year)) %>%
spread(name, value) %>%
full_join(wls_match %>% rename(year = p_year)) %>%
# physical health event
full_join(wls_out %>% filter(name == "physhlthevnt") %>% select(year = p_year, SID, physhlthevnt = past) %>% distinct() %>% group_by(SID, year) %>% summarize(physhlthevnt = max(physhlthevnt)) %>% ungroup()) %>%
distinct()
# age
wls_match <- wls_match %>%
mutate(age = year - yearBrth)
wls_out <- wls_out %>%
select(-past, -future, -yearBrth) %>%
distinct()
wls_SCA <- wls_match %>%
select(SID, p_year = year, one_of(c(unique(specifications$name), "momOccPrstg", "dadOccPrstg", "momEdu", "dadEdu"))) %>%
group_by(SID) %>%
mutate(parEdu = max(cbind(momEdu, dadEdu), na.rm = T),
parOccPrstg = max(cbind(momOccPrstg, dadOccPrstg), na.rm = T)) %>%
mutate_at(vars(parEdu, parOccPrstg), ~ifelse(is.infinite(.), NA, .)) %>%
ungroup() %>%
select(-momEdu, -dadEdu, -momOccPrstg, -dadOccPrstg)
unique(specifications$name)[!unique(specifications$name) %in% colnames(wls_match2)]
## # A tibble: 44,342 x 19
## SID p_year age education gender grsWages physhlthevnt SRhealth smokes alcohol exercise BMI married numKids parDivorce PhysFunc religion parEdu
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <dbl> <dbl>
## 1 9000… 1992 54 2 0 66000 1 4 NA 1 1.5 29 1 4 0 0 1 0
## 2 9000… 1992 53 2 0 38500 0 5 NA 1 3.5 27 1 3 0 0 1 2
## 3 9000… 1992 59 NA 0 NA NA NA NA <NA> 2.5 24 NA NA NA <NA> NA 2
## 4 9000… 1992 54 NA 0 12500 NA NA NA <NA> NA NA 1 2 0 <NA> 1 0
## 5 9000… 1992 53 NA 0 48800 0 5 0 0 2.5 27 1 3 0 0 1 2
## 6 9000… 1992 48 NA 0 NA NA NA NA <NA> NA NA NA NA NA <NA> NA 2
## 7 9000… 1992 54 NA 0 19451. NA NA NA 1 NA NA 1 2 0 <NA> 1 0
## 8 9000… 1992 51 NA 0 NA NA NA NA <NA> 4 25 NA NA NA <NA> NA 0
## 9 9000… 1992 53 NA 0 65350 0 5 0 1 3 27 1 3 0 0 1 0
## 10 9000… 1992 53 2 0 44250 0 5 0 1 3 25 1 3 0 0 0 1
## # … with 44,332 more rows, and 1 more variable: parOccPrstg <dbl>
2.11.6 Imputation
# short helper functions to (1) identify and create factors, save col names of factors, remove columns with all missing values, and setup amelia
factor_fun <- function(x){if(is.numeric(x)){diff(range(x, na.rm = T)) %in% 1:2 & length(unique(x)) <= 4} else{F}}
not_all_id <- function(x) if(is.numeric(x)) sd(x, na.rm = T) != 0
mice_fun <- function(df){
mice(df, m = 5, maxit=5, printFlag=TRUE)
}
wls_match_imp <- wls_match %>%
rename(NegAff = `NA`) %>%
group_by(SID, year) %>%
mutate(parOccPrstg = max(cbind(momOccPrstg, dadOccPrstg), na.rm = T)) %>%
select(-momOccPrstg, -dadOccPrstg, -HHID) %>%
ungroup() %>%
mutate_all(~ifelse(is.infinite(.) | is.nan(.), NA, .)) %>%
filter(!is.na(year) & !is.na(SID)) %>%
group_by(year) %>%
nest() %>%
ungroup() %>%
mutate(data = map(data, ~(.) %>% select_if(not_all_na)),
data = map(data, ~(.) %>% mutate_if(factor_fun, as.factor)),
imp = map(data, mice_fun))
beepr::beep(sound = 8)
wls_match_imp <- wls_match_imp %>%
mutate(imp_df = map(imp, ~mice::complete(., "long")),
imp_df = map(imp_df, ~(.) %>%
group_by(SID) %>%
mutate(parEdu = max(cbind(momEdu, dadEdu), na.rm = T)) %>%
select(-momEdu, -dadEdu) %>%
ungroup()),
imp_sca = map(imp_df, ~(.) %>% filter(.imp == 1) %>% select(SID, one_of(unique(specifications$name)))))
wls_match_imp_long <- wls_match_imp %>%
select(p_year = year, imp_df) %>%
mutate(imp_df = map(imp_df, ~(.) %>% mutate_if(is.factor, ~as.numeric(as.character(.)))),
imp_df = map(imp_df, function(x){if(any(colnames(x) == "relSat")){(x) %>% mutate(relSat = as.numeric(as.character(relSat)))} else{x}})) %>%
unnest(imp_df)
wls_SCA_imp <- wls_match_imp %>%
select(p_year = year, imp_sca) %>%
unnest(imp_sca) %>%
left_join(wls_SCA %>% select(p_year=year, SID,
colnames(wls_SCA)[!colnames(wls_SCA) %in% colnames(.)])) %>%
mutate(race = factor(0)) %>%
mutate_at(vars(alcohol, PhysFunc), ~factor(ifelse(. > 0, 1, .)))
save(wls_match_imp_long, wls_SCA_imp,
file = sprintf("%s/data/imputed/wls_imputed_small.RData", wd))
save(wls_match_imp,
file = sprintf("%s/data/imputed/wls_imputed.RData", wd))
save(wls_alpha, wls_pers, wls_out, wls_match, wls_SCA,
file = sprintf("%s/data/clean/wls_cleaned.RData", wd))
rm(list =ls()[grepl("wls", ls())])