Chapter 2 Data Cleaning

2.1 Add Health

The National Study of Adolescent to Adult Health (Ad Health; Harris & Udry, 2018) is an ongoing longitudinal study of adolescents in the United States that began as a response to a federal mandate to better understand adolescent health. The data are available online at https://www.icpsr.umich.edu/icpsrweb/DSDR/studies/21600.

The initial sample of participants included approximately 20,000 students who completed at home administrations the study. Four waves of data collection (1994-1995, 1996, 2001-2002, and 2008) have been completed. The latest release contains data through 2008. Another wave of collection began in 2016 but has not yet been released. More documentation of the data are available at https://www.icpsr.umich.edu/icpsrweb/content/DSDR/add-health-data-guide.html#intro.

Sample sizes vary by year, from 14,738 (1996) to 20,745 (1994-1995). This provides 99% power to detect a correlation effect size of ~.03.

2.1.1 Load Data

## # A tibble: 533 x 16
##    study  dataset category name   itemname  wave   year new_itemname  orig_itemname description  scale        reverse_code recode    mini  maxi comp_rule
##    <chr>  <chr>   <chr>    <chr>  <chr>     <chr> <dbl> <chr>         <chr>         <chr>        <chr>        <chr>        <chr>    <dbl> <dbl> <chr>    
##  1 addhe… SCH     match    adopt… adopted   SCH    1994 addhealth__m… S25           Adopted      ".\tmissing… <NA>         ifelse(…    NA    NA skip     
##  2 addhe… SCH     match    age    age       SCH    1994 addhealth__m… S1            Age          ".\tmissing… <NA>         ifelse(…    NA    NA skip     
##  3 addhe… SCH     match    alcoh… alc12mo   SCH    1994 <NA>          S59B          Drank Alcoh… ".\tmissing… <NA>         ifelse(…    NA    NA max      
##  4 addhe… SCH     match    alcoh… alcohol   SCH    1994 <NA>          S49           Have Had Al… ".\tmissing… <NA>         ifelse(…    NA    NA max      
##  5 addhe… SCH     match    alcoh… drunk12mo SCH    1994 <NA>          S59C          Got Drunk i… ".\tmissing… <NA>         ifelse(…    NA    NA max      
##  6 addhe… SCH     match    bioPa… livingBi… SCH    1994 <NA>          S26           Live with B… ".\tmissing… <NA>         ifelse(…    NA    NA skip     
##  7 addhe… SCH     match    birth… birthpla… SCH    1994 <NA>          S8            Born in Uni… ".\tmissing… <NA>         ifelse(…    NA    NA max      
##  8 addhe… Parent  match    birth… borninUS  Pare…  1995 <NA>          PC64          Child was b… ".\tmissing… <NA>         ifelse(…    NA    NA max      
##  9 addhe… Parent  match    birth… bornUS    Pare…  1995 <NA>          PA3           Born in Uni… ".\tmissing… <NA>         ifelse(…    NA    NA max      
## 10 addhe… Parent  match    brthw… birthwei… Pare…  1995 <NA>          PC19A_P       Child's bir… ".\tmissing… <NA>         ifelse(…    NA    NA sum      
## # … with 523 more rows

2.1.2 Matching Variables

# rename variables   
# rename variables   
addhealth_match <- addhealth_codebook %>%
  filter(category == "match") %>%
  select(name, itemname, wave, year, orig_itemname, reverse_code:comp_rule) %>%
  group_by(name) %>% 
  nest() %>%
  ungroup() %>%
  mutate(data = map(data, ~(.) %>% left_join(addhealth_long))) %>%
  unnest()

yrBrth <- addhealth_match %>%
  filter(name == "yearBrth") %>%
  mutate(yearBrth = value + 1900) %>%
  select(SID, yearBrth)

# recode 
recode_fun <- function(rule, y, year){
  yearBrth <- y$yearBrth
  x <- y$value
  if(!is.na(rule)){y$value <- eval(parse(text = rule))}
  return(y)
}

addhealth_match <- addhealth_match %>% 
  filter(name != "age") %>% 
  left_join(yrBrth) %>%
  group_by(recode, year) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = pmap(list(recode, data, year), recode_fun)) %>%
  unnest(data)

# reverse code 
addhealth_match <- addhealth_match %>%
  mutate(value = ifelse(reverse_code == "no" | is.na(reverse_code), value, 
                        reverse.code(-1, value, mini = mini, maxi = maxi)))

fun_call <- function(x, rule){
    switch(rule,
           average = mean(x, na.rm = T),
           mode = Mode(x)[1],
           sum = sum(x, na.rm = T),
           skip = unique(x)[1],
           max = max(x, na.rm = T),
           min = min(x, na.rm = T))
  }

# compositing within years
year_comp_fun <- function(df, rule){
  df %>%
    # group by person and item (collapse across age)
    group_by(SID, yearBrth, name, year) %>% 
    summarize(value = fun_call(value, rule)) %>%
    ungroup() %>% 
    mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
}

addhealth_waves <- p_waves %>% filter(Study == "Add Health") %>% select(Used) %>% distinct()

addhealth_match <- addhealth_match %>%
  filter(year <= max(addhealth_waves$Used)) %>%
  mutate(comp_rule = ifelse(is.na(comp_rule), "skip", comp_rule),
         comp_rule = ifelse(comp_rule == "mx", "max", comp_rule)) %>%
  group_by(comp_rule) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = map2(data, comp_rule, year_comp_fun)) %>%
  unnest(data)

comp_fun <- function(rule, p_year){
  addhealth_match %>%
    filter((year <= 1996 | year <= p_year) & comp_rule == rule) %>%
    group_by(SID, yearBrth, name) %>%
    summarize(value = fun_call(value, rule)) %>%
    ungroup() %>%
    distinct()
}

addhealth_match <- crossing(
  p_year = addhealth_waves$Used, 
  comp_rule = unique(addhealth_match$comp_rule)
  ) %>%
  mutate(data = map2(comp_rule, p_year, comp_fun)) %>%
  unnest(data) %>% 
  distinct() %>%
  mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value)) %>%
  select(-comp_rule) %>%
  spread(name, value) 
addhealth_match
## # A tibble: 19,515 x 120
##    p_year SID   physhlthevnt     A     C     DEP     E    IQ   LOC     N  `NA`    PA    SE   SWL adopted ageMarried alcohol bioParinHH birthplace
##     <dbl> <chr>        <dbl> <dbl> <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl>      <dbl>   <dbl>      <dbl>      <dbl>
##  1   1995 5710…            0   3.5  3.5   0.5       NA  28     2      NA   0.5     0  5       NA       0         NA       1         NA          1
##  2   1996 5710…            0  NA   NA    NA         NA  NA    NA      NA  NA      NA NA       NA       0         NA       1         NA          1
##  3   2001 5710…           NA  NA   NA     1.75      NA  NA    NA      NA   2      NA  4.75     3       0         NA       1         NA          1
##  4   1995 5710…            0   3    2.75  0.611     NA  59.5   2      NA   0.5     0  3.83    NA       0         NA       1         NA          1
##  5   1996 5710…            0  NA   NA     0.167      4  NA     3.5     3   0       0  5       NA       0         NA       1         NA          1
##  6   2001 5710…           NA  NA   NA     1.25      NA  NA    NA      NA   1      NA  3.5      4       0         NA       1         NA          1
##  7   1995 5710…            0   4.5  3     0.0556    NA  79     3      NA   0       0  4.83    NA       0         NA       0         NA          1
##  8   1996 5710…            0  NA   NA    NA         NA  NA    NA      NA  NA      NA NA       NA       0         NA       0         NA          1
##  9   2001 5710…            0  NA   NA    NA         NA  NA    NA      NA  NA      NA NA       NA       0         NA       0         NA          1
## 10   1995 5710…            0   4.5  2     1.17      NA  57     5      NA   0.5     1  4.67    NA      NA         NA       0         NA          1
## # … with 19,505 more rows, and 101 more variables: brthweight <dbl>, childFrnd <dbl>, childHlthProb <dbl>, childTemper <dbl>, childTrustwrthy <dbl>,
## #   chldAgeHlthProb <dbl>, chldHlthIns <dbl>, chldHlthProb <dbl>, clubs <dbl>, dadAlc <dbl>, dadBornUS <dbl>, dadCare <dbl>, dadEdu <dbl>,
## #   dadEmployed <dbl>, dadHealthProb <dbl>, dadInHH <dbl>, date <dbl>, dentist <dbl>, disability <dbl>, drugs <dbl>, drVisits <dbl>,
## #   durBreastFeed <dbl>, employed <dbl>, everLiveBioDad <dbl>, everLiveBioMom <dbl>, exercise <dbl>, expAIDS <dbl>, expDie25 <dbl>,
## #   expGradCollege <dbl>, expInc <dbl>, expLive35 <dbl>, expMarried25 <dbl>, extracurricular <dbl>, friendship <dbl>, frndsGoodInf <dbl>, gender <dbl>,
## #   grade <dbl>, grades <dbl>, grsWages <dbl>, height <dbl>, HHsize <dbl>, hlthProb <dbl>, hlthSymp <dbl>, involvedSchWrk <dbl>, liveMom <dbl>,
## #   momAlc <dbl>, momAlive <dbl>, momBornUS <dbl>, momCare <dbl>, momEdu <dbl>, momEmployed <dbl>, momHealthProb <dbl>, momInHH <dbl>,
## #   moneyForBills <dbl>, nbhdQual <dbl>, numMarriages <dbl>, numSiblng <dbl>, ParDisabled <dbl>, parDivorce <dbl>, parExpGrad <dbl>,
## #   parMarriedAge <dbl>, parReligFreq <dbl>, parReligImp <dbl>, parReligion <dbl>, parReligScript <dbl>, parRelSat <dbl>, parRepHlth <dbl>,
## #   parSmokes <dbl>, parTalkSex <dbl>, ParTalkSex <dbl>, parUnemployBen <dbl>, physFunc <dbl>, PhysFunc <fct>, prntAlcohol <dbl>, prntWelfare <dbl>,
## #   race <dbl>, recoverFast <dbl>, relChld <dbl>, religFreq <dbl>, religImp <dbl>, religion <dbl>, religScript <dbl>, sameHouse <dbl>, satHH <dbl>,
## #   satSchl <dbl>, school <dbl>, skipClass <dbl>, smokes <dbl>, sports <dbl>, SRhealth <dbl>, twin <dbl>, Understandchild <dbl>, weapon <dbl>,
## #   weed <dbl>, weight <dbl>, yearBrth <dbl>, yearMoveUS <dbl>, yearsNoDad <dbl>, yearsNoMom <dbl>, age <dbl>, …

2.1.3 Personality Variables

## # A tibble: 105,398 x 4
##    SID      name   year value
##    <chr>    <chr> <dbl> <dbl>
##  1 57100270 A      1995  3.5 
##  2 57100270 C      1995  3.5 
##  3 57100270 DEP    1995  0.5 
##  4 57100270 DEP    2001  1.75
##  5 57100270 IQ     1995 28   
##  6 57100270 LOC    1995  2   
##  7 57100270 NA     1995  0.5 
##  8 57100270 NA     2001  2   
##  9 57100270 PA     1995  0   
## 10 57100270 SE     1995  5   
## # … with 105,388 more rows

2.1.4 Outcome Variables

# missing all edu 2008 variables
addhealth_out <- addhealth_codebook %>% 
  filter(category == "out") %>%
  select(name, itemname, wave, year, orig_itemname, reverse_code:comp_rule) %>%
  left_join(addhealth_long) %>%
  distinct() %>%
  full_join(crossing(p_year = addhealth_waves$Used, name = unique((.)$name))) 
# recode
recode_fun <- function(rule, y, p_year, year){
  yearBrth <- y$yearBrth
  x <- y$value
  if(!is.na(rule)){y$value <- eval(parse(text = rule))}
  return(y)
}

addhealth_out <- addhealth_out %>%
  left_join(yrBrth) %>%
  select(-orig_itemname) %>%
  group_by(recode, year, p_year) %>%
  nest() %>%
  ungroup() %>%
  mutate(year = as.numeric(year),
         data = pmap(list(recode, data, p_year, year), recode_fun)) %>%
  unnest(data) %>%
  distinct()

# composite within years 
# compositing within years
addhealth_out <- addhealth_out %>%
  group_by(SID, name, year, p_year) %>%
  summarize(value = max(value)) %>%
  ungroup() %>%
  mutate(value = ifelse(is.nan(value) | is.infinite(value), NA, value))

addhealth_out <- addhealth_out %>%
  mutate(group = ifelse(year > p_year, "future", "past")) %>%
    group_by(SID, name, group, p_year) %>%
    summarize(value = max(value)) %>%
    ungroup() %>%
    mutate(value = ifelse(is.infinite(value), NA, value)) %>%
    pivot_wider(names_from = group, values_from = value) %>%
    group_by(SID, name) %>%
    mutate(value = ifelse(is.na(past) | (past == 0 & !is.na(future)), future,
                   ifelse(past == 0 & is.na(future), past, 
                   ifelse(past == 1, NA, NA)))) %>%
    ungroup() %>%
  mutate(value = ifelse(name == "crim" & is.na(value), 0, value))
## # A tibble: 198,117 x 4
##    SID      name     p_year value
##    <chr>    <chr>     <dbl> <dbl>
##  1 57100270 chldbrth   1995     0
##  2 57100270 chldbrth   1996     0
##  3 57100270 chldbrth   2001     0
##  4 57100270 crim       1995     0
##  5 57100270 crim       1996     0
##  6 57100270 crim       2001     0
##  7 57100270 divorced   1995     0
##  8 57100270 divorced   1996     0
##  9 57100270 divorced   2001     0
## 10 57100270 edu        1995     0
## # … with 198,107 more rows

2.1.5 Covariates

## # A tibble: 19,515 x 16
##    SID      p_year   age gender grsWages  race physhlthevnt SRhealth smokes alcohol exercise   BMI parDivorce PhysFunc religion parEdu
##    <chr>     <dbl> <dbl>  <dbl>    <dbl> <dbl>        <dbl>    <dbl>  <dbl>   <dbl>    <dbl> <dbl>      <dbl>    <int>    <dbl>  <dbl>
##  1 57100270   1995    18      1    55000     2            0        4      1       1        2  7.58          0        1        2      1
##  2 57100270   1996    19      1    55000     2            0        4      1       1        2  7.58          0        1        2      1
##  3 57100270   2001    24      1    55000     2           NA        4      1       1        2  7.58          0        1        2      1
##  4 57101310   1995    19      1       NA     2            0       NA      1       1        1  3.64         NA        1        0      0
##  5 57101310   1996    20      1       NA     2            0       NA      1       1        1  3.64         NA        1        0      0
##  6 57101310   2001    25      1       NA     2           NA       NA      1       1        1  3.64         NA        1        0      0
##  7 57103171   1995    16      0    45000     0            0        3      0       0        2 NA             0        1        1      0
##  8 57103171   1996    17      0    45000     0            0        3      0       0        2 NA             0        1        1      0
##  9 57103171   2001    22      0    45000     0            0        3      0       0        2 NA             0        1        1      0
## 10 57103869   1995    18      0     9000    NA            0        1     NA       0       NA  5.38          0        1        1     NA
## # … with 19,505 more rows

2.1.6 Imputation

# short helper functions to (1) identify and create factors, save col names of factors, remove columns with all missing values, and setup amelia
factor_fun <- function(x){if(is.numeric(x)){diff(range(x, na.rm = T)) %in% 1:2} else{F}}
not_all_na <- function(x) any(!is.na(x))
mice_fun <- function(df){
  df <- df %>% select(-bioParinHH, -ageMarried, -chldAgeHlthProb, -childHlthProb, -dadInHH,-momInHH, -parReligion, -liveMom, -yearsNoMom, -yearsNoDad, -yearMoveUS)
  mice(df, m = 5, maxit=5, printFlag=TRUE)
}
start <- Sys.time()
addhealth_match_imp <- addhealth_match %>%
  rename(NegAff = `NA`) %>%
  mutate_if(factor_fun, as.factor) %>%
  group_by(p_year) %>% 
  nest() %>%
  ungroup() %>%
  mutate(data = map(data, ~(.) %>% select_if(not_all_na)),
         imp = map(data, mice_fun))
beepr::beep(sound = 8)
print(Sys.time() - start)

addhealth_match_imp <- addhealth_match_imp %>%
  mutate(imp_df = map(imp, ~mice::complete(., "long")),
         imp_df = map(imp_df, ~(.) %>% select(-PhysFunc)  %>% mutate(PhysFunc = ifelse(physFunc > 0, 1, 0))),
         imp_df = map(imp_df, ~(.) %>%
  group_by(SID) %>%
  mutate(parEdu = max(cbind(momEdu, dadEdu), na.rm = T)) %>%
  ungroup() %>% select(-momEdu, -dadEdu)),
         imp_sca = map(imp_df, ~(.) %>% filter(.imp == 1) %>% select(SID,  one_of(unique(specifications$name)))))

addhealth_match_imp_long <- addhealth_match_imp %>%
  select(p_year, imp_df) %>%
  unnest(imp_df)

addhealth_SCA_imp <- addhealth_match_imp %>%
  select(p_year, imp_sca) %>%
  unnest(imp_sca) %>%
  left_join(addhealth_SCA %>% select(p_year, SID,
        colnames(addhealth_SCA)[!colnames(addhealth_SCA) %in% colnames(.)])) %>%
  mutate(education = factor(0), married = factor(0), numKids = 0, PhysFunc = factor(PhysFunc))

2.2 US

2.2.1 Load Data

## # A tibble: 235 x 17
##    study dataset  category name  itemname  wave waveletter  year new_itemname orig_itemname description  scale reverse_code recode  mini  maxi  comp_rule
##    <chr> <chr>    <chr>    <chr> <chr>    <dbl> <chr>      <dbl> <chr>        <chr>         <chr>        <chr> <lgl>        <chr>   <lgl> <lgl> <chr>    
##  1 US    xwavedat out      chld… chldbrth     0 <NA>           0 US__out_chl… ch1by_dv      Can you ple… Year… NA           ifelse… NA    NA    max      
##  2 US    a_INDRE… out      chld… chldbrth     1 a           2009 US__out_chl… a_ch1by4      Can you ple… Year… NA           ifelse… NA    NA    max      
##  3 US    b_INDRE… out      chld… chldbrth     2 b           2010 US__out_chl… b_ch1by4      Can you ple… Year… NA           ifelse… NA    NA    max      
##  4 US    c_INDRE… out      chld… chldbrth     3 c           2011 US__out_chl… c_ch1by4      Can you ple… Year… NA           ifelse… NA    NA    max      
##  5 US    d_INDRE… out      chld… chldbrth     4 d           2012 US__out_chl… d_ch1by4      Can you ple… Year… NA           ifelse… NA    NA    max      
##  6 US    e_INDRE… out      chld… chldbrth     5 e           2013 US__out_chl… e_ch1by4      Can you ple… Year… NA           ifelse… NA    NA    max      
##  7 US    f_INDRE… out      chld… chldbrth     6 f           2014 US__out_chl… f_ch1by4      Can you ple… Year… NA           ifelse… NA    NA    max      
##  8 US    g_INDRE… out      chld… chldbrth     7 g           2015 US__out_chl… g_ch1by4      Can you ple… Year… NA           ifelse… NA    NA    max      
##  9 US    h_INDRE… out      chld… chldbrth     8 h           2016 US__out_chl… h_ch1by4      Can you ple… Year… NA           ifelse… NA    NA    max      
## 10 US    i_INDRE… out      chld… chldbrth     9 i           2017 US__out_chl… i_ch1by4      Can you ple… Year… NA           ifelse… NA    NA    max      
## # … with 225 more rows

2.2.2 Outcome Variables

## # A tibble: 9,282,789 x 10
##    recode                                                 name     itemname  year reverse_code mini  maxi  comp_rule      PID value
##    <chr>                                                  <chr>    <chr>    <dbl> <lgl>        <lgl> <lgl> <chr>        <dbl> <dbl>
##  1 ifelse(x < 0 & !is.na(x), 0, ifelse(x >  2008, 1, NA)) chldbrth chldbrth  2009 NA           NA    NA    max       68001367     0
##  2 ifelse(x < 0 & !is.na(x), 0, ifelse(x >  2008, 1, NA)) chldbrth chldbrth  2009 NA           NA    NA    max       68004087     0
##  3 ifelse(x < 0 & !is.na(x), 0, ifelse(x >  2008, 1, NA)) chldbrth chldbrth  2009 NA           NA    NA    max       68006127     0
##  4 ifelse(x < 0 & !is.na(x), 0, ifelse(x >  2008, 1, NA)) chldbrth chldbrth  2009 NA           NA    NA    max       68006135     0
##  5 ifelse(x < 0 & !is.na(x), 0, ifelse(x >  2008, 1, NA)) chldbrth chldbrth  2009 NA           NA    NA    max       68006807     0
##  6 ifelse(x < 0 & !is.na(x), 0, ifelse(x >  2008, 1, NA)) chldbrth chldbrth  2009 NA           NA    NA    max       68007487     0
##  7 ifelse(x < 0 & !is.na(x), 0, ifelse(x >  2008, 1, NA)) chldbrth chldbrth  2009 NA           NA    NA    max       68007491     0
##  8 ifelse(x < 0 & !is.na(x), 0, ifelse(x >  2008, 1, NA)) chldbrth chldbrth  2009 NA           NA    NA    max       68007495     0
##  9 ifelse(x < 0 & !is.na(x), 0, ifelse(x >  2008, 1, NA)) chldbrth chldbrth  2009 NA           NA    NA    max       68007499     0
## 10 ifelse(x < 0 & !is.na(x), 0, ifelse(x >  2008, 1, NA)) chldbrth chldbrth  2009 NA           NA    NA    max       68008167     0
## # … with 9,282,779 more rows

2.3 BHPS

The British Household Panel Study (BHPS; University of Essex, 2018) is a longitudinal study of households in the United Kingdom. These data are available online, through application, from https://www.iser.essex.ac.uk/bhps/about/latest-release-of-bhps-data.

Participants were recruited from more than 15,000 individuals from approximately 8,000 households in the United Kingdom. Data have been collected annually since 1991 from approximately 10,000 individuals (5,500 households) in Great Britain but expanded to include Scotland and Wales in 1999 and Northern Ireland in 2001. In 2010, the BHPS stopped data collection, but 6,700 of the current 8,000 participants were solicited to become part of the broader Understanding Society study (University of Essex, 2019). Participants can be matched across studies, so I will use additional data on the original BHPS participants from the Understanding Society study for additional waves of outcome data.

Sample sizes vary by year, ranging from 10,264 (1991) to 14419 (2008). This provides 99% power to detect a zero-order correlation effect size of ~.05, two-tailed at alpha .05.

2.3.1 Load Data

## # A tibble: 1,878 x 23
##    study dataset category name  itemname  wave waveletter  year new_itemname orig_itemname description scale reverse_code recode  mini  maxi comp_rule
##    <chr> <chr>   <chr>    <chr> <chr>    <dbl> <chr>      <dbl> <chr>        <chr>         <chr>       <chr> <chr>        <chr>  <dbl> <dbl> <chr>    
##  1 bhps  aINDRE… match    acci… numAcci…     1 a           1991 bhps__match… anxdts        No. of ser… "Val… <NA>         ifels…    NA    NA max      
##  2 bhps  bINDRE… match    acci… numAcci…     2 b           1992 bhps__match… bnxdts        No. of ser… "Val… <NA>         ifels…    NA    NA max      
##  3 bhps  cINDRE… match    acci… numAcci…     3 c           1993 bhps__match… cnxdts        No. of ser… "Val… <NA>         ifels…    NA    NA max      
##  4 bhps  dINDRE… match    acci… numAcci…     4 d           1994 bhps__match… dnxdts        No. of ser… "Val… <NA>         ifels…    NA    NA max      
##  5 bhps  eINDRE… match    acci… numAcci…     5 e           1995 bhps__match… enxdts        No. of ser… "Val… <NA>         ifels…    NA    NA max      
##  6 bhps  fINDRE… match    acci… numAcci…     6 f           1996 bhps__match… fnxdts        No. of ser… "Val… <NA>         ifels…    NA    NA max      
##  7 bhps  gINDRE… match    acci… numAcci…     7 g           1997 bhps__match… gnxdts        No. of ser… "Val… <NA>         ifels…    NA    NA max      
##  8 bhps  hINDRE… match    acci… numAcci…     8 h           1998 bhps__match… hnxdts        No. of ser… "Val… <NA>         ifels…    NA    NA max      
##  9 bhps  iINDRE… match    acci… numAcci…     9 i           1999 bhps__match… inxdts        No. of ser… "Val… <NA>         ifels…    NA    NA max      
## 10 bhps  jINDRE… match    acci… numAcci…    10 j           2000 bhps__match… jnxdts        No. of ser… "Val… <NA>         ifels…    NA    NA max      
## # … with 1,868 more rows, and 6 more variables: ...18 <chr>, ...19 <chr>, ...20 <chr>, ...21 <chr>, ...22 <chr>, ...23 <chr>

2.3.2 Matching Variables

recode_fun <- function(rule, y, year){
  print(rule)
  yearBrth <- y$yearBrth
  x <- y$value
  if(!is.na(rule)){y$value <- eval(parse(text = rule))}
  return(y)
}

# rename variables   
bhps_match <- bhps_codebook %>%
  filter(category == "match") %>%
  select(-wave, -waveletter) %>%
  group_by(name) %>% 
  nest() %>%
  ungroup() %>%
  mutate(data = map(data, ~(.) %>% left_join(bhps_long))) 

yrBrth  <- bhps_codebook %>%
  filter(name == "yearBrth") %>% 
  left_join(bhps_long) %>%
  filter(!is.na(value)) %>%
  group_by(SID) %>%
  summarize(yearBrth = Mode(value)) %>%
  ungroup() 

bhps_match <- bhps_match %>% 
  filter(name != "yearBrth") %>% 
  mutate(data = map(data, ~(.) %>% full_join(yrBrth)))

# recode 
bhps_match <- bhps_match %>%
  mutate(data = map(data, ~(.) %>%
    select(SID, year, recode, itemname, value, yearBrth) %>%
    distinct() %>%
    group_by(recode, year, itemname) %>% 
    nest() %>%
    ungroup() %>%
    mutate(data = pmap(list(recode, data, year), recode_fun)) %>%
    unnest(data)))

bhps_htwt <- bhps_match %>% 
  filter(name %in% c("height", "weight")) %>% 
  unnest(data) %>%
  filter(!is.na(value)) %>%
  select(year, name, itemname, value, SID) %>% 
  group_by(SID, itemname, year) %>% 
  summarize(value = mean(value, na.rm = T)) %>%  
  ungroup() %>% 
  spread(itemname, value) %>%
  # filter(weightStn != 0) %>%
  mutate(height = rowSums(cbind(heightft, heightin), na.rm = T),
         weight = ifelse(is.na(weightStn), weightkg, rowSums(cbind(weightlbs, weightStn), na.rm = T)),
         height = height/100) %>%
  filter(height > .5 & height < 2.5 & weight > 20 & weight < 150) %>%
  mutate(BMI = weight / (height)^2,
         BMI = ifelse(is.infinite(BMI), NA, BMI)) %>%
  select(-heightft, -heightin, -weightlbs, -weightStn, -weightkg, -year) %>%
  gather(key = name, value = value, height, weight, BMI) %>%
  group_by(SID, name) %>%
  summarize(value = mean(value, na.rm = T)) %>% 
  ungroup() %>%
  spread(name, value) %>%
  mutate_all(~ifelse(is.infinite(.) | is.nan(.), NA, .))

bhps_relig <- bhps_codebook %>%
  filter(name == "religion") %>%
  left_join(bhps_long) %>%
  group_by(year, recode) %>%
  nest() %>%
  mutate(data = pmap(list(recode, data, year), recode_fun)) %>%
  unnest(data) %>%
  ungroup() %>%
  filter(!is.na(value)) %>%
  group_by(SID, name) %>%
  summarize(value = Mode(value))
  

# reverse code 
bhps_match <- bhps_match %>% 
  mutate(data = map(data, ~(.) %>%
    select(-recode) %>%
    left_join(bhps_codebook %>% select(year, itemname, reverse_code, mini:comp_rule)) %>%
    mutate(value = ifelse(reverse_code == "no" | is.na(reverse_code), value, reverse.code(-1, value, mini = mini, maxi = maxi)))))

parIDs <- bhps_match %>% 
  filter(name %in% c("momID", "dadID")) %>% 
  unnest(data) %>%
  select(SID, year, parID = name, value) %>%
  distinct() %>%
  filter(value != 0) %>%
  group_by(SID, parID) %>%
  filter(year == min(year)) %>%
  ungroup() %>%
  spread(parID, value) %>%
  mutate_at(vars(momID, dadID),
     ~mapvalues(., bhps_xwaveid$pid, bhps_xwaveid$pidp, warn_missing = F))

parWages <- bhps_match %>% 
  filter(name == "grsWages") %>%
  unnest(data) %>%
  rename(momID = SID) %>%
  right_join(parIDs %>% select(momID, SID)) %>%
  filter(!is.na(value)) %>%
  mutate(itemname = "momGrsWages") %>%
  select(-(reverse_code:maxi), -momID) %>%
  full_join(
bhps_match %>% 
  filter(name == "grsWages") %>%
  unnest(data) %>%
  rename(dadID = SID) %>%
  right_join(parIDs %>% select(dadID, SID)) %>%
  filter(!is.na(value)) %>%
  mutate(itemname = "dadGrsWages") %>%
  select(-(reverse_code:maxi), -dadID)
  ) %>%
  mutate(name = "parGrsWages") %>%
  group_by(year, name,  SID) %>%
  summarize(value = mean(value, na.rm = T)) %>%
  group_by(SID) %>%
  summarize(parGrsWages = mean(value, na.rm = T)) %>%
  ungroup()

fun_call <- function(x, rule){
    switch(rule,
           average = mean(x, na.rm = T),
           mode = Mode(x)[1],
           sum = sum(x, na.rm = T),
           skip = unique(x)[1],
           max = max(x, na.rm = T),
           min = min(x, na.rm = T))
  }

# compositing within years
year_comp_fun <- function(df, rule){
  rule <- ifelse(is.na(rule), "skip", rule)
  df %>%
    group_by(SID, yearBrth, year) %>% # group by person and item (collapse across age)
    summarize(value = fun_call(value, rule)) %>%
    ungroup() %>%
    mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value)) 
}

bhps_waves <- p_waves %>% filter(Study == "BHPS") %>% select(Used) %>% distinct()

bhps_match <- bhps_match %>%
  mutate(data = map(data, ~(.) %>%
    left_join(yrBrth) %>%
    filter(year <= max(bhps_waves$Used)) %>%
    group_by(comp_rule) %>%
    nest() %>%
    ungroup() %>%
    mutate(data = map2(data, comp_rule, year_comp_fun)) %>%
    unnest(data))) %>%
  unnest(data)

comp_fun <- function(rule, p_year){
  rule <- ifelse(is.na(rule), "skip", rule)
  bhps_match %>%
    filter(year <= p_year  & comp_rule == rule) %>%
    group_by(SID, yearBrth, name) %>%
    summarize(value = fun_call(value, rule)) %>%
    ungroup() %>%
    mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
}

bhps_match <- crossing(
  p_year = bhps_waves$Used, 
  comp_rule = unique(bhps_match$comp_rule)
  ) %>%
  mutate(data = map2(comp_rule, p_year, comp_fun)) %>%
  unnest(data) %>%
  select(-comp_rule) %>%
  filter(name  != "HHID") %>%
  pivot_wider(names_from = name, values_from = value, values_fn = list(value = ~max(., na.rm = T))) %>%
  mutate_all(~ifelse(is.infinite(.), NA, .)) %>%
  full_join(bhps_htwt) %>%
  full_join(parWages) %>%
  filter(!is.na(p_year))
## # A tibble: 95,699 x 67
##      SID p_year     A     C   DEP     E   LOC     N     O    OP    SE    SS   SWB   SWL yearBrth drVisits genderRoles grsWages nbhdQual SRhealth
##    <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>    <dbl>       <dbl>    <dbl>    <dbl>    <dbl>
##  1  9527   1996    NA    NA  1.83    NA    NA    NA    NA    NA     1    NA    10     4     1961     2.33        4.11   13186.    0.833     3.17
##  2 12251   1996    NA    NA  1.83    NA    NA    NA    NA    NA     1    NA    10     5     1971     3.67        3.89   10485.    1         3.17
##  3 12935   1996    NA    NA  1       NA    NA    NA    NA    NA     1    NA     0     4     1973     2.4         4      13109.    0.8       4.8 
##  4 14287   1996    NA    NA  1.92    NA    NA    NA    NA    NA     2    NA    11     5     1961     3.5         3.22   10948.    0.167     2.5 
##  5 14971   1996    NA    NA  2.75    NA    NA    NA    NA    NA     3    NA    21     2     1962     1.83        3.22   10948.    0.167     3.17
##  6 15645   1996    NA    NA  2.5     NA    NA    NA    NA    NA     3    NA    18     2     1956     2           2.67   14686.    1         4   
##  7 16339   1996    NA    NA NA       NA    NA    NA    NA    NA    NA    NA    NA    NA     1972     2.6         4.11   15111.    1         4.83
##  8 17687   1996    NA    NA  1.75    NA    NA    NA    NA    NA     1    NA     9     6     1969     2.5         4.89   11266.    0.833     4.17
##  9 21087   1996    NA    NA  1.92    NA    NA    NA    NA    NA     1    NA    11     3     1964     2.33        3.44   28184.    1         4.17
## 10 21767   1996    NA    NA  2.08    NA    NA    NA    NA    NA     1    NA    13     3     1945     3.17        4.89   29317.    1         3.33
## # … with 95,689 more rows, and 47 more variables: religiosity <dbl>, exercise <dbl>, satFam <dbl>, satHH <dbl>, accidents <dbl>, disability <dbl>,
## #   education <dbl>, employed <dbl>, homeHelp <dbl>, married <dbl>, smokes <dbl>, unempBen <dbl>, welfare <dbl>, mvdBigSlry <dbl>, mvdForWork <dbl>,
## #   numKids <dbl>, debt <dbl>, investments <dbl>, savings <dbl>, PhysFunc <dbl>, dadID <dbl>, dentalIns <dbl>, dentist <dbl>, domesticDuties <dbl>,
## #   eyeDr <dbl>, eyeDrIns <dbl>, gender <dbl>, momID <dbl>, prvntvHlthChcks <dbl>, neighbors <dbl>, satSchl <dbl>, ageMarried <dbl>, weight <dbl>,
## #   height <dbl>, momAgeAtBrth <dbl>, numSiblng <dbl>, BMI <dbl>, parGrsWages <dbl>, physhlthevnt <dbl>, age <dbl>, dadEdu <dbl>, dadOccPrstg <dbl>,
## #   momEdu <dbl>, momOccPrstg <dbl>, parDivorce <dbl>, race <dbl>, religion <dbl>

2.3.3 Personality Variables

# keep correct personality waves
bhps_pers <- bhps_codebook %>% mutate(reverse_code = tolower(reverse_code)) %>%
  select(-new_itemname) %>%
  filter(category == "pers") %>%
  left_join(bhps_long) %>%
  left_join(p_waves %>% filter(Study == "BHPS") %>% select(name = p_item, Used)) %>%
  filter(year %in% bhps_waves$Used) %>%
  distinct()

recode_fun <- function(rule, y){
  x <- y$value
  if(!is.na(rule)){y$value <- eval(parse(text = rule))}
  return(y)
}

# recode 
bhps_pers <- bhps_pers %>%
  select(name, itemname, year, reverse_code:comp_rule, SID:value) %>%
  group_by(recode) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = map2(recode, data, recode_fun)) %>%
  unnest(data)

# reverse code 
bhps_pers <- bhps_pers %>% distinct() %>%
  mutate(value = ifelse(reverse_code == "no" | is.na(reverse_code), value, reverse.code(-1, value, mini = mini, maxi = maxi)))


# alpha's
bhps_alpha <- bhps_pers %>%
  select(name, itemname, year, SID, value) %>%
  group_by(name, year) %>%
  nest() %>%
  mutate(data = map(data, ~(.) %>% distinct() %>% pivot_wider(names_from = itemname, values_from = value)),
         alpha = map(data, possibly(~psych::alpha((.) %>% select(-SID)), NA_real_)))

comp_fun <- function(df, rule){
  df %>%
    group_by(SID, HHID) %>%
    summarize(value = fun_call(value, rule)) %>%
    ungroup() %>%
    mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
}

# create composites
bhps_pers <- bhps_pers %>%
  group_by(name, comp_rule, year) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = map2(data, comp_rule, comp_fun)) %>%
  unnest(data) %>%
  select(-comp_rule, -HHID)
## # A tibble: 291,507 x 4
##    name   year   SID value
##    <chr> <dbl> <dbl> <dbl>
##  1 A      2005 14971  4   
##  2 A      2005 15645  6   
##  3 A      2005 16339 NA   
##  4 A      2005 17015  5.67
##  5 A      2005 22445  5.33
##  6 A      2005 23807  5.33
##  7 A      2005 27284  4   
##  8 A      2005 28575  3.67
##  9 A      2005 30615  5.67
## 10 A      2005 46927  4.67
## # … with 291,497 more rows

2.3.4 Outcome Variables

bhps_subs <- unique(bhps_pers$SID)

us_out <- us_out %>% filter(PID %in% bhps_subs) 

exit <- sprintf("%s/data/bhps/xwlsten.sav", wd) %>% haven::read_sav(.) %>%
  haven::zap_labels(.) %>%
  select(SID = pidp, lrio, lewave) %>%
  mutate(lrio = ifelse(lrio == 99, 1990 + lewave, NA)) %>%
  full_join(crossing(SID = unique((.)$SID), p_year = bhps_waves$Used)) %>%
  # filter(!value < 0)
  mutate(year = ifelse(is.na(lrio) | lrio < 0, NA, lrio),
         value = ifelse(year <= p_year, NA, ifelse(is.na(year),  0, ifelse(year > p_year, 1, NA)))) %>%
  select(-year, -lrio, -lewave) %>% mutate(name = "mortality")

bhps_out <- bhps_codebook %>% 
  select(-new_itemname) %>%
  filter(category == "out") %>%
  left_join(bhps_long) 

recode_fun <- function(rule, y, p_year){
  print(rule)
  x <- y$value
  if(!is.na(rule)){y$value <- eval(parse(text = rule))}
  return(y)
}

# recode
bhps_out <- bhps_out %>%
  select(name, itemname, year, reverse_code:comp_rule, SID:value) %>%
  full_join(crossing(name = unique((.)$name), p_year = bhps_waves$Used)) %>%
  group_by(recode, p_year) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = pmap(list(recode, data, p_year), recode_fun)) %>%
  unnest(data) 

# composite within years 
# compositing within years
bhps_out <- bhps_out %>%
  select(-(reverse_code:maxi), -HHID, -recode) %>%
  full_join(us_out %>% select(p_year:year, comp_rule, SID = PID, value)) %>%
  group_by(SID, name, year, p_year) %>%
  summarize(value = max(value, na.rm = T)) %>%
  ungroup() %>%
  mutate(value = ifelse(is.nan(value) | is.infinite(value), NA, value))

# composite across years
comp_fun <- function(p_year){
  bhps_out %>%
    mutate(group = ifelse(year > p_year, "future", "past")) %>%
    group_by(SID, name, group,) %>%
    summarize(value = max(value, na.rm = T)) %>%
    ungroup() %>%
    mutate(value = ifelse(is.infinite(value), NA, value)) %>%
    pivot_wider(names_from = group, values_from = value) %>%
    group_by(SID, name) %>%
    mutate(value = ifelse(is.na(past) | (past == 0 & !is.na(future)), future,
                   ifelse(past == 0 & is.na(future), past, 
                   ifelse(past == 1, NA, NA)))) 
}

bhps_out <- bhps_out %>%
  mutate(group = ifelse(year > p_year, "future", "past")) %>%
  group_by(SID, name, group, p_year) %>%
  summarize(value = max(value, na.rm = T)) %>%
  ungroup() %>%
  mutate(value = ifelse(is.infinite(value), NA, value)) %>%
  pivot_wider(names_from = group, values_from = value) %>%
  group_by(SID, name, p_year) %>%
  mutate(value = ifelse(is.na(past) | (past == 0 & !is.na(future)), future,
                 ifelse(past == 0 & is.na(future), past, 
                 ifelse(past == 1, NA, NA)))) %>%
  ungroup() 

bhps_out <- bhps_out %>%
  filter(name != "mortality") %>%
  full_join(
    bhps_out %>% 
      filter(name == "mortality") %>%
      full_join(exit) %>% 
      distinct() %>%
      group_by(SID, name, p_year) %>%
      summarize(value = max(value, na.rm = T)) %>%
      ungroup() %>%
      mutate(value = ifelse(is.infinite(value), NA, value)) 
  )
## # A tibble: 1,154,787 x 6
##      SID name         p_year  past future value
##    <dbl> <chr>         <dbl> <dbl>  <dbl> <dbl>
##  1   687 divorced       1996     0     NA     0
##  2   687 divorced       2001     0     NA     0
##  3   687 divorced       2005     0     NA     0
##  4   687 edu            1996     0     NA     0
##  5   687 edu            2001     0     NA     0
##  6   687 edu            2005     0     NA     0
##  7   687 frstjob        1996     1     NA    NA
##  8   687 frstjob        2001     1     NA    NA
##  9   687 frstjob        2005     1     NA    NA
## 10   687 mntlhlthevnt   1996     0     NA     0
## # … with 1,154,777 more rows

2.3.5 Covariates

bhps_match <- bhps_pers %>%
  spread(name, value) %>% 
  rename(p_year = year) %>%
  full_join(bhps_match) %>%
  # physical health event
  left_join(bhps_out %>% filter(name == "physhlthevnt") %>% select(p_year, SID, physhlthevnt = past) %>% distinct() %>% group_by(SID, p_year) %>% summarize(physhlthevnt = max(physhlthevnt)) %>% ungroup()) %>%
  distinct()

# parental divorce 
bhps_match <- bhps_out %>% 
  filter(name == "divorced") %>%
  rename(momID = SID) %>%
  select(-name) %>%
  right_join(parIDs %>% select(momID, SID)) %>%
  filter(!is.na(past)) %>%
  mutate(itemname = "momDivorced", name = "parDivorce") %>%
  select(p_year, SID, name, value = past, itemname) %>%
  full_join(
bhps_out %>% 
  filter(name == "divorced") %>%
  rename(dadID = SID) %>%
  right_join(parIDs %>% select(dadID, SID)) %>%
  filter(!is.na(past)) %>%
  mutate(itemname = "dadDivorced", name = "parDivorce") %>%
  select(p_year, SID, name, value = past, itemname)
  ) %>%
  group_by(p_year, SID, name) %>%
  summarize(parDivorce = max(value, na.rm = T)) %>% 
  ungroup() %>%
  select(-name) %>%
  right_join(bhps_match)

bhps_match <- bhps_codebook %>%
  filter(grepl("OccPrstg", name)) %>%
  group_by(name) %>% 
  nest() %>%
  ungroup() %>%
  mutate(data = map(data, ~(.) %>% left_join(bhps_long))) %>%
  unnest(data) %>%
  mutate(value = ifelse(value < 0, NA, value)) %>%
  filter(!is.na(value)) %>%
  group_by(SID, name) %>%
  filter(year == min(year)) %>%
  select(name, SID, value) %>%
  distinct() %>%
  spread(name, value) %>%
  right_join(bhps_match) %>%
  ungroup() %>%
  filter(!is.na(p_year)) %>%
  # select(-comp_rule, -HHID) %>%
  mutate_all(~ifelse(is.infinite(.), NA, .))

bhps_out <- bhps_out %>%
  select(-past, -future) %>%
  distinct()

bhps_dem <- bhps_match %>% 
  select(SID, dadOccPrstg, momOccPrstg, parDivorce, 
         race = ethnicity, momEdu, dadEdu)%>%
  gather(key = item, value = value, -SID, na.rm = T) %>%
  group_by(SID, item) %>%
  summarize(value = max(value, na.rm = T)) %>%
  ungroup() %>%
  spread(item,value) %>%
  full_join(bhps_relig %>% spread(name, value))

# age
bhps_match <- bhps_match %>%
  mutate(age = p_year - yearBrth) %>%
  select(-dadOccPrstg, -momOccPrstg, -parDivorce, 
         -ethnicity, -momEdu, -dadEdu, -religion) %>%
  full_join(bhps_dem)


bhps_SCA <- bhps_match %>% 
  select(SID, p_year, one_of(c(unique(specifications$name), "momOccPrstg", "dadOccPrstg", "momEdu", "dadEdu"))) %>%
  group_by(SID) %>%
  mutate(parEdu = max(cbind(momEdu, dadEdu), na.rm = T),
         parOccPrstg = max(cbind(momOccPrstg, dadOccPrstg), na.rm = T)) %>%
  mutate_at(vars(parEdu, parOccPrstg), ~ifelse(is.infinite(.), NA, .)) %>%
  ungroup() %>% 
  select(-momEdu, -dadEdu, -momOccPrstg, -dadOccPrstg)
## # A tibble: 95,699 x 19
##      SID p_year   age education gender grsWages  race physhlthevnt SRhealth smokes exercise   BMI married numKids parDivorce PhysFunc religion parEdu
##    <dbl>  <dbl> <dbl>     <dbl>  <dbl>    <dbl> <dbl>        <dbl>    <dbl>  <dbl>    <dbl> <dbl>   <dbl>   <dbl>      <dbl>    <dbl>    <dbl>  <dbl>
##  1  9527   1996    35         0      1   13186.     0            1     3.17      1      3.5    NA       0       1         NA       NA       NA     NA
##  2 12251   1996    25         0      1   10485.     2            1     3.17      0      1      NA       0      NA          1       NA       NA      0
##  3 12935   1996    23         0      1   13109.     1            0     4.8       1      3      NA       0      NA          1       NA       NA     NA
##  4 14287   1996    35         0      0   10948.     0            1     2.5       1      1      NA       1       1         NA       NA       NA     NA
##  5 14971   1996    34         0      1   10948.     1            1     3.17      1      2      NA       1       1         NA       NA       NA      1
##  6 15645   1996    40         1      1   14686.     0            1     4         0      3      NA       1       3         NA       NA       NA      0
##  7 16339   1996    24         0      1   15111.     0            0     4.83      0     NA      NA       0      NA          0       NA       NA     NA
##  8 17687   1996    27         0      1   11266.     0            0     4.17      1      3      NA       0       1         NA       NA       NA     NA
##  9 21087   1996    32         0      1   28184.     1            1     4.17      1      3.5    NA       0      NA         NA       NA       NA     NA
## 10 21767   1996    51         2      1   29317.     0            0     3.33      0      5      NA       0      NA         NA       NA       NA     NA
## # … with 95,689 more rows, and 1 more variable: parOccPrstg <dbl>

2.3.6 Imputation

# short helper functions to (1) identify and create factors, save col names of factors, remove columns with all missing values, and setup amelia
factor_fun <- function(x){if(is.numeric(x)){diff(range(x, na.rm = T)) %in% 1:2} else{F}}
not_all_na <- function(x) any(!is.na(x))
not_all_id <- function(x) if(is.numeric(x)) sd(x, na.rm = T) != 0
mice_fun <- function(df){
  df <- df %>% select(-homeHelp, -dentalIns, -eyeDrIns)
  mice(df, m = 5, maxit=5, printFlag=TRUE, method = "cart")
}
start <- Sys.time()
bhps_match_imp <- bhps_match %>%
  group_by(SID, p_year) %>%
  mutate(parOccPrstg = max(cbind(momOccPrstg, dadOccPrstg), na.rm = T)) %>%
  ungroup() %>%
  # filter(p_year==2001) %>%
  select(everything(), -dadID, -momID, -momOccPrstg,-dadOccPrstg) %>%
  filter(!is.na(p_year) & !is.na(SID) & !is.na(yearBrth)) %>%
  mutate(parOccPrstg = ifelse(is.infinite(parOccPrstg), NA, parOccPrstg)) %>%
  group_by(p_year) %>% 
  nest() %>%
  ungroup() %>%
  mutate(data = map(data, ~(.) %>% select_if(not_all_na) %>% select_if(not_all_id)),
         data = map(data, ~(.) %>% mutate_if(factor_fun, as.factor)),
         imp = map(data, mice_fun))
beepr::beep(sound = 8)
print(Sys.time() - start)

bhps_match_imp <- bhps_match_imp %>%
  filter(year != 1991) %>%
  mutate(imp_df = map(imp, ~mice::complete(., "long")),
         imp_df = map(imp_df, ~(.) %>% 
    group_by(SID) %>%
    mutate(parEdu = max(cbind(momEdu, dadEdu), na.rm = T)) %>%
    select(-momEdu, -dadEdu) %>%
    ungroup()),
         imp_sca = map(imp_df, ~(.) %>% filter(.imp == 1) %>% select(SID,  one_of(unique(specifications$name)))))

bhps_match_imp_long <- bhps_match_imp %>%
  select(p_year, imp_df) %>%
  unnest(imp_df) %>%
  select(-SWB)

bhps_SCA_imp <- bhps_match_imp %>%
  select(p_year, imp_sca) %>%
  unnest(imp_sca) %>%
  full_join(bhps_SCA %>% select(p_year = year, SID, BMI))
unique(specifications$name)[!unique(specifications$name) %in% colnames(bhps_SCA_imp)]

2.4 GSOEP

The German Socioeconomic Panel Study (GSOEP; Socio-Economic Panel, 2017) is an ongoing longitudinal study of German collected by the German Institute of Economic Research (DIW Berlin). The data are freely available at https://www.diw.de/soep by application.

Data have been collected annually since 1984 (the latest data release includes data up to 2017). Participants have been recruited from more than 11,000 households, which are nationally representative of private German households. 20,000 individuals are sampled each year, on average. It is critical to note that the GSOEP samples households, not individuals, and the households consist of individuals living in both the “old” and “new” federal states (the former West and East Germany), foreigners, and recent immigrants to Germany.

Sample size varies by year, ranging from approximately 10,000 (1989) to 31,000 (2013). This provides 99% power to detect a zero-order correlation effect size of ~.06, two-tailed at alpha < .05.

2.4.1 Load Data

## # A tibble: 1,843 x 20
##    study dataset category name  itemname  wave waveletter  year new_itemname orig_itemname description scale reverse_code recode  mini  maxi comp_rule
##    <chr> <chr>   <chr>    <chr> <chr>    <dbl> <chr>      <dbl> <chr>        <chr>         <chr>       <chr> <chr>        <chr>  <dbl> <dbl> <chr>    
##  1 gsoep apequiv match    age   age          1 a           1984 gsoep__matc… d1110184      Age of Ind… <NA>  <NA>         ifels…    NA    NA skip     
##  2 gsoep bpequiv match    age   age          2 b           1985 gsoep__matc… d1110185      Age of Ind… <NA>  <NA>         ifels…    NA    NA skip     
##  3 gsoep cpequiv match    age   age          3 c           1986 gsoep__matc… d1110186      Age of Ind… <NA>  <NA>         ifels…    NA    NA skip     
##  4 gsoep dpequiv match    age   age          4 d           1987 gsoep__matc… d1110187      Age of Ind… <NA>  <NA>         ifels…    NA    NA skip     
##  5 gsoep epequiv match    age   age          5 e           1988 gsoep__matc… d1110188      Age of Ind… <NA>  <NA>         ifels…    NA    NA skip     
##  6 gsoep fpequiv match    age   age          6 f           1989 gsoep__matc… d1110189      Age of Ind… <NA>  <NA>         ifels…    NA    NA skip     
##  7 gsoep gpequiv match    age   age          7 g           1990 gsoep__matc… d1110190      Age of Ind… <NA>  <NA>         ifels…    NA    NA skip     
##  8 gsoep hpequiv match    age   age          8 h           1991 gsoep__matc… d1110191      Age of Ind… <NA>  <NA>         ifels…    NA    NA skip     
##  9 gsoep ipequiv match    age   age          9 i           1992 gsoep__matc… d1110192      Age of Ind… <NA>  <NA>         ifels…    NA    NA skip     
## 10 gsoep jpequiv match    age   age         10 j           1993 gsoep__matc… d1110193      Age of Ind… <NA>  <NA>         ifels…    NA    NA skip     
## # … with 1,833 more rows, and 3 more variables: ...18 <dbl>, ...19 <lgl>, ...20 <lgl>

2.4.2 Matching Variables

recode_fun <- function(rule, y, year){
  yearBrth <- y$yearBrth
  x <- y$value
  if(!is.na(rule)){y$value <- eval(parse(text = rule))}
  return(y)
}

# rename variables   
gsoep_match <- gsoep_codebook %>%
  filter(name == "yearBrth") %>%
  filter(category == "match") %>%
  group_by(name) %>% 
  nest() %>%
  ungroup() %>%
  mutate(data = map(data, ~(.) %>% left_join(gsoep_long))) 

yrBrth <- gsoep_match %>% 
  filter(name == "yearBrth") %>% 
  unnest(data) %>%
  group_by(persnr) %>% 
  summarize(yearBrth = max(value, na.rm = T)) %>%
  ungroup() %>%
  mutate(yearBrth = ifelse(is.infinite(yearBrth), NA, yearBrth)) 

# recode 
gsoep_match <- gsoep_match %>% 
  filter(name != "yearBrth") %>%
  mutate(data = map(data, ~(.) %>%
    left_join(yrBrth) %>%
    group_by(recode, year) %>% 
    nest() %>%
    ungroup() %>%
    mutate(data = pmap(list(recode, data, year), recode_fun)) %>%
    unnest(data)))

# reverse code 
gsoep_match <- gsoep_match %>%
  mutate(data = map(data, ~(.) %>%
    mutate(value = ifelse(reverse_code == "no" | is.na(reverse_code), value,
                          reverse.code(-1, value, mini = mini, maxi = maxi)))))

fun_call <- function(x, rule){
    switch(rule,
           average = mean(x, na.rm = T),
           mode = Mode(x)[1],
           sum = sum(x, na.rm = T),
           skip = unique(x)[1],
           max = max(x, na.rm = T),
           min = min(x, na.rm = T))
  }

# compositing within years
year_comp_fun <- function(df, rule){
  df %>%
    group_by(persnr, yearBrth, year) %>% # group by person and item (collapse across age)
    summarize(value = fun_call(value, rule)) %>%
    ungroup()
}

gsoep_waves <- p_waves %>% filter(Study == "GSOEP") %>% select(Used) %>% distinct()

gsoep_match <- gsoep_match %>%
  mutate(data = map(data, ~(.) %>%
    filter(year <= max(gsoep_waves$Used)) %>%
    group_by(comp_rule) %>%
    nest() %>%
    ungroup() %>%
    mutate(comp_rule = ifelse(is.na(comp_rule), "skip", comp_rule),
           data = map2(data, comp_rule, year_comp_fun)) %>%
    unnest(data))) %>%
  unnest(data)

comp_fun <- function(rule, p_year){
  gsoep_match %>%
    filter(year <= p_year  & comp_rule == rule) %>%
    group_by(persnr, yearBrth, name) %>%
    summarize(value = fun_call(value, rule)) %>%
    ungroup()
}

gsoep_match <- crossing(
  p_year = gsoep_waves$Used, 
  comp_rule = unique(gsoep_match$comp_rule)
  ) %>%
  mutate(data = map2(comp_rule, p_year, comp_fun)) %>%
  unnest(data) %>%
  mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value)) %>%
  select(-comp_rule, -yearBrth) %>%
  pivot_wider(names_from = name, values_from = value) 
## # A tibble: 346,921 x 62
##     year   SID     A     C   DEP     E   LOC     N  `NA`     O    OP    PA    SE    SS   SWL drVisits exercise grsWages satHealth satHH satIncome social
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>    <dbl>    <dbl>     <dbl> <dbl>     <dbl>  <dbl>
##  1  1994   201    NA    NA    NA    NA  2.62    NA    NA    NA    NA    NA    NA    NA     5    20        1       2979.       6.5  3.5       5.36   2.71
##  2  1994   203    NA    NA    NA    NA  3.25    NA    NA    NA    NA    NA    NA    NA     6     3.5      4       5054.       9.5  3.5       6.5    1.42
##  3  1994   204    NA    NA    NA    NA NA       NA    NA    NA    NA    NA    NA    NA    NA    NA       NA       2599.      NA    3.88     NA     NA   
##  4  1994   601    NA    NA    NA    NA  3.5     NA    NA    NA    NA    NA    NA    NA     9     9.33     3.86   31259.       7.4  2.83      7.18   1.93
##  5  1994   602    NA    NA    NA    NA  3       NA    NA    NA    NA    NA    NA    NA     4     9.33     2.86   35436.       6.6  2.83      7.27   1.79
##  6  1994   603    NA    NA    NA    NA NA       NA    NA    NA    NA    NA    NA    NA    NA    NA       NA      66717.      NA    3.08     NA     NA   
##  7  1994   604    NA    NA    NA    NA NA       NA    NA    NA    NA    NA    NA    NA    NA    NA       NA      69202.      NA    3.62     NA     NA   
##  8  1994   901    NA    NA    NA    NA  3.12    NA    NA    NA    NA    NA    NA    NA     6    18.2      3.71   13774.       6.6  3.33      5.91   2.07
##  9  1994  1101    NA    NA    NA    NA  2.25    NA    NA    NA    NA    NA    NA    NA    10    29.8      1       1940.       3.7  4         7.18   2.29
## 10  1994  1202    NA    NA    NA    NA  2.62    NA    NA    NA    NA    NA    NA    NA     9    21.8      1       2989.       5.1  3.6       8      2.71
## # … with 346,911 more rows, and 40 more variables: SRhealth <dbl>, nbhdQual <dbl>, religiosity <dbl>, disability <dbl>, education <dbl>, HHsize <dbl>,
## #   hlthcare <dbl>, married <dbl>, numKids <dbl>, PhysFunc <dbl>, unempBen <dbl>, urban <dbl>, broPres <dbl>, dadPres <dbl>, momPres <dbl>,
## #   religion <dbl>, sisPres <dbl>, ageMarried <dbl>, gender <dbl>, age <dbl>, occPrestige <dbl>, height <dbl>, smokes <dbl>, weight <dbl>,
## #   welfare <dbl>, eatingHabits <dbl>, dadAlive <dbl>, momAlive <dbl>, alcohol <dbl>, satFam <dbl>, auntPres <dbl>, gmaPres <dbl>, gpaPres <dbl>,
## #   unclPres <dbl>, dadEdu <dbl>, dadOccPrstg <dbl>, momEdu <dbl>, momOccPrstg <dbl>, physhlthevnt <dbl>, BMI <dbl>

2.4.3 Personality Variables

## # A tibble: 504,605 x 4
##    name   year   SID value
##    <chr> <dbl> <dbl> <dbl>
##  1 A      2005   201  7   
##  2 A      2005   203  4.67
##  3 A      2005   602  4.33
##  4 A      2005   901  4.67
##  5 A      2005  1202  7   
##  6 A      2005  1501  4.67
##  7 A      2005  1601  4   
##  8 A      2005  1602  6.67
##  9 A      2005  1603  4.33
## 10 A      2005  1701  7   
## # … with 504,595 more rows

2.4.4 Outcome Variables

gsoep_pers_subs <- unique(gsoep_pers$persnr)
gsoep_waves <- p_waves %>% filter(Study == "GSOEP") %>% select(Used) %>% distinct()
exit <- sprintf("%s/data/gsoep/ppfad.sav", wd) %>% haven::read_sav(.) %>%
  select(persnr, todjahr) %>%
  gather(key = orig_itemname, value = value, -persnr) %>%
  full_join(crossing(orig_itemname = "todjahr", p_year = gsoep_waves$Used)) %>%
  # filter(!value < 0)
  mutate(year = ifelse(is.na(value) | value < 0, NA, value),
         value = ifelse(year <= p_year | is.na(year), 0, ifelse(year > p_year, 1, NA))) %>%
  left_join(gsoep_codebook %>% select(orig_itemname, name)) %>% 
  select(-year, -orig_itemname) %>%
  filter(persnr %in% gsoep_pers_subs)

gsoep_out <- gsoep_codebook %>% 
  select(-new_itemname) %>%
  filter(category == "out") %>%
  left_join(gsoep_long %>% filter(persnr %in% gsoep_pers_subs)) 

# recode
gsoep_out <- gsoep_out %>%
  select(name, itemname, year, reverse_code:comp_rule,  persnr, recode, value) %>%
  group_by(recode) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = map2(recode, data, recode_fun)) %>%
  unnest(data) 

# composite within years 
# compositing within years
year_comp_fun <- function(df, rule){
  df %>%
    group_by(persnr, name, year) %>% # group by person and item (collapse across age)
    summarize(value = fun_call(value, rule)) %>%
    ungroup()
}


gsoep_out <- gsoep_out %>%
  # filter(year <= max(gsoep_waves$Used)) %>%
  group_by(comp_rule) %>%
  nest() %>%
  ungroup() %>%
  mutate(comp_rule = ifelse(comp_rule == "select", "skip", comp_rule),
         data = map2(data, comp_rule, year_comp_fun)) %>%
  unnest(data)

# composite across years
comp_fun <- function(p_year){
  gsoep_out %>%
    mutate(group = ifelse(year > p_year, "future", "past")) %>%
    group_by(persnr, name, group) %>%
    summarize(value = max(value, na.rm = T)) %>%
    ungroup() %>%
    mutate(value = ifelse(is.infinite(value), NA, value)) %>%
    pivot_wider(names_from = group, values_from = value) %>%
    group_by(persnr, name) %>%
    mutate(value = ifelse(is.na(past) | (past == 0 & !is.na(future)), future,
                   ifelse(past == 0 & is.na(future), past, 
                   ifelse(past == 1, NA, NA)))) 
}

gsoep_out <- tibble(p_year = gsoep_waves$Used) %>%
  mutate(data = map(p_year, comp_fun)) %>%
  unnest(data) %>%
  filter(name != "mortality") %>%
  full_join(exit)
## # A tibble: 4,008,294 x 6
##    p_year   SID name         future  past value
##     <dbl> <dbl> <chr>         <dbl> <dbl> <dbl>
##  1   2005   201 chldbrth          0     0     0
##  2   2005   201 chldmvout         0     0     0
##  3   2005   201 divorced          0     0     0
##  4   2005   201 edu              NA    NA    NA
##  5   2005   201 frstJob           0     0     0
##  6   2005   201 married           0     0     0
##  7   2005   201 mntlhlthevnt      0     0     0
##  8   2005   201 mvInPrtnr         0     0     0
##  9   2005   201 physhlthevnt      0     1    NA
## 10   2005   201 retired           1     1    NA
## # … with 4,008,284 more rows

2.4.5 Covariates

recode_fun <- function(rule, y){
  x <- y$value
  if(!is.na(rule)){y$value <- eval(parse(text = rule))}
  return(y)
}

old.names <- unique(gsoep_codebook$orig_itemname)
gsoep_bp <- sprintf("%s/data/gsoep/bioparen.sav", wd) %>% 
  haven::read_sav(.) %>%
  select(one_of(old.names), -hhnr) %>%
  gather(key = orig_itemname, value = value, -persnr, -fnr, -mnr, na.rm = T) %>%
  left_join(gsoep_codebook) %>%
  group_by(recode) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = pmap(list(recode, data), recode_fun)) %>%
  unnest(data) %>%
  filter(!is.na(value)) %>%
  group_by(persnr, name, fnr, mnr) %>%
  summarize(value = fun_call(value, comp_rule)) %>% 
  ungroup() %>%
  spread(name, value) %>%
  mutate_at(vars(fnr, mnr), ~ifelse((.) < 0, NA, .)) %>%
  rename(dadID = fnr, momID = mnr)

# parental divorce 
gsoep_match <- gsoep_out %>% 
  filter(name == "divorced") %>%
  rename(momID = persnr) %>%
  right_join(gsoep_bp %>% select(momID, persnr)) %>%
  filter(!is.na(past)) %>%
  mutate(itemname = "momDivorced", name = "parDivorce") %>%
  select(p_year, persnr, name, value = past, itemname) %>%
  full_join(
gsoep_out %>% 
  filter(name == "divorced") %>%
  rename(dadID = persnr) %>%
  right_join(gsoep_bp %>% select(dadID, persnr)) %>%
  filter(!is.na(past)) %>%
  mutate(itemname = "dadDivorced", name = "parDivorce") %>%
  select(p_year, persnr, name, value = past, itemname)
  ) %>%
  group_by(p_year, persnr, name) %>%
  summarize(parDivorce = max(value, na.rm = T)) %>% 
  ungroup() %>%
  select(-name) %>%
  right_join(gsoep_match)

gsoep_match <- gsoep_pers %>%
  spread(name, value) %>% 
  full_join(gsoep_match %>% rename(year = p_year)) %>%
  select(-momOccPrstg, -dadOccPrstg) %>%
  full_join(gsoep_bp %>% select(-momID, -dadID)) %>%
  # physical health event
  left_join(gsoep_out %>% filter(name == "physhlthevnt") %>% select(year = p_year, persnr, physhlthevnt = past) %>% distinct() %>% group_by(persnr, year) %>% summarize(physhlthevnt = max(physhlthevnt)) %>% ungroup()) %>%
  distinct()

gsoep_out <- gsoep_out %>%
  group_by(persnr, p_year, name) %>%
  summarize(value = max(value, na.rm = T)) %>%
  ungroup() %>%
  distinct() %>%
  mutate(value = ifelse(is.infinite(value), NA, value))

gsoep_match <- gsoep_match %>%
  group_by(persnr, year) %>%
  mutate(BMI = ifelse(!is.na(weight) & !is.na(height), weight/(height/100)^2, NA)) %>%
  ungroup()

gsoep_SCA <- gsoep_match %>% 
  select(persnr, p_year = year, one_of(c(unique(specifications$name), "momOccPrstg", "dadOccPrstg", "momEdu", "dadEdu"))) %>%
  group_by(persnr) %>%
  mutate(parEdu = max(cbind(momEdu, dadEdu), na.rm = T),
         parOccPrstg = max(cbind(momOccPrstg, dadOccPrstg), na.rm = T)) %>%
  mutate_at(vars(parEdu, parOccPrstg), ~ifelse(is.infinite(.), NA, .)) %>%
  ungroup() %>% 
  select(-momEdu, -dadEdu, -momOccPrstg, -dadOccPrstg)

unique(specifications$name)[!unique(specifications$name) %in% colnames(gsoep_match)]
## # A tibble: 346,921 x 18
##      SID p_year   age education gender grsWages physhlthevnt SRhealth smokes alcohol exercise   BMI married numKids PhysFunc religion parEdu parOccPrstg
##    <dbl>  <dbl> <dbl>     <dbl>  <dbl>    <dbl>        <dbl>    <dbl>  <dbl>   <dbl>    <dbl> <dbl>   <dbl>   <dbl>    <dbl>    <dbl>  <dbl>       <dbl>
##  1   201   1994    58        NA      1    2979.            1      2.5     NA      NA     1       NA       0       0        0        1      0          30
##  2   203   1994    24         2     NA    5054.            1      4.5     NA      NA     4       NA       0       0        0        0      0          49
##  3   204   1994    NA        NA     NA    2599.            0     NA       NA      NA    NA       NA       0       0       NA       NA     NA          NA
##  4   601   1994    30        NA      0   31259.            1      4       NA      NA     3.86    NA       1       2        0        1      0          50
##  5   602   1994    26         1      1   35436.            1      4       NA      NA     2.86    NA       1       1        0        1      0          46
##  6   603   1994    43        NA     NA   66717.            0     NA       NA      NA    NA       NA       0       1       NA       NA     NA          NA
##  7   604   1994     1        NA     NA   69202.            0     NA       NA      NA    NA       NA       0       1       NA       NA      1          38
##  8   901   1994    33        NA      1   13774.            1      3.5     NA      NA     3.71    NA       0       0        0        1      0          34
##  9  1101   1994    78        NA      1    1940.            1      3       NA      NA     1       NA       0       0        0        1      0          38
## 10  1202   1994    71        NA      1    2989.            1      3       NA      NA     1       NA       1       0        0        1      0          29
## # … with 346,911 more rows

2.4.6 Imputation

# short helper functions to (1) identify and create factors, save col names of factors, remove columns with all missing values, and setup amelia
factor_fun <- function(x){if(is.numeric(x)){diff(range(x, na.rm = T)) %in% 1:2} else{F}}
not_all_na <- function(x) any(!is.na(x))
not_all_id <- function(x) if(is.numeric(x)) sd(x, na.rm = T) != 0
mice_fun <- function(df){
  mice(df, m = 5, maxit=5, printFlag=TRUE, method = "cart")
  }
gsoep_match_imp <- gsoep_match %>%
  filter(!is.na(year) & !is.na(SID)) %>%
  group_by(SID) %>%
  mutate(parEdu = max(cbind(momEdu, dadEdu), na.rm = T),
         parOccPrstg = max(cbind(momOccPrstg, dadOccPrstg), na.rm = T)) %>%
  mutate_at(vars(parEdu, parOccPrstg), ~ifelse(is.infinite(.), NA, .)) %>%
  ungroup() %>% 
  select(-momEdu, -dadEdu, -momOccPrstg, -dadOccPrstg) %>%
  group_by(year) %>% 
  nest() %>%
  ungroup() %>%
  mutate(data = map(data, ~(.) %>% select_if(not_all_na) %>% select_if(not_all_id)),
         data = map(data, ~(.) %>% mutate_if(factor_fun, as.factor)),
         imp = map(data, mice_fun))
beepr::beep(sound = 8)

load_fun <- function(year){
  load(sprintf("~/Downloads/GSOEP_%s.RData", year))
  m
}

gsoep_match_imp <- tibble(year = c(1994, 2002, 2005, 2006, 2007),
                          imp = map(year, load_fun)) 

gsoep_match_imp <- gsoep_match_imp %>%
  mutate(imp_df = map(imp, ~mice::complete(., "long")),
         imp_df = map(imp_df, ~(.) %>% mutate(nbhdQual = ifelse(is.factor(nbhdQual), as.numeric(as.character(nbhdQual)), nbhdQual))),
         imp_sca = map(imp_df, ~(.) %>% filter(.imp == 1) %>% select(SID,  one_of(unique(specifications$name)))))

gsoep_match_imp_long <- gsoep_match_imp %>%
  select(year, imp_df) %>%
  unnest(imp_df) %>%
  mutate(alcohol = factor(ifelse(alcohol > 3, 0, 1)))

gsoep_SCA_imp <- gsoep_match_imp %>%
  select(p_year = year, imp_sca) %>%
  unnest(imp_sca) %>%
  mutate(alcohol = ifelse(alcohol > 3, 0, 1)) %>%
  full_join(gsoep_SCA %>% select(p_year, SID, BMI))
unique(specifications$name)[!unique(specifications$name) %in% colnames(gsoep_SCA_imp)]

gsoep_match_imp_long <- gsoep_match_imp_long %>%
  select(SID, age) %>%
  distinct() %>%
  group_by(SID) %>% 
  summarize(age = Mode(age))  %>% 
  full_join(gsoep_match_imp_long %>% select(-age)) 

gsoep_SCA_imp <- gsoep_SCA_imp %>%
  select(SID, smokes, alcohol, BMI) %>%
  mutate_at(vars(smokes), ~as.numeric(as.character(.))) %>%
  distinct() %>%
  gather(item, value, -SID, na.rm = T) %>%
  group_by(SID, item) %>%
  summarize(value = max(value)) %>%
  ungroup() %>%
  spread(item, value) %>%
  mutate_at(vars(alcohol, smokes), factor) %>%
  full_join(gsoep_SCA_imp %>% select(-smokes, -alcohol, -BMI)) %>%
  filter(!is.na(age)) %>%
  select(-age) %>%
  full_join(gsoep_match_imp_long %>% select(SID, p_year = year, age) %>% distinct())

gsoep_match_imp_long <- gsoep_match_imp_long %>% rename(p_year = year)

2.5 HILDA

The Household Income and Labour Dynamics in Australia (HILDA; Wilkins, Laß, Butterworth, & Vera-Toscano, 2019) study is an ongoing longitudinal study of Australian households. These data are available through application from https://melbourneinstitute.unimelb.edu.au/hilda/for-data-users.

Participants were recruited from more than 17,000 individuals. Data have been collected annually since 2001. The latest data release includes 17 waves of data from 2001 to 2017. More documentation can be found in the HILDA data dictionary at https://www.online.fbe.unimelb.edu.au/HILDAodd/srchSubjectAreas.aspx.

Sample sizes vary by year, ranging from 12,408 (2004) to 17,693 (2016). This provides 99% power to detect a zero-order correlation effect size of ~.03, two tailed at alpha .05.

2.5.1 Load Data

## # A tibble: 1,692 x 17
##    study dataset category name   itemname wave_letter  ...7  year new_itemname orig_itemname description scale  reverse_code recode  mini  maxi comp_rule
##    <chr> <lgl>   <chr>    <chr>  <chr>    <chr>       <dbl> <dbl> <chr>        <chr>         <chr>       <chr>  <chr>        <chr>  <dbl> <dbl> <chr>    
##  1 hilda NA      match    alcoh… alcohol  <NA>           NA  2001 hilda__matc… alsdrink      How often … 1=nev… <NA>         ifels…    NA    NA max      
##  2 hilda NA      match    alcoh… alcohol  <NA>           NA  2002 hilda__matc… blsdrkf       Drink alco… 1=nev… <NA>         ifels…    NA    NA max      
##  3 hilda NA      match    alcoh… alcohol  <NA>           NA  2003 hilda__matc… clsdrkf       Drink alco… 1=nev… <NA>         ifels…    NA    NA max      
##  4 hilda NA      match    alcoh… alcohol  <NA>           NA  2004 hilda__matc… dlsdrkf       Drink alco… 1=nev… <NA>         ifels…    NA    NA max      
##  5 hilda NA      match    alcoh… alcohol  <NA>           NA  2005 hilda__matc… elsdrkf       Drink alco… 1=nev… <NA>         ifels…    NA    NA max      
##  6 hilda NA      match    alcoh… alcohol  <NA>           NA  2006 hilda__matc… flsdrkf       Drink alco… 1=nev… <NA>         ifels…    NA    NA max      
##  7 hilda NA      match    alcoh… alcohol  <NA>           NA  2007 hilda__matc… glsdrkf       Drink alco… 1=nev… <NA>         ifels…    NA    NA max      
##  8 hilda NA      match    alcoh… alcohol  <NA>           NA  2008 hilda__matc… hlsdrkf       Drink alco… 1=nev… <NA>         ifels…    NA    NA max      
##  9 hilda NA      match    alcoh… alcohol  <NA>           NA  2009 hilda__matc… ilsdrkf       Drink alco… 1=nev… <NA>         ifels…    NA    NA max      
## 10 hilda NA      match    alcoh… alcohol  <NA>           NA  2010 hilda__matc… jlsdrkf       Drink alco… 1=nev… <NA>         ifels…    NA    NA max      
## # … with 1,682 more rows

2.5.2 Matching Variables

rename_fun <- function(df, Year){
  df <- df %>%
    haven::zap_labels(.) %>%
    gather(key = orig_itemname, value = value, -xwaveid, na.rm=T) %>%
    mutate(value=as.numeric(value))
  hilda_codebook %>% filter(category == "match" & year == Year) %>%
    full_join(df)
}

# rename variables   
hilda_match <- hilda %>%
  mutate(data = map2(data, year, rename_fun)) %>% 
  select(-year, -wave_letter) %>%
  unnest(data) %>%
  distinct() %>%
  rename(SID = xwaveid)

yrBrth <- hilda_match %>% 
  filter(name == "yearBrth") %>% 
  group_by(SID) %>% 
  summarize(yearBrth = max(value, na.rm = T)) %>%
  ungroup() %>%
  mutate(yearBrth = ifelse(is.infinite(yearBrth), NA, yearBrth))

# recode 
recode_fun <- function(rule, y, year){
  yearBrth <- y$yearBrth
  x <- y$value
  if(!is.na(rule)){y$value <- eval(parse(text = rule))}
  return(y)
}

hilda_match <- hilda_match %>%
  filter(name != "yearBrth") %>%
  left_join(yrBrth) %>%
  group_by(recode, year) %>% 
  nest() %>%
  ungroup() %>%
  mutate(data = pmap(list(recode, data, year), recode_fun)) %>%
  unnest(data)

# reverse code 
hilda_match <- hilda_match %>%
  mutate(value = ifelse(reverse_code == "no" | is.na(reverse_code), value,
                        reverse.code(-1, value, mini = mini, maxi = maxi)))

fun_call <- function(x, rule){
    switch(rule,
           average = mean(x, na.rm = T),
           mode = Mode(x)[1],
           sum = sum(x, na.rm = T),
           skip = unique(x)[1],
           max = max(x, na.rm = T),
           min = min(x, na.rm = T))
  }

# compositing within years
year_comp_fun <- function(df, rule){
  df %>%
    group_by(SID, yearBrth, name, year) %>% # group by person and item (collapse across age)
    summarize(value = fun_call(value, rule)) %>%
    ungroup()
}

hilda_waves <- p_waves %>% filter(Study == "HILDA") %>% select(Used) %>% distinct()

hilda_match <- hilda_match %>%
  filter(year <= max(hilda_waves$Used)) %>%
  group_by(comp_rule) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = map2(data, comp_rule, year_comp_fun)) %>%
  unnest(data)

comp_fun <- function(rule, p_year){
  hilda_match %>%
    filter(year <= p_year  & comp_rule == rule) %>%
    group_by(SID, yearBrth, name) %>%
    summarize(value = fun_call(value, rule)) %>%
    ungroup()
}

hilda_match <- crossing(
  p_year = hilda_waves$Used, 
  comp_rule = unique(hilda_match$comp_rule)
  ) %>%
  mutate(data = map2(comp_rule, p_year, comp_fun)) %>%
  unnest(data) %>%
  mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value)) %>%
  select(-comp_rule) %>%
  pivot_wider(names_from = name, values_from = value, names_repair = "unique") 
## # A tibble: 82,104 x 58
##    p_year    SID     A     C   DEP     E    IQ   LOC     N  `NA`     O    PA    SE    SS   SWL yearBrth compareHealth grsWages HHsize hlthDecline
##     <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>         <dbl>    <dbl>  <dbl>       <dbl>
##  1   2003 100001    NA    NA     0    NA    NA  6       NA  1.33    NA   5      NA   4.3    10     1951          3.67   27180       2        4.33
##  2   2003 100002    NA    NA     1    NA    NA  2.71    NA  2       NA   4      NA   5       7     1952          3.17   27180       2        1.67
##  3   2003 100003    NA    NA     0    NA    NA  5.71    NA  2       NA   4.5    NA   4.2     7     1952          3.5    14207.      5        2   
##  4   2003 100004    NA    NA     0    NA    NA  4.43    NA  2.67    NA   4.5    NA   5.3     9     1962          3.67   14207.      5        2.67
##  5   2003 100005    NA    NA     0    NA    NA  7       NA  1       NA   5      NA   7       8     1985          4      14207.      5        1   
##  6   2003 100006    NA    NA     1    NA    NA  5       NA  3       NA   5.5    NA   4.2     9     1987          4.25   14207.      5        3   
##  7   2003 100008    NA    NA    NA    NA    NA NA       NA NA       NA  NA      NA  NA       7     1925          3.5        0       2        3   
##  8   2003 100009    NA    NA    NA    NA    NA NA       NA NA       NA  NA      NA  NA       4     1925          2          0       2        5   
##  9   2003 100010    NA    NA     0    NA    NA  5.57    NA  1.33    NA   5      NA   5.9     6     1964          4      25000       1        2   
## 10   2003 100011    NA    NA     1    NA    NA  6.14    NA  2.33    NA   4      NA   6.5     6     1969          3.67       0       3        3   
## # … with 82,094 more rows, and 38 more variables: loneliness <dbl>, nbhdQual <dbl>, PhysFunc <dbl>, satFam <dbl>, satFinances <dbl>, satHH <dbl>,
## #   sickEasy <dbl>, SRhealth <dbl>, alcohol <dbl>, dadOccPrstg <dbl>, dadOcc <dbl>, education <dbl>, employed <dbl>, exercise <dbl>, married <dbl>,
## #   numKids <dbl>, urban <dbl>, welfare <dbl>, momOccPrstg <dbl>, momOcc <dbl>, disability <dbl>, ageMarried <dbl>, gender <dbl>, hospital <dbl>,
## #   visitDr <dbl>, weight <dbl>, height <dbl>, numSiblng <dbl>, dadAlive <dbl>, momAlive <dbl>, physhlthevnt <dbl>, age <dbl>, BMI <dbl>, dadEdu <dbl>,
## #   momEdu <dbl>, parDivorce <dbl>, religion <dbl>, smokes <dbl>

2.5.3 Personality Variables

hilda <- reduce(hilda$data, full_join) %>% haven::zap_labels(.)

hilda_long <- hilda %>%
  mutate_all(as.numeric) %>%
  pivot_longer(-xwaveid, names_to = "orig_itemname", values_to = "value", values_drop_na = TRUE) %>%
  rename(SID = xwaveid)

# keep correct personality waves
hilda_pers <- hilda_codebook %>% 
  select(-new_itemname) %>%
  filter(category == "pers") %>%
  left_join(hilda_long) %>%
  left_join(p_waves %>% filter(Study == "HILDA") %>% select(name = p_item, Used)) %>%
  filter(year %in% Used) 

recode_fun <- function(rule, y){
  x <- y$value
  if(!is.na(rule)){y$value <- eval(parse(text = rule))}
  return(y)
}

# recode 
hilda_pers <- hilda_pers %>%
  select(name, itemname, year, reverse_code:comp_rule, SID, value) %>%
  group_by(recode) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = map2(recode, data, recode_fun)) %>%
  unnest(data)

# reverse code 
hilda_pers <- hilda_pers %>% distinct() %>%
  mutate(value = ifelse(reverse_code == "no" | is.na(reverse_code), value, reverse.code(-1, value, mini = mini, maxi = maxi)))

# alpha's
hilda_alpha <- hilda_pers %>%
  select(name, itemname, year, SID, value) %>%
  group_by(name, year) %>%
  nest() %>%
  mutate(data = map(data, ~(.) %>% pivot_wider(names_from = itemname, values_from = value)),
         alpha = map(data, possibly(~psych::alpha((.) %>% select(-SID)), NA_real_)))

comp_fun <- function(df, rule){
  df %>%
    group_by(SID) %>%
    summarize(value = fun_call(value, rule)) %>%
    ungroup()
}

# create composites
hilda_pers <- hilda_pers %>%
  group_by(name, comp_rule, year) %>%
  nest() %>%
  ungroup() %>%
  mutate(comp_rule = ifelse(is.na(comp_rule), "average", comp_rule),
         data = map2(data, comp_rule, comp_fun)) %>%
  unnest(data) %>%
  mutate(value = ifelse(is.nan(value) | is.infinite(value), NA, value))

2.5.4 Outcome Variables

hilda_out <- hilda_codebook %>% 
  select(-new_itemname) %>%
  filter(category == "out" & year != "0") %>%
  left_join(hilda_long) 

old.names <- unique((hilda_codebook %>% filter(category == "out" & year == "0"))$orig_itemname)
hilda_out_stp <- sprintf("%s/data/hilda/Master r180c.sav", wd) %>% 
  read_sav() %>%
  zap_labels() %>%
  select(SID = xwaveid, one_of(old.names)) %>%
  gather(orig_itemname, value, one_of(old.names)) %>%
  full_join(crossing(orig_itemname = "yodeath", year = hilda_waves$Used)) %>%
  left_join(hilda_codebook %>% select(-new_itemname, -year))

# recode
hilda_out <- hilda_out %>%
  select(name, itemname, year, reverse_code:comp_rule, SID, value) %>%
  group_by(recode) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = map2(recode, data, recode_fun)) %>%
  unnest(data) 

recode_fun <- function(rule, y, p_year){
  x <- y$value
  if(!is.na(rule)){y$value <- eval(parse(text = rule))}
  return(y)
}

hilda_out_stp <- hilda_out_stp %>% 
  group_by(recode, year) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = pmap(list(recode, data, year), recode_fun)) %>%
  unnest(data) 
  

# composite within years 
# compositing within years
year_comp_fun <- function(df, rule){
  df %>%
    group_by(SID, name, year) %>% # group by person and item (collapse across age)
    summarize(value = fun_call(value, rule)) %>%
    ungroup()
}

hilda_waves <- p_waves %>% filter(Study == "HILDA") %>% select(Used) %>% distinct()

hilda_out <- hilda_out %>%
  # filter(year <= max(hilda_waves$Used)) %>%
  group_by(comp_rule) %>%
  nest() %>%
  ungroup() %>%
  mutate(comp_rule = ifelse(is.na(comp_rule), "max", comp_rule),
         data = map2(data, comp_rule, year_comp_fun)) %>%
  unnest(data) %>%
  mutate(value = ifelse(is.infinite(value), NA, value))

# composite across years
comp_fun <- function(p_year){
  hilda_out %>%
    mutate(group = ifelse(year > p_year, "future", "past")) %>%
    group_by(SID, name, group) %>%
    summarize(value = max(value, na.rm = T)) %>%
    ungroup() %>%
    mutate(value = ifelse(is.infinite(value), NA, value)) %>%
    pivot_wider(names_from = group, values_from = value) %>%
    group_by(SID, name) %>%
    mutate(value = ifelse(is.na(past) | (past == 0 & !is.na(future)), future,
                   ifelse(past == 0 & is.na(future), past, 
                   ifelse(past == 1, NA, NA)))) 
}

hilda_out <- tibble(p_year = hilda_waves$Used) %>%
  mutate(data = map(p_year, comp_fun)) %>%
  unnest(data) %>%
  full_join(hilda_out_stp %>% select(p_year = year, SID, name, value) %>% mutate(SID = as.numeric(SID)))

2.5.5 Covariates

hilda_match <- hilda_pers %>%
  select(-comp_rule) %>%
  spread(name, value) %>% 
  rename(p_year = year) %>%
  full_join(hilda_match %>% mutate(SID = as.numeric(SID))) %>%
  # physical health event
  left_join(hilda_out %>% filter(name == "physhlthevnt") %>% select(p_year, SID, physhlthevnt = past) %>% distinct() %>% group_by(SID, p_year) %>% summarize(physhlthevnt = max(physhlthevnt)) %>% ungroup()) %>%
  distinct()

hilda_match <- hilda_match %>%
  group_by(SID, p_year) %>%
  mutate(age = p_year - yearBrth) %>%
  ungroup() %>%
  rename(momOccPrstg = momJobPrstg, dadOccPrstg = dadJobPrstg) 

hilda_dem <- hilda_match %>% 
  select(SID, religion, smokes, parDivorce, BMI, momEdu, dadEdu) %>%
  gather(key= item, value = value, -SID, na.rm = T) %>%
  group_by(SID, item) %>%
  summarize(value = max(value, na.rm = T)) %>%
  ungroup() %>%
  spread(item, value) %>%
  mutate(parDivorce = ifelse(is.na(parDivorce), 0, parDivorce))

hilda_match <- hilda_match %>% 
  select(-religion, -smokes, -parDivorce, -BMI, -momEdu, -dadEdu) %>%
  full_join(hilda_dem)
  

hilda_out <- hilda_out %>%
  select(-past, -future) %>%
  distinct()

hilda_SCA <- hilda_match %>% 
  select(SID, p_year, one_of(c(unique(specifications$name), "momOccPrstg", "dadOccPrstg", "momEdu", "dadEdu"))) %>%
  group_by(SID) %>%
  mutate(parEdu = max(cbind(momEdu, dadEdu), na.rm = T),
         parOccPrstg = max(cbind(momOccPrstg, dadOccPrstg), na.rm = T)) %>%
  mutate_at(vars(parEdu, parOccPrstg), ~ifelse(is.infinite(.), NA, .)) %>%
  ungroup() %>% 
  select(-momEdu, -dadEdu, -momOccPrstg, -dadOccPrstg)

unique(specifications$name)[!unique(specifications$name) %in% colnames(hilda_SCA_imp)]

2.5.6 Imputation

# short helper functions to (1) identify and create factors, save col names of factors, remove columns with all missing values, and setup amelia
factor_fun <- function(x){if(is.numeric(x)){diff(range(x, na.rm = T)) %in% 1:2 & length(unique(x)) <= 4} else{F}}
not_all_id <- function(x) if(is.numeric(x)) sd(x, na.rm = T) != 0
mice_fun <- function(df){
  mice(df, m = 5, maxit=5, printFlag=TRUE)
  }
hilda_match_imp <- hilda_match %>%
  rename(NegAff = `NA`) %>%
  group_by(SID, year) %>%
  mutate(parOccPrstg = max(cbind(momOccPrstg, dadOccPrstg), na.rm = T)) %>%
  select(-momOccPrstg, -dadOccPrstg) %>%
  ungroup() %>%
  mutate_all(~ifelse(is.infinite(.), NA, .)) %>%
  filter(!is.na(year) & !is.na(SID)) %>%
  group_by(year) %>% 
  nest() %>%
  ungroup() %>%
  mutate(data = map(data, ~(.) %>% select_if(not_all_na) %>% select_if(not_all_id)),
         data = map(data, ~(.) %>% mutate_if(factor_fun, as.factor)),
         imp = map(data, mice_fun))
beepr::beep(sound = 8)

hilda_match_imp <- hilda_match_imp %>%
  mutate(imp_df = map(imp, ~mice::complete(., "long")),
         imp_df = map(imp_df, ~(.) %>% mutate(PhysFunc = factor(ifelse(PhysFunc == 1, 0, 1)))),
         imp_df = map(imp_df, ~(.) %>% 
    group_by(SID) %>%
    mutate(parEdu = max(cbind(momEdu, dadEdu), na.rm = T)) %>%
    select(-momEdu, -dadEdu) %>%
    ungroup()),
         imp_sca = map(imp_df, ~(.) %>% filter(.imp == 1) %>% select(SID,  one_of(unique(specifications$name)))))

hilda_match_imp_long <- hilda_match_imp %>%
  select(p_year, imp_df) %>%
  unnest(imp_df)

hilda_SCA_imp <- hilda_match_imp %>%
  select(p_year, imp_sca) %>%
  unnest(imp_sca) %>%
  left_join(hilda_SCA %>% select(p_year, SID,
        colnames(hilda_SCA)[!colnames(hilda_SCA) %in% colnames(.)])) %>%
  mutate(race = factor(0))

unique(specifications$name)[!unique(specifications$name) %in% colnames(hilda_SCA_imp)]

2.6 HRS

The Health and Retirement Study (HRS; Juster & Suzman, 1995) is an ongoing longitudinal study of households in the United States. These data are available at https://hrs.isr.umich.edu by creating a free account.

Participants were recruited from more than 35,000 individuals from the financial households of individuals born between 1931 and 1941 in the US. Data have been collected biannually since 1992. The latest data release includes data up to 2016. On average, 10,000 individuals are sampled each wave More information on the HRS can be found at https://hrs.isr.umich.edu/documentation/survey-design, but, in short, the HRS is a nationally representative sample of adults over 50 in the US. It is critical to note that the HRS samples households of the original cohort and follows individuals and their spouses or partners until their death.

Sample size varies by year, ranging from approximately 7,500 (2014) to 15,500 (1992). (https://hrs.isr.umich.edu/sites/default/files/biblio/ResponseRates_2017.pdf). This provides 99/% power to detect a zero-order correlation effect size of ~.04, two-tailed at alpha .05.

2.6.1 Load Data

hrs_read_fun <- function(year) {
  read_da <- function(da, dct, Year){
    print(paste(da, dct, year, sep = " "))
    data.file <- sprintf("%s/data/hrs/%s/%s", wd, Year, da)
    # Set path to the dictionary file "*.DCT"
    dict.file <- sprintf("%s/data/hrs/%s/%s", wd, Year, dct)
    # Read the dictionary file
    df.dict <- read.table(dict.file, skip = 1, fill = TRUE, stringsAsFactors = FALSE)
    # Set column names for dictionary dataframe
    colnames(df.dict) <- c("col.num","col.type","col.name","col.width","col.lbl")
    # Remove last row which only contains a closing }
    row <- which(df.dict$col.name == "HHID")
    df.dict <- df.dict[-nrow(df.dict),]
    if(row == 2){df.dict <- df.dict[-1,]}
    # Extract numeric value from column width field
    df.dict$col.width <- as.integer(sapply(df.dict$col.width, gsub, pattern = "[^0-9\\.]", replacement = ""))
    # Convert column types to format to be used with read_fwf function
    df.dict$col.type <- sapply(df.dict$col.type, function(x) ifelse(x %in% c("int","byte","long"), "i", ifelse(x == "float", "n", ifelse(x == "double", "d", "c"))))
    # Read the data file into a dataframe
    df <- read_fwf(file = data.file, fwf_widths(widths = df.dict$col.width, col_names = df.dict$col.name), col_types = paste(df.dict$col.type, collapse = ""))
    # Add column labels to headers
    attributes(df)$variable.labels <- df.dict$col.lbl
    old.names <- (hrs_codebook %>% filter(year == Year))$orig_itemname
    if(any(c("PN", "HHID") %in% colnames(df)) & any(old.names %in% colnames(df))){
    # if(any(c("PN", "HHID") %in% colnames(df))){
      df <- df %>%
      mutate(hhidpn = 1000*as.numeric(HHID) + as.numeric(PN)) %>%
      select(one_of(c("PN", "HHID")), one_of(old.names)) %>%
        distinct()
      # gather(key = item, value = value, -hhidpn)
    } else {df <- NA}
    return(df)
  }
    # Set path to the data file "*.DA"
  files <- list.files(sprintf("%s/data/hrs/%s", wd, year))
  df2 <- tibble(
    da = files[grepl(".da",  files) | grepl(".DA", files)],
    dct = files[grepl(".dct", files) | grepl(".DCT", files)]
  ) %>%
    mutate(data = map2(da, dct, possibly(~read_da(.x, .y, year), NA_real_))) %>%
    filter(!is.na(data)) %>%
    select(-da, -dct) 
  if(nrow(df2) != 0){df2$data %>% reduce(full_join) %>% distinct()} else {NA}
}
## # A tibble: 2,293 x 19
##    study dataset category name  itemname  wave waveletter  year new_itemname orig_itemname description scale reverse_code recode  mini  maxi comp_rule
##    <chr> <chr>   <chr>    <chr> <chr>    <dbl> <chr>      <dbl> <chr>        <chr>         <chr>       <chr> <chr>        <chr>  <dbl> <dbl> <chr>    
##  1 hrs   <NA>    match    ageM… yearMar…     1 a           1992 hrs__match_… V241          Year First… "191… <NA>         ifels…    NA    NA min      
##  2 hrs   <NA>    match    ageM… yearMar…     2 b           1993 hrs__match_… V156          Year First… "191… <NA>         ifels…    NA    NA min      
##  3 hrs   <NA>    match    ageM… yearMar…     3 c           1994 hrs__match_… A3-V1         Year First… "191… <NA>         ifels…    NA    NA min      
##  4 hrs   <NA>    match    ageM… yearMar…     4 d           1995 hrs__match_… D678          Year First… "191… <NA>         ifels…    NA    NA min      
##  5 hrs   <NA>    match    ageM… yearMar…     5 e           1996 hrs__match_… E678          Year First… "191… <NA>         ifels…    NA    NA min      
##  6 hrs   <NA>    match    ageM… yearMar…     6 f           1998 hrs__match_… F1073         Year First… "191… <NA>         ifels…    NA    NA min      
##  7 hrs   <NA>    match    ageM… yearMar…     7 g           2000 hrs__match_… GB066_1       Year First… "191… <NA>         ifels…    NA    NA min      
##  8 hrs   <NA>    match    ageM… yearMar…     8 h           2002 hrs__match_… HB066_1       Year First… "191… <NA>         ifels…    NA    NA min      
##  9 hrs   <NA>    match    ageM… yearMar…     9 i           2004 hrs__match_… IB066_1       Year First… "191… <NA>         ifels…    NA    NA min      
## 10 hrs   <NA>    match    ageM… yearMar…    10 j           2006 hrs__match_… JB066_1       Year First… "191… <NA>         ifels…    NA    NA min      
## # … with 2,283 more rows, and 2 more variables: ...18 <lgl>, ...19 <chr>

2.6.2 Matching Variables

hrs_waves <- p_waves %>% filter(Study == "HRS") %>% select(Used) %>% distinct()

# rename variables   
hrs_match <- hrs_codebook %>%
  filter(year <= max(hrs_waves$Used)) %>%
  filter(grepl("match", category)) %>%
  select(name, itemname, wave, year, orig_itemname, reverse_code:comp_rule) %>%
  group_by(name) %>% 
  nest() %>%
  ungroup() %>%
  mutate(data = map(data, ~(.) %>% left_join(hrs_long))) 

yrBrth <- hrs_match %>%
  filter(name == "yearBrth") %>%
  unnest(data) %>%
  mutate(yearBrth = value) %>%
  select(SID, yearBrth)

# recode 
recode_fun <- function(rule, y, year){
  yearBrth <- y$yearBrth
  x <- y$value
  if(!is.na(rule)){y$value <- eval(parse(text = rule))}
  return(y)
}

hrs_match <- hrs_match %>% 
  filter(name != "yearBrth") %>% 
  mutate(data = map(data, ~(.) %>% left_join(yrBrth) %>%
    group_by(recode, year) %>%
    nest() %>%
    ungroup() %>%
    mutate(data = pmap(list(recode, data, year), recode_fun)) %>%
    unnest(data) %>%
    mutate(value = ifelse(value < 0 | is.nan(value) | is.infinite(value), NA, value))))

# reverse code 
hrs_match <- hrs_match %>%
  mutate(data = map(data, ~(.) %>%
    mutate(value = ifelse(reverse_code == "no" | is.na(reverse_code), value, 
                        reverse.code(-1, value, mini = mini, maxi = maxi)))))

fun_call <- function(x, rule){
    switch(rule,
           average = mean(x, na.rm = T),
           mode = Mode(x)[1],
           sum = sum(x, na.rm = T),
           skip = unique(x)[1],
           max = max(x, na.rm = T),
           min = min(x, na.rm = T))
  }

# compositing within years
year_comp_fun <- function(df, rule){
  df %>%
    # group by person and item (collapse across age)
    group_by(SID, yearBrth, name, year) %>% 
    summarize(value = fun_call(value, rule)) %>%
    ungroup() %>% 
    mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
}

hrs_match <- hrs_match %>%
  unnest(data) %>%
  mutate(comp_rule = ifelse(is.na(comp_rule), "skip", comp_rule)) %>%
  group_by(comp_rule) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = map2(data, comp_rule, year_comp_fun)) %>%
  unnest(data)

comp_fun <- function(rule, p_year){
  hrs_match %>%
    filter(year <= p_year  & comp_rule == rule) %>%
    group_by(SID, yearBrth, name) %>%
    summarize(value = fun_call(value, rule)) %>%
    ungroup() %>%
    distinct()
}

hrs_match <- crossing(
  p_year = hrs_waves$Used, 
  comp_rule = unique(hrs_match$comp_rule)
  ) %>%
  mutate(data = map2(comp_rule, p_year, comp_fun)) %>%
  unnest(data) %>% 
  distinct() %>%
  mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value)) %>%
  select(-comp_rule) %>%
  spread(name, value) %>%
  filter(!is.na(SID))

2.6.3 Personality Variables

# keep correct personality waves
hrs_pers <- hrs_codebook %>% mutate(reverse_code = tolower(reverse_code)) %>%
  select(-new_itemname) %>%
  filter(category == "pers") %>%
  left_join(hrs_long) %>%
  mutate(year = mapvalues(year, seq(2006, 2016, 2), rep(c(2006, 2010, 2014), each = 2))) %>%
  left_join(p_waves %>% filter(Study == "HRS") %>% select(name = p_item, Used)) %>%
  filter(year %in% Used) %>%
  distinct()

recode_fun <- function(rule, y){
  x <- y$value
  if(!is.na(rule)){y$value <- eval(parse(text = rule))}
  return(y)
}

# recode 
hrs_pers <- hrs_pers %>%
  select(name, itemname, year, reverse_code:comp_rule, SID, Used, value) %>%
  group_by(recode) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = map2(recode, data, recode_fun)) %>%
  unnest(data)

# reverse code 
hrs_pers <- hrs_pers %>% distinct() %>%
  mutate(value = ifelse(reverse_code == "no" | is.na(reverse_code), value, reverse.code(-1, value, mini = mini, maxi = maxi))) %>%
  group_by(name, itemname, year, SID, comp_rule) %>%
  summarize(value = mean(value, na.rm = T))

# alpha's
hrs_alpha <- hrs_pers %>%
  filter(!is.na(value)) %>%
  select(name, itemname, year, SID, value) %>%
  group_by(name, year) %>%
  nest() %>%
  mutate(data = map(data, ~(.) %>% distinct() %>% pivot_wider(names_from = itemname, values_from = value, values_fn = list(mean))),
         alpha = map(data, possibly(~psych::alpha((.) %>% select(-SID)), NA_real_)))

comp_fun <- function(df, rule){
  df %>%
    group_by(SID) %>%
    summarize(value = fun_call(value, rule)) %>%
    ungroup() %>%
    mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
}

# create composites
hrs_pers <- hrs_pers %>%
  group_by(name, comp_rule, year) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = map2(data, comp_rule, comp_fun)) %>%
  unnest()
## # A tibble: 314,336 x 5
##    name   year comp_rule      SID value
##    <chr> <dbl> <chr>        <dbl> <dbl>
##  1 A      2006 average       3010   3  
##  2 A      2006 average       3020   3  
##  3 A      2006 average   10001010   4  
##  4 A      2006 average   10003030   3.4
##  5 A      2006 average   10004010   3.4
##  6 A      2006 average   10004040   3.6
##  7 A      2006 average   10013010   2  
##  8 A      2006 average   10013040   2.6
##  9 A      2006 average   10038010   3.4
## 10 A      2006 average   10038040   2.6
## # … with 314,326 more rows

2.6.4 Outcome Variables

hrs_out <- hrs_codebook %>% 
  select(-new_itemname) %>%
  filter(category == "out" & year != 0) %>%
  left_join(hrs_long) %>%
  distinct() %>%
  full_join(crossing(p_year = hrs_waves$Used, name = unique((.)$name)))

hrs_out_stp <- hrs_codebook %>% 
  select(-new_itemname, -description, scale) %>%
  filter(category == "out" & year == 0) %>%
  left_join(hrs_long) %>%
  full_join(tibble(name = "mortality", SID = unique(hrs_long$SID),
                   recode = unique(.$recode))) %>%
  full_join(crossing(p_year = hrs_waves$Used, name = unique((.)$name)))

# recode
recode_fun <- function(rule, y, p_year){
  x <- y$value
  if(!is.na(rule)){y$value <- eval(parse(text = rule))}
  return(y)
}

hrs_out <- hrs_out %>%
  select(name, itemname, year, reverse_code:comp_rule, value, SID, p_year) %>%
  group_by(recode, p_year) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = pmap(list(recode, data, p_year), recode_fun)) %>%
  unnest(data) 

hrs_out_stp <- hrs_out_stp %>%
  select(name, itemname, reverse_code:comp_rule, value, SID, p_year) %>%
  mutate(value = ifelse(value > 2050 | value < 0 | is.na(value), 0, value)) %>%
  group_by(recode, p_year) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = pmap(list(recode, data, p_year), recode_fun)) %>%
  unnest(data) 

# composite within years 
# compositing within years
hrs_out <- hrs_out %>%
  group_by(SID, name, year, p_year) %>%
  summarize(value = max(value, na.rm = T)) %>%
  ungroup() %>%
  mutate(value = ifelse(is.nan(value) | is.infinite(value), NA, value))

# composite across years
comp_fun <- function(p_year){
  hrs_out %>%
    mutate(group = ifelse(year > p_year, "future", "past")) %>%
    group_by(SID, name, group) %>%
    summarize(value = max(value, na.rm = T)) %>%
    ungroup() %>%
    mutate(value = ifelse(is.infinite(value), NA, value)) %>%
    pivot_wider(names_from = group, values_from = value) %>%
    group_by(SID, name) %>%
    mutate(value = ifelse(is.na(past) | (past == 0 & !is.na(future)), future,
                   ifelse(past == 0 & is.na(future), past, 
                   ifelse(past == 1, NA, NA)))) 
}

hrs_out <- tibble(p_year = hrs_waves$Used) %>%
  mutate(data = map(p_year, comp_fun)) %>%
  unnest(data) %>%
  full_join(hrs_out_stp %>% select(name, p_year, value, SID)) %>%
  filter((!name %in% c("married", "edu")) & !is.na(SID)) 
## # A tibble: 774,366 x 6
##    p_year   SID name         future  past value
##     <dbl> <dbl> <chr>         <dbl> <dbl> <dbl>
##  1   2006  1010 divorced          1     1    NA
##  2   2006  1010 mntlhlthevnt      1     1    NA
##  3   2006  1010 physhlthevnt      1     1    NA
##  4   2006  1010 retired          NA     0     0
##  5   2006  1010 unemployed       NA     0     0
##  6   2006  2010 divorced          0     0     0
##  7   2006  2010 mntlhlthevnt      1     1    NA
##  8   2006  2010 physhlthevnt      1     1    NA
##  9   2006  2010 retired           1     1    NA
## 10   2006  2010 unemployed        0     0     0
## # … with 774,356 more rows

2.6.5 Covariates

hrs_match <- hrs_pers %>%
  select(-comp_rule) %>%
  spread(name, value) %>% 
  full_join(hrs_match %>% rename(year = p_year)) %>%
  # left_join(gsoep_bp %>% select(-momID, -dadID)) %>%
  # physical health event
  left_join(hrs_out %>% filter(name == "physhlthevnt") %>% select(year = p_year, SID, physhlthevnt = past) %>% distinct() %>% group_by(SID, year) %>% summarize(physhlthevnt = max(physhlthevnt)) %>% ungroup()) %>%
  distinct()

hrs_match <- hrs_match %>%
  group_by(SID, year) %>%
  mutate(age = year - yearBrth) %>%
  ungroup()

npb_fun <- function(l, u){
  lb <- sort(c(l,u))[1]
  ub <- sort(c(l,u))[2]
  mean((tibble(occ00 = seq(lb,ub),
         npb = mapvalues(occ00, npb$OCC00, npb$NPB, warn_missing = F)) %>%
    mutate(npb = ifelse(npb > 100, NA, npb)))$npb, na.rm = T)[1]
}

hrs_npb_groups <- 
  tribble(
    ~hrsCat, ~occ80l, ~occ80u, ~occ90l, ~occ90u,
    1, 003, 037, 001, 037,
    2, 043, 235, 043, 235,
    3, 243, 285, 243, 274,
    4, 303, 389, 303, 389,
    5, 403, 407, 403, 407,
    6, 413, 427, 417, 427,
    7, 433, 444, 433, 444,
    8, 445, 447, 445, 447,
    9, 448, 469, 448, 469,
    10, 473, 499, 473, 498,
    11, 503, 549, 503, 542,
    12, 553, 617, 553, 617,
    13, 633, 699, 628, 699,
    14, 703, 799, 703, 799,
    15, 803, 859, 803, 859, 
    16, 863, 889, 863, 889, 
    17, 900, 900, 900, 900
    ) %>%
  mutate(occ00l = mapvalues(occ90l, occ90to00$OCC90, occ90to00$OCC00),
         occ00u = mapvalues(occ90u, occ90to00$OCC90, occ90to00$OCC00)) %>%
  group_by(hrsCat) %>%
  mutate(npb = npb_fun(occ00l, occ00u))

hrs_match <- hrs_match %>% 
  select(SID, year, dadOcc) %>% 
  filter(!is.na(dadOcc)) %>%
  mutate(dadOccPrstg = ifelse(dadOcc == 98, NA, dadOcc),
         dadOccPrstg = mapvalues(dadOccPrstg, hrs_npb_groups$hrsCat, hrs_npb_groups$npb)) %>%
  select(-dadOcc, -year) %>%
  full_join(hrs_match)

hrs_out <- hrs_out %>%
  select(-past, -future) %>%
  distinct()

hrs_SCA <- hrs_match %>% 
  select(SID, p_year = year, one_of(c(unique(specifications$name), "momOccPrstg", "dadOccPrstg", "momEdu", "dadEdu"))) %>%
  group_by(SID) %>%
  mutate(parEdu = max(cbind(momEdu, dadEdu), na.rm = T)) %>%
  mutate_at(vars(parEdu), ~ifelse(is.infinite(.), NA, .)) %>%
  ungroup() %>% 
  rename(parOccPrstg = dadOccPrstg) %>%
  select(-momEdu, -dadEdu)

unique(specifications$name)[!unique(specifications$name) %in% colnames(hrs_match2)]
## # A tibble: 128,334 x 20
##       SID p_year   age education gender grsWages  race physhlthevnt SRhealth smokes alcohol exercise   BMI married numKids parDivorce PhysFunc religion
##     <dbl>  <dbl> <dbl>     <dbl>  <dbl>    <dbl> <dbl>        <dbl>    <dbl>  <dbl>   <dbl>    <dbl> <dbl>   <dbl>   <dbl>      <dbl>    <dbl>    <dbl>
##  1 1.01e7   1992    51         0      1   34932      2            1     2         1    NA      0      48.4       1       4          0        0        1
##  2 1.01e7   1996    55         0      1   24137.     2            1     2         1     0      0.667  47.6       1       4          0        0        1
##  3 1.01e7   2006    65         0      1   20276.     2            1     1.75      1     0      1.75   44.1       1       4          0        1        1
##  4 1.03e7   1996    63         0      0   31800      0            1     4         0     2      0      37.2       1       3          0        0        2
##  5 1.03e7   1992    59         0      0      NA      0            1    NA        NA    NA     NA      NA        NA       3          0       NA        2
##  6 1.03e7   2006    73         0      0   70108.     0            1     3.8       0     1.8    1.2    35.0       1       3          0        0        2
##  7 1.04e7   1992    54         0      1  139700      0            1     5         1    NA      0      27.4       1       2          0        0        1
##  8 1.04e7   1996    58         0      1  154033.     0            1     4.33      1     7      0.667  27.4       1       2          0        0        1
##  9 1.04e7   2006    68         0      1  306668.     0            1     3.5       1     7      1.88   27.7       1       2          0        0        1
## 10 1.04e7   1992    49         1      1  116000      0            1     4         1    NA      1      26.2       1       3          0        0        2
## # … with 128,324 more rows, and 2 more variables: parOccPrstg <dbl>, parEdu <dbl>

2.6.6 Imputation

# short helper functions to (1) identify and create factors, save col names of factors, remove columns with all missing values, and setup amelia
factor_fun <- function(x){if(is.numeric(x)){diff(range(x, na.rm = T)) %in% 1:2 & length(unique(x)) <= 4} else{F}}
not_all_id <- function(x) if(is.numeric(x)) sd(x, na.rm = T) != 0
sum_na <- function(x) sum(is.na(x))/length(x)*100
mice_fun <- function(df){
  s <- unique(hrs_pers$SID)
  df <- df %>% select(-ageMarried, -CurMarYears, -spouseEmployed, -weight, -numHospStays, -SRhealthChng) %>%
    filter(SID %in% s)
  pos <- apply(df, 1, sum_na) < 60
  df <- df %>% filter(pos)
  mice(df, m = 5, maxit=5, printFlag=TRUE, method = "cart")
}
hrs_match_imp <- hrs_match %>%
  rename(NegAff = `NA`, parOccPrstg = dadOccPrstg) %>%
  mutate_at(vars(eduMom, eduDad), ~ifelse(. >= 17 & !is.na(.), 2, ifelse(. >= 15, 1, 0))) %>%
  group_by(SID, year) %>%
  mutate(momEdu = max(cbind(momEdu, eduMom), na.rm = T),
         dadEdu = max(cbind(dadEdu, eduDad), na.rm = T)) %>%
  ungroup() %>%
  select(-eduMom, -eduDad) %>%
  mutate_all(~ifelse(is.infinite(.), NA, .)) %>%
  filter(!is.na(year) & !is.na(SID)) %>%
  group_by(year) %>% 
  nest() %>%
  ungroup() %>%
  mutate(data = map(data, ~(.) %>% select_if(not_all_na) %>% select_if(not_all_id)),
         data = map(data, ~(.) %>% mutate_if(factor_fun, as.factor)),
         imp = map(data, mice_fun))


hrs_match_imp <- hrs_match_imp %>%
  mutate(imp_df = map(imp, ~mice::complete(., "long")),
         imp_df = map(imp_df, ~(.) %>% 
            mutate_at(vars(exercise, PhysFunc), ~ifelse(as.numeric(as.character(.)) > 0, 1, 0)) %>%
            mutate(PhysFunc = factor(PhysFunc), 
                   satRetire = ifelse(is.factor(satRetire), as.numeric(as.character(satRetire)), satRetire))),
         imp_df = map(imp_df, ~(.) %>% 
    group_by(SID) %>%
    mutate(parEdu = max(cbind(momEdu, dadEdu), na.rm = T)) %>%
    select(-momEdu, -dadEdu) %>%
    ungroup()),
         imp_sca = map(imp_df, ~(.) %>% filter(.imp == 1) %>% select(SID,  one_of(unique(specifications$name)))))

hrs_match_imp_long <- hrs_match_imp %>%
  select(p_year = year, imp_df) %>%
  unnest(imp_df) %>%
  mutate(alcohol = factor(ifelse(alcohol > 0, 1, 0)))

hrs_SCA_imp <- hrs_match_imp %>%
  select(p_year = year, imp_sca) %>%
  unnest(imp_sca) %>%
  full_join(hrs_SCA %>% select(p_year = year, SID)) %>% 
  distinct() %>%
  mutate(alcohol = factor(ifelse(alcohol > 0, 1, 0)))

hrs_SCA_imp  <- hrs_SCA_imp %>%
  filter(!is.na(alcohol)) %>%
  select(SID, alcohol) %>%
  distinct() %>%
  full_join(hrs_SCA_imp %>% select(-alcohol)) %>%
  filter(!is.na(age))

unique(specifications$name)[!unique(specifications$name) %in% colnames(hrs_SCA_imp)]

2.7 LISS

The Longitudinal Studies for the Social sciences (LISS; Scherpenzeel, Das, Ester, & Kaczmirek, 2010) is an ongoing longitudinal study of households in the Netherlands. These data are online, through application, from https://statements.centerdata.nl/liss-panel-data-statement.

Participants were approximately 8,000 Dutch-speaking individuals permanently residing in the Netherlands from 5,000 households. Data have been collected annually since 2007. The latest data release includes 11 waves of data from 2008 to 2018. More documentation are available at https://www.dataarchive.lissdata.nl/study_units/view/1.

Sample sizes vary by year, ranging from 5,021 (2018) to 6808 (2008). This provides 99/% power to detect a correlation effect size of ~.04, two-tailed at alpha .05.

2.7.1 Load Data

## # A tibble: 3,688 x 18
##    study dataset category name  itemname  wave wave_letter  year new_itemname orig_itemname description scale reverse_code recode  mini  maxi comp_rule
##    <chr> <chr>   <chr>    <chr> <chr>    <dbl> <chr>       <dbl> <chr>        <chr>         <chr>       <chr> <chr>        <chr>  <dbl> <dbl> <chr>    
##  1 liss  avars_… match    age   YOB1         1 a            2008 <NA>         gebjaar       Year of Bi… nume… <NA>         year …    NA    NA mode     
##  2 liss  avars_… match    age   YOB1         2 b            2009 <NA>         gebjaar       Year of Bi… nume… <NA>         year …    NA    NA mode     
##  3 liss  avars_… match    age   YOB1         3 c            2010 <NA>         gebjaar       Year of Bi… nume… <NA>         year …    NA    NA mode     
##  4 liss  avars_… match    age   YOB1         4 d            2011 <NA>         gebjaar       Year of Bi… nume… <NA>         year …    NA    NA mode     
##  5 liss  avars_… match    age   YOB1         5 e            2012 <NA>         gebjaar       Year of Bi… nume… <NA>         year …    NA    NA mode     
##  6 liss  avars_… match    age   YOB1         6 f            2013 <NA>         gebjaar       Year of Bi… nume… <NA>         year …    NA    NA mode     
##  7 liss  avars_… match    age   YOB1         7 g            2014 <NA>         gebjaar       Year of Bi… nume… <NA>         year …    NA    NA mode     
##  8 liss  avars_… match    age   YOB1         8 h            2015 <NA>         gebjaar       Year of Bi… nume… <NA>         year …    NA    NA mode     
##  9 liss  avars_… match    age   YOB1         9 i            2016 <NA>         gebjaar       Year of Bi… nume… <NA>         year …    NA    NA mode     
## 10 liss  avars_… match    age   YOB1        10 j            2017 <NA>         gebjaar       Year of Bi… nume… <NA>         year …    NA    NA mode     
## # … with 3,678 more rows, and 1 more variable: LHQ <chr>

2.7.2 Matching Variables

rename_fun <- function(cb, var){
  old.names <- unique((liss_codebook %>% filter(name == var))$orig_itemname)
  df <- liss %>% 
    select(SID = nomem_encr, HHID = nohouse_encr, one_of(old.names)) %>%
    gather(key = orig_itemname, value = value, 
           -SID, -HHID, na.rm=T)
  if(length(old.names) > 1){
      df %>% left_join(cb %>% select(itemname, year, orig_itemname, reverse_code:comp_rule))
  } else {
    df %>% left_join(cb %>% select(-(itemname:year),  -new_itemname, -dataset) %>% distinct()) %>% mutate(year = 0)
  }
}

# rename variables   
liss_match <- liss_codebook %>%
  filter(category == "match") %>%
  group_by(name) %>% 
  nest() %>%
  ungroup() %>%
  mutate(data = map2(data, name, rename_fun)) %>% 
  unnest(data) %>%
  distinct()

# bring in year or birth for cleaning
liss_match <- liss_match %>% 
  filter(name != "age") %>% 
  left_join(
  liss_match %>% filter(name == "age") %>% 
  mutate(yearBrth = value) %>%
  select(SID, yearBrth)
  ) %>% distinct()

# recode 

recode_fun <- function(rule, y, year){
  yearBrth <- y$yearBrth
  x <- y$value
  if(!is.na(rule)){y$value <- eval(parse(text = rule))}
  return(y)
}

liss_match <- liss_match %>%
  group_by(recode, year) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = pmap(list(recode, data, year), recode_fun)) %>%
  unnest(data)

# reverse code 
liss_match <- liss_match %>%
  mutate(value = ifelse(tolower(reverse_code) == "no" | is.na(reverse_code), value, 
                        reverse.code(-1, value, mini = mini, maxi = maxi)))

fun_call <- function(x, rule){
    switch(rule,
           average = mean(x, na.rm = T),
           mode = Mode(x)[1],
           sum = sum(x, na.rm = T),
           skip = unique(x)[1],
           max = max(x, na.rm = T),
           min = min(x, na.rm = T))
  }

# compositing within years
year_comp_fun <- function(df, rule){
  df %>%
    # group by person and item (collapse across age)
    group_by(SID, HHID, yearBrth, name, year) %>% 
    summarize(value = fun_call(value, rule)) %>%
    ungroup() %>% 
    mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
}

liss_waves <- p_waves %>% filter(Study == "LISS") %>% select(Used) %>% distinct()

liss_match <- liss_match %>%
  filter(year <= max(liss_waves$Used) | 
         name %in% c("dadEdu", "momEdu", "momOccPrstg", "dadOccPrstg")) %>%
  group_by(comp_rule) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = map2(data, comp_rule, year_comp_fun)) %>%
  unnest(data)

comp_fun <- function(rule, p_year){
  liss_match %>%
    filter(year <= p_year  & comp_rule == rule | 
           name %in% c("dadEdu", "momEdu", "momOccPrstg", "dadOccPrstg")) %>%
    group_by(SID, HHID, yearBrth, name) %>%
    summarize(value = fun_call(value, rule)) %>%
    ungroup()
}

liss_match <- crossing(
  p_year = liss_waves$Used, 
  comp_rule = unique(liss_match$comp_rule)
  ) %>%
  mutate(data = map2(comp_rule, p_year, comp_fun)) %>%
  unnest(data) %>%
  mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value)) %>%
  select(-comp_rule) %>%
  pivot_wider(names_from = name, values_from = value, values_fn = list(value = max)) 
## # A tibble: 27,382 x 62
##       SID   HHID  year     A     C   DEP     E    IQ     N  `NA`     O    PA    SE    SS   SWL yearBrth HHsize dadEdu dadOccPrstg momEdu momOccPrstg
##     <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>  <dbl>  <dbl>       <dbl>  <dbl>       <dbl>
##  1 800033 583404  2008  3     2    NA      2.2    NA   3.5   3.3  3.29   4.2  5.38  2     6        1991    4.5     NA          NA     NA          NA
##  2 800042 500277  2008  3.5   4     2.67   3.2    NA   3     2.1  3.43   5.7  4.88  2.08  6.17     1975    4        0           7      0           8
##  3 800045 548654  2008 NA    NA     1     NA      NA  NA    NA   NA     NA   NA    NA    NA        1942    2       NA          NA     NA          NA
##  4 800057 580532  2008  2.83  2.67  1      4.2    NA   4.5   1.4  4.57   4.5  6.75  2     6.17     1975    4        0           6      1          NA
##  5 800076 578048  2008  3.17  3.5   2.33   3       4   3     2.5  3.14   5.6  4.5   1.3   6.5      1985    3       NA          NA     NA          NA
##  6 800119 537783  2008  2.83  3.33  1.33   3.4    NA   3    NA    3.14  NA    5.75 NA     4.67     1950    4        0           9     NA          NA
##  7 800125 582101  2008  4     4     4.33   2.4    NA   2     3.3  4.29   4.3  4.5   0     3.83     1958    1.5     NA          NA     NA          NA
##  8 800134 549826  2008  4.67  2.67  1.67   4.8    NA   4     1.8  3.43   5.4  5.62 NA     5.83     1930    2       NA          NA     NA          NA
##  9 800155 545016  2008 NA    NA    NA     NA      NA  NA    NA   NA     NA   NA     1.2  NA        1955    1       NA          NA     NA          NA
## 10 800158 519049  2008  3.67  3.67  1      2.4    NA   4     1.3  3      3.2  5.5   2     5.83     1969    7       NA          NA     NA          NA
## # … with 27,372 more rows, and 41 more variables: eatingHabits <dbl>, loneliness <dbl>, physAct <dbl>, PhysFunc <dbl>, satFnc <dbl>, satLeisure <dbl>,
## #   satSchl <dbl>, satWgs <dbl>, SRhealth <dbl>, survAtt <dbl>, exercise <dbl>, satFam <dbl>, satHH <dbl>, alcohol <dbl>, alcoholType <dbl>,
## #   disability <dbl>, drugs <dbl>, drVisits <dbl>, employed <dbl>, ethnicity <dbl>, numSibling <dbl>, religion <dbl>, smokes <dbl>, unempBen <dbl>,
## #   welfare <dbl>, gender <dbl>, urban <dbl>, ageMarried <dbl>, married <dbl>, dadAlive <dbl>, diet <dbl>, height <dbl>, momAlive <dbl>,
## #   parDivorce <dbl>, weight <dbl>, grsWages <dbl>, numKids <dbl>, education <dbl>, physhlthevnt <dbl>, age <dbl>, BMI <dbl>

2.7.4 Outcome Variables

rename_fun <- function(cb, var){
  old.names <- unique((liss_codebook %>% filter(name == var))$orig_itemname)
  df <- liss %>% 
    select(SID = nomem_encr, HHID = nohouse_encr, one_of(old.names)) %>%
    gather(key = orig_itemname, value = value, 
           -SID, -HHID) %>%
    left_join(cb %>% select(itemname, year, orig_itemname, reverse_code:comp_rule))
}

liss_out <- liss_codebook %>%
  filter(category == "out") %>%
  group_by(name) %>% 
  nest() %>%
  ungroup() %>%
  mutate(data = map2(data, name, rename_fun)) %>% 
  unnest(data) %>%
  distinct() %>%
  full_join(crossing(p_year = liss_waves$Used, name = unique((.)$name)))

liss_out <- avars %>% 
  select(year, itemname = month, SID = nomem_encr, divorced = burgstat, unemployed = belbezig) %>%
  gather(key = name, value = value, divorced, unemployed) %>%
  distinct() %>%
  mutate(year = as.numeric(year), p_year = 2008) %>%
  left_join(liss_codebook %>% filter(name %in% c("divorced", "unemployed")) %>%
              select(name, recode, reverse_code, mini, maxi) %>% distinct()) %>%
  full_join(liss_out %>% filter(!(orig_itemname %in% c("burgstat", "belbezig"))))

# recode
recode_fun <- function(rule, y, p_year){
  x <- y$value
  if(!is.na(rule)){y$value <- eval(parse(text = rule))}
  return(y)
}

liss_out <- liss_out %>%
  select(year:maxi, p_year) %>%
  # select(name:HHID, itemname, year, reverse_code:comp_rule, value, p_year) %>%
  # filter(name %in% c("married", "mvInPrtnr")) %>%
  group_by(recode, p_year) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = pmap(list(recode, data, p_year), recode_fun)) %>%
  unnest(data) %>% distinct()

liss_waves <- p_waves %>% filter(Study == "LISS") %>% select(Used) %>% distinct()

liss_out <- liss_out %>%
  # filter(year <= max(gsoep_waves$Used)) %>%
  group_by(SID, name, year) %>%
  summarize(value = max(value, na.rm = T)) %>% 
  ungroup() %>%
  mutate(value = ifelse(is.nan(value)|is.infinite(value), NA, value))

# composite across years
comp_fun <- function(p_year){
  liss_out %>%
    mutate(group = ifelse(year > p_year, "future", "past")) %>%
    group_by(SID, name, group) %>%
    summarize(value = max(value, na.rm = T)) %>%
    ungroup() %>%
    mutate(value = ifelse(is.infinite(value), NA, value)) %>%
    pivot_wider(names_from = group, values_from = value) %>%
    group_by(SID, name) %>%
    mutate(value = ifelse(is.na(past) | (past == 0 & !is.na(future)), future,
                   ifelse(past == 0 & is.na(future), past, 
                   ifelse(past == 1, NA, NA)))) 
}

liss_out <- tibble(p_year = liss_waves$Used) %>%
  mutate(data = map(p_year, comp_fun)) %>%
  unnest(data) 

2.7.6 Imputation

# short helper functions to (1) identify and create factors, save col names of factors, remove columns with all missing values, and setup amelia
factor_fun <- function(x){if(is.numeric(x)){diff(range(x, na.rm = T)) %in% 1:2 & length(unique(x)) <= 4} else{F}}
not_all_id <- function(x) if(is.numeric(x)) sd(x, na.rm = T) != 0
mice_fun <- function(df){
  mice(df, m = 5, maxit=5, printFlag=TRUE, method = "cart")
  }

liss_match_imp <- liss_match %>%
  rename(NegAff = `NA`, race = ethnicity) %>%
  group_by(SID, year) %>%
  mutate(parOccPrstg = max(cbind(momOccPrstg, dadOccPrstg), na.rm = T)) %>%
  select(-momOccPrstg, -dadOccPrstg, -HHID) %>%
  ungroup() %>%
  mutate_all(~ifelse(is.infinite(.), NA, .)) %>%
  filter(!is.na(year) & !is.na(SID)) %>%
  group_by(year) %>% 
  nest() %>%
  ungroup() %>%
  mutate(data = map(data, ~(.) %>% select_if(not_all_na) %>% select_if(not_all_id)),
         data = map(data, ~(.) %>% mutate_if(factor_fun, as.factor)),
         imp = map(data, mice_fun))
beepr::beep(sound = 8)

liss_match_imp <- liss_match_imp %>%
  mutate(imp_df = map(imp, ~mice::complete(., "long")),
         imp_df = map(imp_df, ~(.) %>% mutate(PhysFunc = factor(ceiling(PhysFunc)))),
         imp_df = map(imp_df, ~(.) %>% 
    group_by(SID) %>%
    mutate(parEdu = max(cbind(momEdu, dadEdu), na.rm = T)) %>%
    select(-momEdu, -dadEdu) %>%
    ungroup()),
         imp_sca = map(imp_df, ~(.) %>% filter(.imp == 1) %>% select(SID,  one_of(unique(specifications$name)))))

liss_match_imp_long <- liss_match_imp %>%
  select(p_year = year, imp_df) %>%
  unnest(imp_df)

liss_SCA_imp <- liss_match_imp %>%
  select(p_year = year, imp_sca) %>%
  unnest(imp_sca) %>%
  left_join(liss_SCA %>% select(p_year=year, SID,
        colnames(liss_SCA)[!colnames(liss_SCA) %in% colnames(.)]))

2.8 MIDUS

The Midlife in the United States (MIDUS; Brim, Ryff, & Kessler, 2004; Ryff et al., 2012, 2016) study is an ongoing longitudinal study of adults in the United States. These data are available at http://www.icpsr.umich.edu by making a free account.

Participants included more than 10,000 individuals aged 25 or older from the United States. The present study uses data from MIDUS I, II, and III. MIDUS I was collected in 1995-1996. MIDUS II was the follow-up to MIDUS I and was collected from 2004-2006. MIDUS III was an additional follow-up conducted from 2013-2014. More information can be found at http://midus.wisc.edu/findings/Understanding_Data_Collection_in_MIDUS.pdf.

Sample size varies by wave, with 7,108 (MIDUS I), 4,963 (MIDUS II), 3,294 (MIDUS III). This provides 99/% power to detect a zero-order correlation effect size of ~.06, two-tailed at alpha .05.

2.8.1 Load Data

## # A tibble: 1,051 x 16
##    study dataset category name   itemname  wave   year new_itemname  orig_itemname description    scale        reverse_code recode   mini  maxi comp_rule
##    <chr> <chr>   <chr>    <chr>  <chr>     <chr> <dbl> <chr>         <chr>         <chr>          <chr>        <chr>        <chr>   <dbl> <dbl> <chr>    
##  1 midus M2P1    match    achvm… hghStndr… M2P1   2004 midus__match… B1SE7FF       Set very high… "1\tTRUE OF… no           ifelse…     1     4 <NA>     
##  2 midus M3P1    match    achvm… hghStndr… M3P1   2013 midus__match… C1SE7FF       Set very high… "-1\tRESPON… no           ifelse…     1     4 <NA>     
##  3 midus M2P1    match    achvm… keepWrki… M2P1   2004 midus__match… B1SE7L        Keep working … "1\tTRUE OF… no           ifelse…     1     4 <NA>     
##  4 midus M3P1    match    achvm… keepWrki… M3P1   2013 midus__match… C1SE7L        Keep working … "-1\tRESPON… no           ifelse…     1     4 <NA>     
##  5 midus M2P1    match    achvm… likeDiff  M2P1   2004 midus__match… B1SE7O        I like to try… "1\tTRUE OF… no           ifelse…     1     4 <NA>     
##  6 midus M3P1    match    achvm… likeDiff  M3P1   2013 midus__match… C1SE7O        I like to try… "-1\tRESPON… no           ifelse…     1     4 <NA>     
##  7 midus M2P1    match    achvm… likeWrk   M2P1   2004 midus__match… B1SE7R        I like hard w… "1\tTRUE OF… no           ifelse…     1     4 <NA>     
##  8 midus M3P1    match    achvm… likeWrk   M3P1   2013 midus__match… C1SE7R        I like hard w… "-1\tRESPON… no           ifelse…     1     4 <NA>     
##  9 midus M1P1    match    age    age       M1P1   1994 midus__match… A1PAGE_M2     M1 age comput… "numeric"    <NA>         <NA>       NA    NA <NA>     
## 10 midus M2P1    match    age    age       M2P1   2004 midus__match… B1PAGE_M2     M1 age comput… "numeric"    <NA>         <NA>       NA    NA <NA>     
## # … with 1,041 more rows

2.8.2 Matching Variables

rename_fun <- function(cb, var){
  old.names <- unique((midus_codebook %>% filter(name == var))$orig_itemname)
  df <- midus %>% 
    select(SID = M2ID, one_of(old.names)) %>%
    gather(key = orig_itemname, value = value, -SID) %>%
    mutate(value = as.numeric(value))
  if(length(old.names) > 1){
      df %>% left_join(cb %>% select(itemname, year, orig_itemname, reverse_code:comp_rule))
  } else {
    df %>% left_join(cb %>% select(-(itemname:year),  -new_itemname, -dataset) %>% distinct()) %>% mutate(year = 0)
  }
}

# rename variables   
midus_match <- midus_codebook %>%
  filter(category == "match") %>%
  group_by(name) %>% 
  nest() %>%
  ungroup() %>%
  mutate(data = map2(data, name, rename_fun)) %>% 
  unnest(data) %>%
  distinct()

# bring in year or birth for cleaning
yrBrth <- midus_match %>% 
  filter(name == "age") %>% 
  mutate(yearBrth = year - value) %>%
  group_by(SID) %>%
  summarize(yearBrth = max(yearBrth, na.rm = T)) %>%
  ungroup() %>%
  mutate(yearBrth = ifelse(is.infinite(yearBrth), NA, yearBrth))

# recode 

recode_fun <- function(rule, y, year){
  yearBrth <- y$yearBrth
  x <- y$value
  if(!is.na(rule)){y$value <- eval(parse(text = rule))}
  return(y)
}

midus_match <- midus_match %>%
  left_join(yrBrth) %>%
  group_by(recode, year) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = pmap(list(recode, data, year), recode_fun)) %>%
  unnest(data)

# reverse code 
midus_match <- midus_match %>%
  mutate(value = ifelse(reverse_code == "no" | is.na(reverse_code), value, 
                        reverse.code(-1, value, mini = mini, maxi = maxi)))

fun_call <- function(x, rule){
    switch(rule,
           average = mean(x, na.rm = T),
           mode = Mode(x)[1],
           sum = sum(x, na.rm = T),
           skip = unique(x)[1],
           max = max(x, na.rm = T),
           min = min(x, na.rm = T))
  }

# compositing within years
year_comp_fun <- function(df, rule){
  df %>%
    # group by person and item (collapse across age)
    group_by(SID, yearBrth, name, year) %>% 
    summarize(value = fun_call(value, rule)) %>%
    ungroup() %>% 
    mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
}

midus_waves <- p_waves %>% filter(Study == "MIDUS") %>% select(Used) %>% distinct()

midus_match <- midus_match %>%
  filter(year <= max(midus_waves$Used)) %>%
  mutate(comp_rule = ifelse(is.na(comp_rule) | comp_rule == "none", "skip", comp_rule)) %>%
  group_by(comp_rule) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = map2(data, comp_rule, year_comp_fun)) %>%
  unnest(data)

comp_fun <- function(rule, p_year){
  midus_match %>%
    filter(year <= p_year  & comp_rule == rule) %>%
    group_by(SID, yearBrth, name) %>%
    summarize(value = fun_call(value, rule)) %>%
    ungroup()
}

midus_match <- crossing(
  p_year = midus_waves$Used, 
  comp_rule = unique(midus_match$comp_rule)
  ) %>%
  mutate(data = map2(comp_rule, p_year, comp_fun)) %>%
  unnest(data) %>%
  mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value)) %>%
  select(-comp_rule) %>%
  pivot_wider(names_from = name, values_from = value, values_fn = list(value = max)) 

2.8.3 Personality Variables

## # A tibble: 118,712 x 4
## # Groups:   SID, name [78,545]
##      SID name   year value
##    <dbl> <chr> <dbl> <dbl>
##  1 10001 A      1994  3   
##  2 10001 A      2004  2.6 
##  3 10001 C      1994  2.75
##  4 10001 C      2004  3.25
##  5 10001 E      1994  2.8 
##  6 10001 E      2004  2.8 
##  7 10001 LOC    1994  5.5 
##  8 10001 LOC    2004  4.75
##  9 10001 N      1994  2   
## 10 10001 N      2004  2.25
## # … with 118,702 more rows

2.8.4 Outcome Variables

rename_fun <- function(cb, var){
  old.names <- unique((midus_codebook %>% filter(name == var))$orig_itemname)
  df <- midus %>% 
    select(SID = M2ID, one_of(old.names)) %>%
    gather(key = orig_itemname, value = value, -SID) %>%
    left_join(cb %>% select(itemname, year, orig_itemname, reverse_code:comp_rule))
}

midus_out <- midus_codebook %>%
  filter(category == "out") %>%
  group_by(name) %>% 
  nest() %>%
  ungroup() %>%
  mutate(data = map2(data, name, rename_fun)) %>% 
  unnest(data) %>%
  distinct() %>%
  full_join(crossing(p_year = midus_waves$Used, name = unique((.)$name)))

# recode
recode_fun <- function(rule, y, p_year, year){
  yearBrth <- y$yearBrth
  x <- y$value
  if(!is.na(rule)){y$value <- eval(parse(text = rule))}
  return(y)
}

midus_out <- midus_out %>%
  select(name, SID, value:year, recode, p_year) %>%
  left_join(yrBrth) %>%
  group_by(recode, p_year, year) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = pmap(list(recode, data, p_year, year), recode_fun)) %>%
  unnest(data) %>% distinct()

midus_out <- midus_out %>% filter(name != "mvInPrtnr") %>%
  select(-recode) %>%
  full_join(
    midus_out %>% filter(name %in% c("mvInPrtnr", "married")) %>%
      group_by(p_year, name, SID, year) %>%
      summarize(value = max(value, na.rm = T)) %>%
      ungroup() %>%
      mutate(value = ifelse(is.nan(value)|is.infinite(value), NA, value)) %>%
      spread(name, value) %>%
      mutate(name = "mvInPrtnr",
             value = ifelse((married == 0 | is.na(married)) & mvInPrtnr == 1 
                            & !is.na(mvInPrtnr), 1, 0)) %>%
      select(-married, -mvInPrtnr)
  )

midus_waves <- p_waves %>% filter(Study == "MIDUS") %>% select(Used) %>% distinct()

midus_out <- midus_out %>%
  # filter(year <= max(gsoep_waves$Used)) %>%
  group_by(SID, name, year) %>%
  summarize(value = max(value, na.rm = T)) %>% 
  ungroup() %>%
  mutate(value = ifelse(is.nan(value)|is.infinite(value), NA, value))

# composite across years
comp_fun <- function(p_year){
  midus_out %>%
    mutate(group = ifelse(year > p_year, "future", "past")) %>%
    group_by(SID, name, group) %>%
    summarize(value = max(value, na.rm = T)) %>%
    ungroup() %>%
    mutate(value = ifelse(is.infinite(value), NA, value)) %>%
    pivot_wider(names_from = group, values_from = value) %>%
    group_by(SID, name) %>%
    mutate(value = ifelse(is.na(past) | (past == 0 & !is.na(future)), future,
                   ifelse(past == 0 & is.na(future), past, 
                   ifelse(past == 1, NA, NA)))) 
}

midus_out <- tibble(p_year = midus_waves$Used) %>%
  mutate(data = map(p_year, comp_fun)) %>%
  unnest(data) 
## # A tibble: 202,405 x 4
##    p_year   SID name         value
##     <dbl> <dbl> <chr>        <dbl>
##  1   1994 10001 chldbrth        NA
##  2   1994 10001 crim             0
##  3   1994 10001 divorced        NA
##  4   1994 10001 edu             NA
##  5   1994 10001 frstjob         NA
##  6   1994 10001 married         NA
##  7   1994 10001 mntlhlthevnt    NA
##  8   1994 10001 mortality        0
##  9   1994 10001 mvInPrtnr        0
## 10   1994 10001 physhlthevnt     1
## # … with 202,395 more rows

2.8.5 Covariates

## # A tibble: 15,184 x 20
##      SID p_year   age education gender grsWages  race physhlthevnt SRhealth smokes alcohol exercise   BMI married numKids parDivorce PhysFunc religion
##    <dbl>  <dbl> <dbl>     <dbl>  <dbl>    <dbl> <dbl>        <dbl>    <dbl>  <dbl>   <dbl>    <dbl> <dbl>   <dbl>   <dbl>      <dbl> <fct>       <dbl>
##  1 10001   1994    53         1      0   300000     0            0      7        1       1     1     26.5       1       3         NA 0               2
##  2 10001   2004    53         1      0   300000     0            1      7        1       1     1     26.4       1       3         NA 0               2
##  3 10002   1994    60         2      0       NA    NA            0     NA       NA       1    NA     NA         1       2         NA <NA>           NA
##  4 10002   2004    60         2      0   235500    NA            1      8       NA       1     2.4   24.1       1       2         NA 0               1
##  5 10004   1994    69         2      0    54000     0            0      7        1       0     4.5   21.4       1       6         NA 0               1
##  6 10005   1994    70         0      1    27500     0            1      9        0       0     3.5   26.5       1       2         NA 1               0
##  7 10005   2004    70         0      1    13750     0            1      8.5      0       0     3.25  26.5       1       2         NA 1               0
##  8 10006   1994    51         1      1    34000     0            1      8        0       0     3.5   26.9       1       2         NA 1               1
##  9 10006   2004    51         1      1    34000     0            1      8        1       0     3.5   26.9       1       2         NA 1               1
## 10 10007   1994    35         0      1    46500     1            0      7        1       0     3.5   24.4       1       2          1 0               1
## # … with 15,174 more rows, and 2 more variables: parEdu <dbl>, parOccPrstg <dbl>

2.8.6 Imputation

# short helper functions to (1) identify and create factors, save col names of factors, remove columns with all missing values, and setup amelia
factor_fun <- function(x){if(is.numeric(x)){diff(range(x, na.rm = T)) %in% 1:2 & length(unique(x)) <= 4} else{F}}
not_all_id <- function(x) if(is.numeric(x)) sd(x, na.rm = T) != 0
mice_fun <- function(df){
  mice(df, m = 5, maxit=5, printFlag=TRUE)
  }
midus_match_imp <- midus_match %>%
  rename(NegAff = `NA`) %>%
  group_by(SID, year) %>%
  mutate(parOccPrstg = max(cbind(momOccPrstg, dadOccPrstg), na.rm = T)) %>%
  select(-momOccPrstg, -dadOccPrstg) %>%
  ungroup() %>%
  mutate_all(~ifelse(is.infinite(.), NA, .)) %>%
  filter(!is.na(year) & !is.na(SID)) %>%
  group_by(year) %>% 
  nest() %>%
  ungroup() %>%
  mutate(data = map(data, ~(.) %>% select_if(not_all_na) %>% select_if(not_all_id)),
         data = map(data, ~(.) %>% mutate_if(factor_fun, as.factor)),
         imp = map(data, mice_fun))
beepr::beep(sound = 8)

midus_match_imp <- midus_match_imp %>%
  mutate(imp_df = map(imp, ~mice::complete(., "long")),
         imp_df = map(imp_df, ~(.) %>% mutate(PhysFunc = factor(ifelse(PhysFunc < 2, 1, 0)))),
         imp_df = map(imp_df, ~(.) %>% 
    group_by(SID) %>%
    mutate(parEdu = max(cbind(momEdu, dadEdu), na.rm = T)) %>%
    select(-momEdu, -dadEdu) %>%
    ungroup()),
         imp_sca = map(imp_df, ~(.) %>% filter(.imp == 1) %>% select(SID,  one_of(unique(specifications$name)))))

midus_match_imp_long <- midus_match_imp %>%
  select(p_year = year, imp_df) %>%
  unnest(imp_df)

midus_SCA_imp <- midus_match_imp %>%
  select(p_year = year, imp_sca) %>%
  unnest(imp_sca) %>%
  left_join(midus_SCA %>% select(p_year, SID,
        colnames(midus_SCA)[!colnames(midus_SCA) %in% colnames(.)]))
unique(specifications$name)[!unique(specifications$name) %in% colnames(midus_SCA_imp)]

2.9 NLSY

The Children to Young Adults Study (CNLSY; Bureau of Labor Statistics, 2017) is an offshoot study of the National Longitudinal Study of Youth (NLSY79), which is an ongoing longitudinal, nationally representative study in the United States. These data are available on the National Bureau of Labour Statistics website dedicated to the NLSY studies by creating a free account (https://www.nlsinfo.org/investigator/pages/login).

Participants included more than 12,500 individuals in the United States that began in 1979. The CNLSY includes the biological children of the NLSY79 participants and began in 1986. Children (10 years and older) completed separate inventories from children (or “young adults”) aged 15 and above. Mothers of children 10 and below also completed surveys on the children prior to age 10. All participants were interviewed in addition to surveys.

Sample sizes vary by year, ranging from approximately 1,331 (1979) to 11,530 (2016). This provides 99/% power to detect a zero-order correlation effect size of ~.05.

2.9.1 Load Data

## # A tibble: 5,339 x 21
##    study dataset category name  itemname year  new_itemname orig_itemname QNAME description scale reverse_code recode  mini  maxi comp_rule item_rule
##    <chr> <lgl>   <chr>    <chr> <chr>    <chr> <chr>        <chr>         <chr> <chr>       <chr> <chr>        <chr>  <dbl> <dbl> <chr>     <chr>    
##  1 nlsy  NA      match    acti… Activit… 2002  nlsy__match… Y1263100      Q4-3… DOES R BEL… 1 = … <NA>         ifels…    NA    NA max       max      
##  2 nlsy  NA      match    acti… Activit… 2004  nlsy__match… Y1496800      Q4-3… DOES R BEL… 1 = … <NA>         ifels…    NA    NA max       max      
##  3 nlsy  NA      match    acti… Activit… 2006  nlsy__match… Y1746700      Q4-3… DOES R BEL… 1 = … <NA>         ifels…    NA    NA max       max      
##  4 nlsy  NA      match    acti… Activit… 2008  nlsy__match… Y2027500      Q4-3… DOES R BEL… 1 = … <NA>         ifels…    NA    NA max       max      
##  5 nlsy  NA      match    acti… Activit… 2010  nlsy__match… Y2352600      Q4-3… DOES R BEL… 1 = … <NA>         ifels…    NA    NA max       max      
##  6 nlsy  NA      match    acti… Activit… 2012  nlsy__match… Y2682100      Q4-3… DOES R BEL… 1 = … <NA>         ifels…    NA    NA max       max      
##  7 nlsy  NA      match    acti… Activit… 2014  nlsy__match… Y3037500      Q4-3… DOES R BEL… 1 = … <NA>         ifels…    NA    NA max       max      
##  8 nlsy  NA      match    acti… Outside… 2002  nlsy__match… Y1263200      Q4-3… DOES R BEL… 1 = … <NA>         ifels…    NA    NA max       max      
##  9 nlsy  NA      match    acti… Outside… 2004  nlsy__match… Y1496900      Q4-3… DOES R BEL… 1 = … <NA>         ifels…    NA    NA max       max      
## 10 nlsy  NA      match    acti… Outside… 2006  nlsy__match… Y1746800      Q4-3… DOES R BEL… 1 = … <NA>         ifels…    NA    NA max       max      
## # … with 5,329 more rows, and 4 more variables: Include <lgl>, ...19 <lgl>, ...20 <lgl>, ...21 <lgl>
## # A tibble: 88 x 18
##    study dataset category name  itemname year  new_itemname orig_itemname QNAME description scale reverse_code recode mini  maxi  comp_rule item_rule
##    <chr> <lgl>   <chr>    <chr> <chr>    <chr> <chr>        <chr>         <chr> <chr>       <chr> <chr>        <chr>  <lgl> <lgl> <chr>     <chr>    
##  1 nlsy  NA      match    momO… momOcc   1982  <NA>         R0828000      CPSO… OCCUPATION… <NA>  no           ifels… NA    NA    max       max      
##  2 nlsy  NA      match    momO… momOcc   1983  <NA>         R0945300      CPSO… OCCUPATION… <NA>  <NA>         ifels… NA    NA    max       max      
##  3 nlsy  NA      match    momO… momOcc   1984  <NA>         R1255700      CPSO… OCCUPATION… <NA>  <NA>         ifels… NA    NA    max       max      
##  4 nlsy  NA      match    momO… momOcc   1985  <NA>         R1650500      CPSO… OCCUPATION… <NA>  <NA>         ifels… NA    NA    max       max      
##  5 nlsy  NA      match    momO… momOcc   1986  <NA>         R1923100      CPSO… OCCUPATION… <NA>  <NA>         ifels… NA    NA    max       max      
##  6 nlsy  NA      match    momO… momOcc   1987  <NA>         R2317900      CPSO… OCCUPATION… <NA>  <NA>         ifels… NA    NA    max       max      
##  7 nlsy  NA      match    momO… momOcc   1988  <NA>         R2525700      CPSO… OCCUPATION… <NA>  <NA>         ifels… NA    NA    max       max      
##  8 nlsy  NA      match    momO… momOcc   1989  <NA>         R2924700      CPSO… OCCUPATION… <NA>  <NA>         ifels… NA    NA    max       max      
##  9 nlsy  NA      match    momO… momOcc   1990  <NA>         R3127400      CPSO… OCCUPATION… <NA>  <NA>         ifels… NA    NA    max       max      
## 10 nlsy  NA      match    momO… momOcc   1991  <NA>         R3523100      CPSO… OCCUPATION… <NA>  <NA>         ifels… NA    NA    max       max      
## # … with 78 more rows, and 1 more variable: Include <lgl>

2.9.2 Matching Variables

rename_fun <- function(cb, var){
  old.names <- unique((cnlsy_codebook %>% filter(name == var))$orig_itemname)
  df <- nlsy %>% 
    select(SID = C0000100, yearBrth = C0005700, one_of(old.names)) %>%
    gather(key = orig_itemname, value = value, -yearBrth,
           -SID, na.rm=T)
  if(length(old.names) > 1){
      df %>% left_join(cb %>% select(itemname, year, orig_itemname, reverse_code:item_rule))
  } else {
    df %>% left_join(cb %>% select(-(itemname:year),  -new_itemname, -dataset) %>% distinct()) %>% mutate(year = "0")
  }
}

# rename variables   
nlsy_match <- cnlsy_codebook %>%
  filter(category == "match" & year != "XRND") %>%
  group_by(name) %>% 
  nest() %>%
  ungroup() %>%
  filter(name != "yearBrth") %>%
  mutate(data = map2(data, name, rename_fun))  %>%
  unnest(data) %>%
  distinct() 

# single time point variables 
old.names <- (cnlsy_codebook %>% filter(category == "match" & year == "XRND"))$orig_itemname 
nlsy_match_stp <- nlsy %>%
  select(SID = C0000100, yearBrth = C0005700, one_of(old.names)) %>%
  gather(key = orig_itemname, value = value, -SID, -yearBrth, na.rm=T) %>%
  left_join(cnlsy_codebook %>% select(name:year, orig_itemname, recode, comp_rule, item_rule))
# recode 

recode_fun <- function(rule, y, p_year){
  yearBrth <- y$yearBrth
  x <- y$value
  if(!is.na(rule)){y$value <- eval(parse(text = rule))}
  return(y)
}

nlsy_match <- nlsy_match %>%
  group_by(recode, year) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = pmap(list(recode, data, year), recode_fun)) %>%
  unnest(data)

nlsy_waves <- p_waves %>% filter(Study == "NLSY") %>% select(Used) %>% distinct()

nlsy_match_stp <- nlsy_match_stp %>%
  full_join(crossing(p_year = nlsy_waves$Used, year = "XRND")) %>%
  group_by(recode, p_year, year) %>%
  nest() %>% 
  ungroup() %>%
  mutate(data = pmap(list(recode, data, p_year), recode_fun)) %>%
  unnest(data)

# reverse code 
nlsy_match <- nlsy_match %>%
  mutate(value = ifelse(reverse_code == "no" | is.na(reverse_code), value, 
                        reverse.code(-1, value, mini = mini, maxi = maxi)))

occ_recode_fun <- function(year, df){
  if(year == 2000){df %>% mutate(value = mapvalues(value, npb$OCC00, npb$NPB))}
  else if(year == 1990){
    df %>% mutate(value = mapvalues(value, occ90to00$OCC90, occ90to00$OCC00), 
                  value = mapvalues(value, npb$OCC00, npb$NPB),
                  value = ifelse(value > 100, NA, value))
  } else{
    df %>% mutate(value = mapvalues(value, occ70to90$OCC70, occ70to90$OCC90),
                  value = mapvalues(value, occ90to00$OCC90, occ90to00$OCC00), 
                  value = mapvalues(value, npb$OCC00, npb$NPB),
                  value = ifelse(value > 100, NA, value))
  }
}

nlsy_match_occ <- nlsy_match %>%
  filter(grepl("Occ", name)) %>%
  filter(!is.na(value) & year >= 1984) %>% 
  mutate(group = ifelse(year < 2000, 1970, ifelse(year >= 2002, 2000, 1990))) %>%
  group_by(group) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = map2(group, data, occ_recode_fun)) %>%
  unnest(data) %>%
  mutate(name = paste(name, "Prstg", sep = "")) %>%
  select(-group)

fun_call <- function(x, rule){
    switch(rule,
           average = mean(x, na.rm = T),
           mode = Mode(x)[1],
           sum = sum(x, na.rm = T),
           skip = unique(x)[1],
           max = max(x, na.rm = T),
           min = min(x, na.rm = T),
           multiply = prod(x, na.rm = T))
  }

# compositing within years
year_comp_fun <- function(df, rule){
  df %>%
    # group by person and item (collapse across age)
    group_by(SID, yearBrth, name, year) %>% 
    summarize(value = fun_call(value, rule)) %>%
    ungroup() %>% 
    mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
}

nlsy_match <- nlsy_match %>%
  filter(year <= max(nlsy_waves$Used) & !grepl("Occ", name)) %>%
  full_join(nlsy_match_occ) %>%
  group_by(comp_rule, item_rule) %>%
  nest() %>%
  ungroup() %>%
  mutate(item_rule = ifelse(is.na(item_rule), "skip", item_rule),
         data = map2(data, item_rule, year_comp_fun)) %>%
  unnest(data)

comp_fun <- function(rule, p_year){
  nlsy_match %>%
    filter(year <= p_year  & comp_rule == rule) %>%
    full_join(
      nlsy_match_stp %>% 
        select(SID, yearBrth, p_year, name, value, comp_rule) %>%
        filter(comp_rule == rule & p_year == p_year)
      ) %>%
    group_by(SID, yearBrth, name) %>%
    summarize(value = fun_call(value, rule)) %>%
    ungroup()
}

nlsy_match <- crossing(
  p_year = nlsy_waves$Used, 
  comp_rule = unique(nlsy_match$comp_rule)
  ) %>%
  mutate(comp_rule = ifelse(is.na(comp_rule), "skip", comp_rule),
         data = map2(comp_rule, p_year, comp_fun)) %>%
  unnest(data) %>%
  mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value)) %>%
  select(-comp_rule) %>%
  pivot_wider(names_from = name, values_from = value, values_fn = list(value = max))
## # A tibble: 34,590 x 120
## # Groups:   SID [11,530]
##      SID dadEdu momEdu yearBrth  year     A     C    DEP     E    IQ   LOC     N     O    SE    SS activity AgeBegDate batheSelf childhoodSS chores
##    <dbl>  <dbl>  <dbl>    <dbl> <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>      <dbl>     <dbl>       <dbl>  <dbl>
##  1   201     NA      1     1993  2000    NA  NA   NA      NA    17.6 NA     NA    NA    NA   NA          NA       NA           1       112.    2.75
##  2   201     NA      1     1993  2006    NA  NA   NA      NA    NA   NA     NA    NA    NA   NA          NA       NA           3       118.    3.55
##  3   201     NA      1     1993  2008    NA  NA   NA      NA    NA   NA     NA    NA    NA   NA          NA       NA           3       118.    3.55
##  4   202     NA      1     1994  2000    NA  NA   NA      NA    16.3 NA     NA    NA    NA   NA          NA       NA          NA       103.   NA   
##  5   202     NA      1     1994  2006    NA  NA   NA      NA    NA   NA     NA    NA    NA   NA          NA       NA           5       108.    3.9 
##  6   202     NA      1     1994  2008    NA  NA   NA      NA    NA   NA     NA    NA    NA   NA          NA       NA           5       108.    4.18
##  7   301     NA      1     1981  2008     2   3.5  0.571   2.5  NA    1.14   0.5   3.5   1.7  3.67       NA       12.5         3       112.    4.53
##  8   301     NA      1     1981  2000    NA  NA   NA      NA    NA   NA     NA    NA    NA   NA          NA       12.5         3       112.    4.53
##  9   301     NA      1     1981  2006    NA  NA   NA      NA    NA   NA     NA    NA    NA   NA          NA       12.5         3       112.    4.53
## 10   302     NA      1     1983  2000    NA  NA   NA      NA    NA   NA     NA    NA    NA    0          NA       11.5         5        97.5   4.75
## # … with 34,580 more rows, and 100 more variables: compliance <dbl>, difficulty <dbl>, discipline <dbl>, fearfulness <dbl>, foodChoice <dbl>,
## #   FreqSmk <dbl>, genRoles <dbl>, grsWages <dbl>, hsGrades <dbl>, Impls <dbl>, insecureAttach <dbl>, manageTIme <dbl>, nbhdQual <dbl>,
## #   parAffection <dbl>, parLimitations <dbl>, parTalkTV <dbl>, peerPressure <dbl>, positiveAffect <dbl>, praisedByPar <dbl>, reads <dbl>, relDad <dbl>,
## #   relMom <dbl>, RelProb <dbl>, relSat <dbl>, satJob <dbl>, satSchl <dbl>, SensSeek <dbl>, SRhealth <dbl>, timeHW <dbl>, timeWOthrs <dbl>,
## #   TVtime <dbl>, ADHD <dbl>, alcohol <dbl>, behProb <dbl>, BPI <dbl>, crprlPunish <dbl>, dadAlive <dbl>, dropOutSchl <dbl>, drugs <dbl>,
## #   dyslexia <dbl>, edu <dbl>, education <dbl>, employed <dbl>, familyProb <dbl>, height <dbl>, hlthcare <dbl>, hlthProbs <dbl>, Jail <dbl>,
## #   learnDis <dbl>, learningDis <dbl>, married <dbl>, momAge <dbl>, momAlcPreg <dbl>, momAlive <dbl>, momCocPreg <dbl>, momOccPrstg <dbl>,
## #   momRedAlcPreg <dbl>, momRedCalPreg <dbl>, momRedSaltPreg <dbl>, momRedSmkPreg <dbl>, momSmkPreg <dbl>, momVitPreg <dbl>, momWeedPreg <dbl>,
## #   musicInst <dbl>, nightmares <dbl>, numKids <dbl>, numSiblng <dbl>, parDivorce <dbl>, physAct <dbl>, probation <dbl>, race <dbl>, religion <dbl>,
## #   remedialClass <dbl>, seeDad <dbl>, shynessProb <dbl>, sleepProb <dbl>, smokes <dbl>, unempEen <dbl>, weight <dbl>, welfare <dbl>, dadOccPrstg <dbl>,
## #   HHsize <dbl>, liveDad <dbl>, ParEngmnt <dbl>, privateSchl <dbl>, urban <dbl>, readingMat <dbl>, tantrums <dbl>, ageMarried <dbl>, bornLate <dbl>,
## #   durBreastFed <dbl>, gender <dbl>, dscplnFreq <dbl>, eatingHabits <dbl>, exercise <dbl>, activities <dbl>, Anger <dbl>, physhlthevnt <dbl>,
## #   age <dbl>, BMI <dbl>

2.9.3 Personality Variables

## # A tibble: 77,697 x 5
## # Groups:   SID, yearBrth, name [63,435]
##      SID yearBrth name  year   value
##    <dbl>    <dbl> <chr> <chr>  <dbl>
##  1   201     1993 IQ    2000  17.6  
##  2   202     1994 IQ    2000  16.3  
##  3   301     1981 A     2008   2    
##  4   301     1981 C     2008   3.5  
##  5   301     1981 DEP   2008   0.571
##  6   301     1981 E     2008   2.5  
##  7   301     1981 LOC   2008   1.14 
##  8   301     1981 N     2008   0.5  
##  9   301     1981 O     2008   3.5  
## 10   301     1981 SE    2008   1.7  
## # … with 77,687 more rows

2.9.4 Outcome Variables

nlsy_out <- cnlsy_codebook %>%
  filter(category == "out" & year != "XRND") %>%
  group_by(name) %>% 
  nest() %>%
  ungroup() %>%
  filter(name != "yearBrth") %>%
  mutate(data = map2(data, name, rename_fun))  %>%
  unnest(data) %>%
  distinct() %>%
  full_join(crossing(p_year = nlsy_waves$Used, name = unique((.)$name)))

old.names <- (cnlsy_codebook %>% filter(category == "out" & year == "XRND"))$orig_itemname 
nlsy_out_stp <- nlsy %>%
  select(SID = C0000100, yearBrth = C0005700, one_of(old.names)) %>%
  gather(key = orig_itemname, value = value, -SID, -yearBrth, na.rm=T) %>%
  left_join(cnlsy_codebook %>% filter(category == "out") %>% 
              select(name:year, orig_itemname, recode, comp_rule, item_rule))

# recode
recode_fun <- function(rule, y, p_year){
  yearBrth <- y$yearBrth
  x <- y$value
  if(!is.na(rule)){y$value <- eval(parse(text = rule))}
  return(y)
}

# coding unemployment
jobs <- nlsy_out %>%
  filter(name == "unemployed" & (grepl("start", itemname) | grepl("end", itemname))) %>%
  select(name, SID, value, itemname, year, p_year) %>%
  mutate(value = ifelse(is.na(value) | value < 0, NA, value),
         timing = ifelse(grepl("start", itemname), "start", "end"),
         itemname = str_remove(itemname, "start"), itemname = str_remove(itemname, "end")) %>%
  filter(!is.na(value)) %>%
  separate(itemname, c("time", "job"), -1) %>%
  spread(time, value) %>%
  mutate(centMnth = (Year - 1950)*12 + Month) %>%
  select(-Year, -Month) %>%
  spread(timing, centMnth) %>%
  group_by(SID, year, p_year) %>%
  mutate(lead_end = lead(end),
         gap = start-lead_end) %>%
  group_by(name, SID, year, p_year, job) %>%
  summarize(value = ifelse(gap > 1, 1, 0)) %>%
  group_by(name, SID, year, p_year) %>%
  summarize(value = max(value, na.rm = T))

nlsy_out <- nlsy_out %>%
  filter(!(name == "unemployed" & (grepl("start", itemname) | grepl("end", itemname)))) %>%
  select(-orig_itemname) %>%
  group_by(recode) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = map2(recode, data, recode_fun)) %>%
  unnest(data) %>%
  distinct()

nlsy_waves <- p_waves %>% filter(Study == "NLSY") %>% select(Used) %>% distinct()

nlsy_out_stp <- nlsy_out_stp %>%
  full_join(crossing(p_year = nlsy_waves$Used, year = "XRND")) %>%
  group_by(recode, p_year, year) %>%
  nest() %>% 
  ungroup() %>%
  mutate(recode = str_replace(recode, "if\\(p_years", "ifelse\\(p_year"),
         data = pmap(list(recode, data, p_year), recode_fun)) %>%
  unnest(data) %>%
  select(p_year, SID, name, value)

nlsy_out_stp <- nlsy_out_stp %>%
  filter(name == "education") %>%
  mutate(name = "edu") %>%
  group_by(name, p_year, SID) %>%
  summarize(value = max(value, na.rm = T))%>%
  ungroup() %>%
  full_join(nlsy_out_stp %>% filter(name != "education"))

# composite within years
nlsy_out <- nlsy_out %>%
  select(name:year) %>%
  # filter(year <= max(gsoep_waves$Used)) %>%
  group_by(SID, name, year, yearBrth) %>%
  summarize(value = max(value, na.rm = T)) %>% 
  ungroup() %>%
  mutate(value = ifelse(is.nan(value)|is.infinite(value), NA, value))

# composite across years
comp_fun <- function(P_year){
  nlsy_out %>%
    left_join(jobs %>% filter(p_year == P_year) %>% select(-p_year)) %>%
    mutate(group = ifelse(year > P_year, "future", "past")) %>%
    group_by(SID, name, group) %>%
    summarize(value = max(value, na.rm = T)) %>%
    ungroup() %>%
    mutate(value = ifelse(is.infinite(value), NA, value)) %>%
    pivot_wider(names_from = group, values_from = value) %>%
    group_by(SID, name) %>%
    mutate(value = ifelse(is.na(past) | (past == 0 & !is.na(future)), future,
                   ifelse(past == 0 & is.na(future), past, 
                   ifelse(past == 1, NA, NA)))) 
}

nlsy_out <- tibble(p_year = nlsy_waves$Used) %>%
  mutate(data = map(p_year, comp_fun)) %>%
  unnest(data) %>%
  full_join(nlsy_out_stp)
## # A tibble: 553,440 x 6
##    p_year   SID name         future  past value
##     <dbl> <dbl> <chr>         <dbl> <dbl> <dbl>
##  1   2006   201 chldbrth         NA    NA    NA
##  2   2006   201 chldmvout        NA    NA    NA
##  3   2006   201 crim              0    NA     0
##  4   2006   201 divorced         NA    NA    NA
##  5   2006   201 frstjob          NA    NA    NA
##  6   2006   201 mntlhlthevnt     NA    NA    NA
##  7   2006   201 mortality         0     0     0
##  8   2006   201 physhlthevnt     NA    NA    NA
##  9   2006   201 separated        NA    NA    NA
## 10   2006   201 unemployed       NA    NA    NA
## # … with 553,430 more rows

2.9.5 Covariates

nlsy_match <- nlsy_pers %>%
  ungroup() %>%
  mutate(year = as.numeric(year)) %>%
  spread(name, value) %>% 
  full_join(nlsy_match %>% rename(year = p_year)) %>%
  # physical health event
  left_join(nlsy_out %>% filter(name == "physhlthevnt") %>% select(year = p_year, SID, physhlthevnt = past) %>% distinct() %>% group_by(SID, year) %>% summarize(physhlthevnt = max(physhlthevnt)) %>% ungroup()) %>%
  distinct()

# age
nlsy_match <- nlsy_match %>%
  mutate(age = year - yearBrth, 
         BMI = weight/ (height/100)^2)

nlsy_match <- nlsy_match %>%
  group_by(year) %>%
  mutate_at(vars(education, numKids, married, smokes, alcohol), ~ifelse(is.na(.) & age < 19, 0, .)) 

nlsy_match <- nlsy_match %>%
  ungroup() %>%
  select(SID, dadEdu, momEdu) %>%
  distinct()  %>%
  gather(item,value, na.rm = T, -SID) %>%
  group_by(SID, item) %>%
  summarize(value = max(value, na.rm = T)) %>%
  spread(item,value) %>%
  full_join(nlsy_match %>% select(-momEdu, -dadEdu))

nlsy_out <- nlsy_out %>%
  select(-past, -future) %>%
  distinct() 

nlsy_SCA <- nlsy_match %>% rename(PhysFunc = physAct) %>%
  select(SID, p_year = year, one_of(c(unique(specifications$name), "momOccPrstg", "dadOccPrstg", "momEdu", "dadEdu"))) %>%
  group_by(SID) %>%
  mutate(parEdu = max(cbind(momEdu, dadEdu), na.rm = T),
         parOccPrstg = max(cbind(momOccPrstg, dadOccPrstg), na.rm = T)) %>%
  mutate_at(vars(parEdu, parOccPrstg), ~ifelse(is.infinite(.), NA, .)) %>%
  ungroup() %>% 
  select(-momEdu, -dadEdu, -momOccPrstg, -dadOccPrstg)

unique(specifications$name)[!unique(specifications$name) %in% colnames(nlsy_match)]
## # A tibble: 34,590 x 20
##      SID p_year   age education gender grsWages  race physhlthevnt SRhealth smokes alcohol exercise   BMI married numKids parDivorce PhysFunc religion
##    <dbl>  <dbl> <dbl>     <dbl>  <dbl>    <dbl> <dbl>        <dbl>    <dbl>  <dbl>   <dbl>    <dbl> <dbl>   <dbl>   <dbl>      <dbl>    <dbl>    <dbl>
##  1   201   2000     7         0      1   32964.    NA           NA     1         0       0       NA  22.5       0       0          0        0       NA
##  2   201   2006    13         0      1   34682.    NA           NA     1         0       0       NA  25.4       0       0          0        0        1
##  3   201   2008    15         0      1   32837.    NA           NA     1         0       0       NA  25.4       0       0          0        0        1
##  4   202   2000     6         0      1   32964.    NA           NA     1         0       0       NA  22.5       0       0          0        0       NA
##  5   202   2006    12         0      1   34682.    NA           NA     1         0       0       NA  33.8       0       0          0        0       NA
##  6   202   2008    14         0      1   32837.    NA           NA     1         0       0       NA  33.8       0       0          0        0       NA
##  7   301   2008    27         0      1   29323.     2           NA     2         0      NA        4  21.0       0      NA          0        0        1
##  8   301   2000    19         0      1   28167.     2           NA     1         0      NA       NA  18.6       0      NA          0        0        1
##  9   301   2006    25         0      1   27278.     2           NA     1.67      0      NA       NA  18.6       0      NA          0        0        1
## 10   302   2000    17         0      1   27311.     2           NA     2         1       1       NA  20.6       0       0          0        0        1
## # … with 34,580 more rows, and 2 more variables: parEdu <dbl>, parOccPrstg <dbl>

2.9.6 Imputation

# short helper functions to (1) identify and create factors, save col names of factors, remove columns with all missing values, and setup amelia
factor_fun <- function(x){if(is.numeric(x)){diff(range(x, na.rm = T)) %in% 1:2 & length(unique(x)) <= 4} else{F}}
not_all_na <- function(x) any(!is.na(x))
not_all_id <- function(x) if(is.numeric(x)) sd(x, na.rm = T) != 0
mice_fun <- function(df){
  mice(df, m = 5, maxit=5, printFlag=TRUE, method = "cart")
  }
nlsy_match_imp2 <- nlsy_match %>%
  group_by(SID, year) %>%
  mutate(parOccPrstg = max(cbind(momOccPrstg, dadOccPrstg), na.rm = T)) %>%
  select(-momOccPrstg, -dadOccPrstg) %>%
  ungroup() %>%
  mutate_all(~ifelse(is.infinite(.), NA, .)) %>%
  filter(!is.na(year) & !is.na(SID)) %>%
  group_by(year) %>% 
  nest() %>%
  ungroup() %>%
  mutate(data = map(data, ~(.) %>% select_if(not_all_na) %>% select_if(not_all_id)),
         data = map(data, ~(.) %>% mutate_if(factor_fun, as.factor)),
         imp = map(data, mice_fun))
beepr::beep(sound = 8)

nlsy_match_imp <- nlsy_match_imp %>%
  mutate(imp_df = map(imp, ~mice::complete(., "long")),
         imp_df = map(imp_df, ~(.) %>% rename(PhysFunc = physAct)),
         imp_df = map(imp_df, ~(.) %>% 
    group_by(SID) %>%
    mutate(parEdu = max(cbind(momEdu, dadEdu), na.rm = T)) %>%
    select(-momEdu, -dadEdu) %>%
    ungroup()),
         imp_sca = map(imp_df, ~(.) %>% filter(.imp == 1) %>% select(SID,  one_of(unique(specifications$name)), hlthProbs) %>% group_by(SID) %>% mutate(physhlthevnt = max(cbind(physhlthevnt, hlthProbs))-1) %>% ungroup() %>% select(-hlthProbs)))

nlsy_match_imp_long <- nlsy_match_imp %>%
  mutate(imp_df = map(imp_df, ~(.) %>% mutate_if(is.factor, ~as.numeric(as.character(.))))) %>%
  select(p_year = year, imp_df) %>%
  mutate(imp_df = map(imp_df, ~if(any(colnames(.) == "relSat")){(.) %>% mutate(relSat = as.numeric(as.character(relSat)))})) %>%
  unnest(imp_df)

nlsy_SCA_imp <- nlsy_match_imp %>%
  select(p_year = year, imp_sca) %>%
  unnest(imp_sca) %>%
  left_join(nlsy_SCA %>% select(p_year, SID,
        colnames(nlsy_SCA)[!colnames(nlsy_SCA) %in% colnames(.)])) %>%
  left_join(nlsy_match %>% select(SID, p_year = year)) %>%
  mutate(physhlthevnt = factor(physhlthevnt))
unique(specifications$name)[!unique(specifications$name) %in% colnames(nlsy_SCA_imp)]

2.10 SHP

The Swiss Household Panel Study (SHP; Voorpostel et al., 2016) “Living in Switzerland” is an ongoing longitudinal study of households in Switzerland. These data are available online, through application from https://forsbase.unil.ch/project/study-public-overview/15632/0/.

Participants were recruited from more than 10,000 individuals from the households whose members represent the non-institutional resident population of Switzerland. Data have been collected annually since 1999. The latest data release includes data up to 2018. On average, about 5,000 individuals are sampled at each wave. More documentation can be found at LINK, but, in short, the SHP is a nationally representative sample of Swiss citizens.

Sample sizes vary by year, ranging from 5,220 (2003) to 13,295 (2013). This provides 99/% power to detect a zero-order correlation effect size of ~.06, two tailed at alpha .05.

2.10.1 Load Data

## # A tibble: 1,493 x 18
##    study dataset category name   itemname  wave   year new_itemname orig_itemname stem  description scale reverse_code recode  mini  maxi comp_rule ...18
##    <chr> <chr>   <chr>    <chr>  <chr>     <chr> <dbl> <chr>        <chr>         <chr> <chr>       <chr> <chr>        <chr>  <dbl> <dbl> <chr>     <lgl>
##  1 shp   P       match    dadAl… dadalive  15     2013 shp__match_… P13N82        N82   "Father: a… "-8\… <NA>         ifels…    NA    NA max       NA   
##  2 shp   P       match    dadAl… dadalive  18     2016 shp__match_… P16N82        N82   "Father: a… "-8\… <NA>         ifels…    NA    NA max       NA   
##  3 shp   P       match    dadEdu dadhighe… 0         0 shp__match_… P$$O17        O17   "Highest l… "-8 … <NA>         ifels…    NA    NA max       NA   
##  4 shp   P       match    degPh… hlthimpe… 1      1999 shp__match_… P99C08        C08   "Health im… "-8\… <NA>         ifels…    NA    NA max       NA   
##  5 shp   P       match    degPh… hlthimpe… 2      2000 shp__match_… P00C08        C08   "Health im… "-8\… <NA>         ifels…    NA    NA max       NA   
##  6 shp   P       match    degPh… hlthimpe… 3      2001 shp__match_… P01C08        C08   "Health im… "-8\… <NA>         ifels…    NA    NA max       NA   
##  7 shp   P       match    degPh… hlthimpe… 4      2002 shp__match_… P02C08        C08   "Health im… "-8\… <NA>         ifels…    NA    NA max       NA   
##  8 shp   P       match    degPh… hlthimpe… 5      2003 shp__match_… P03C08        C08   "Health im… "-8\… <NA>         ifels…    NA    NA max       NA   
##  9 shp   P       match    degPh… hlthimpe… 6      2004 shp__match_… P04C08        C08   "Health im… "-8\… <NA>         ifels…    NA    NA max       NA   
## 10 shp   P       match    degPh… hlthimpe… 7      2005 shp__match_… P05C08        C08   "Health im… "-8\… <NA>         ifels…    NA    NA max       NA   
## # … with 1,483 more rows

2.10.2 Matching Variables

rename_fun <- function(cb, var){
  print(var)
  old.names <- unique((shp_codebook %>% filter(name == var))$orig_itemname)
  df <- shp %>% 
    select(SID = IDPERS, one_of(old.names)) %>%
    gather(key = orig_itemname, value = value, -SID, na.rm=T)
  if(length(old.names) > 1){
      df %>% left_join(cb %>% select(itemname, year, orig_itemname, reverse_code:comp_rule))
  } else {
    df %>% left_join(cb %>% select(-(itemname:year),  -new_itemname, -dataset) %>% distinct()) %>% mutate(year = "0")
  }
}

# rename variables   
shp_match <- shp_codebook %>%
  filter(category == "match" & year != 0) %>%
  group_by(name) %>% 
  nest() %>%
  ungroup() %>%
  # filter(name != "yearBrth") %>%
  mutate(data = map2(data, name, rename_fun))  %>%
  unnest(data) %>%
  distinct() 

# single time point variables 
old.names <- (shp_codebook %>% filter(category == "match" & year == "0"))$orig_itemname 
shp_match_stp <- shp %>%
  select(SID = IDPERS, one_of(old.names)) %>%
  gather(key = orig_itemname, value = value, -SID) %>%
  left_join(shp_codebook %>% select(name:year, orig_itemname, recode, comp_rule)) 

# recoding
recode_fun <- function(rule, y, p_year){
  yearBrth <- y$yearBrth
  x <- y$value
  if(!is.na(rule)){y$value <- eval(parse(text = rule))}
  return(y)
}

yrBrth <- shp_match_stp %>% filter(name == "yearBrth") %>%
  mutate(yearBrth = ifelse(value < 0, NA, value)) %>%
  select(SID, yearBrth) 

shp_match <- shp_match %>%
  left_join(yrBrth) %>%
  group_by(recode, year) %>%
  nest() %>%
  ungroup() %>%
  mutate(recode = mapvalues(recode, "ifelse(is.na(x) | x < 0), NA, ifelse(x > 0, 1, 0))", "ifelse(is.na(x) | x < 0, NA, ifelse(x > 0, 1, 0))"),
         recode = mapvalues(recode, "ifelse(is.na(x) | x < 0, ifelse(x %in% 7:14, 0, 1))", "ifelse(is.na(x) | x < 0, NA, ifelse(x %in% 7:14, 0, 1))"),
         data = pmap(list(recode, data, year), recode_fun)) %>%
  unnest(data)

shp_waves <- p_waves %>% filter(Study == "SHP") %>% select(Used) %>% distinct()

shp_match_stp <- shp_match_stp %>%
  full_join(crossing(p_year = shp_waves$Used, year = 0)) %>%
  left_join(yrBrth) %>%
  group_by(recode, p_year, year) %>%
  nest() %>% 
  ungroup() %>%
  mutate(data = pmap(list(recode, data, p_year), recode_fun)) %>%
  unnest(data)

# reverse code 
shp_match <- shp_match %>%
  mutate(value = ifelse(reverse_code == "no" | is.na(reverse_code), value, 
                        reverse.code(-1, value, mini = mini, maxi = maxi)))

fun_call <- function(x, rule){
    switch(rule,
           average = mean(x, na.rm = T),
           mode = Mode(x)[1],
           sum = sum(x, na.rm = T),
           skip = unique(x)[1],
           max = max(x, na.rm = T),
           min = min(x, na.rm = T),
           multiply = prod(x, na.rm = T))
  }

# compositing within years
year_comp_fun <- function(df, rule){
  df %>%
    # group by person and item (collapse across age)
    group_by(SID, yearBrth, name, year) %>% 
    summarize(value = fun_call(value, rule)) %>%
    ungroup() %>% 
    mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
}

shp_match <- shp_match %>%
  filter(year <= max(shp_waves$Used)) %>%
  group_by(comp_rule) %>%
  nest() %>%
  ungroup() %>%
  mutate(comp_rule = ifelse(is.na(comp_rule), "skip", comp_rule),
         data = map2(data, comp_rule, year_comp_fun)) %>%
  unnest(data)

comp_fun <- function(rule, p_year){
  shp_match %>%
    filter(year <= p_year  & comp_rule == rule) %>%
    left_join(
      shp_match_stp %>% 
        select(SID, yearBrth, p_year, name, value, comp_rule) %>%
        filter(comp_rule == rule & p_year == p_year)
      ) %>%
    group_by(SID, yearBrth, name) %>%
    summarize(value = fun_call(value, rule)) %>%
    ungroup()
}

shp_match <- crossing(
  p_year = shp_waves$Used, 
  comp_rule = unique(shp_match$comp_rule)
  ) %>%
  mutate(comp_rule = ifelse(is.na(comp_rule), "skip", comp_rule),
         data = map2(comp_rule, p_year, comp_fun)) %>%
  unnest(data) %>%
  mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value)) %>%
  select(-comp_rule) %>%
  full_join(shp_match_stp %>% select(SID, yearBrth, name, value, p_year)) %>%
  filter(!is.na(value)) %>%
  distinct() %>%
  spread(name,value)
## # A tibble: 139,867 x 56
##     year   SID     A     C   DEP     E   LOC     N  `NA`     O    OP    PA    SE    SS   SWL ageMarried dadEdu dadOccPrstg degPhysFunc disability
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>      <dbl>  <dbl>       <dbl>       <dbl>      <dbl>
##  1  2000  4101    NA    NA     0    NA    NA    NA    NA    NA     9    NA    NA  9       10         24      0        73.9           0          0
##  2  2000  4102    NA    NA     5    NA    NA    NA    NA    NA     7    NA    NA 10       10         21      0        36.3           8          1
##  3  2000  5101    NA    NA     1    NA    NA    NA    NA    NA    10    NA    NA  6.5      8         38      0        39.6           1          0
##  4  2000  6101    NA    NA     8    NA    NA    NA    NA    NA     0    NA    NA  4.75     5         NA      0        38.0          10          1
##  5  2000 13102    NA    NA     3    NA    NA    NA    NA    NA     7    NA    NA  8.7      9         15      0        51.6           0          0
##  6  2000 14101    NA    NA     0    NA    NA    NA    NA    NA     8    NA    NA  5.88     8         24      0        52.1           3          1
##  7  2000 26101    NA    NA     2    NA    NA    NA    NA    NA     0    NA    NA  8.12     5         30      0        43.4           6          1
##  8  2000 26102    NA    NA     0    NA    NA    NA    NA    NA     9    NA    NA  7.5     10         25      0        44.2           0          1
##  9  2000 27101    NA    NA     0    NA    NA    NA    NA    NA     9    NA    NA  7.5      8         17      0        NA             6          0
## 10  2000 27102    NA    NA     1    NA    NA    NA    NA    NA     0    NA    NA  7        8         24      0        52.5           5          0
## # … with 139,857 more rows, and 36 more variables: drVisits <dbl>, education <dbl>, employed <dbl>, exercise <dbl>, gender <dbl>, grsWages <dbl>,
## #   height <dbl>, HHID <dbl>, HHsize <dbl>, hlthcare <dbl>, imprvHealth <dbl>, loneliness <dbl>, married <dbl>, meaning <dbl>, momEdu <dbl>,
## #   momOccPrstg <dbl>, numKids <dbl>, parDivorce <dbl>, physAct <dbl>, PhysFunc <dbl>, race <dbl>, religion <dbl>, religiosity <dbl>, satFinances <dbl>,
## #   satFncChng <dbl>, satHealth <dbl>, satHH <dbl>, satSchl <dbl>, SRhealth <dbl>, unemployBen <dbl>, weight <dbl>, welfare <dbl>, yearBrth <dbl>,
## #   physhlthevnt <dbl>, age <dbl>, BMI <dbl>

2.10.3 Personality Variables

shp_pers <- shp_codebook %>%
  filter(category == "pers") %>%
  group_by(name) %>% 
  nest() %>%
  ungroup() %>%
  filter(name != "yearBrth") %>%
  mutate(data = map2(data, name, rename_fun))  %>%
  unnest(data) %>%
  distinct() %>%
  left_join(p_waves %>% filter(Study == "SHP") %>% select(name = p_item, Used)) %>%
  filter(year %in% Used)

recode_fun <- function(rule, y){
  x <- y$value
  if(!is.na(rule)){y$value <- eval(parse(text = rule))}
  return(y)
}

# recode 
shp_pers <- shp_pers %>%
  select(-orig_itemname) %>%
  group_by(recode) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = map2(recode, data, recode_fun)) %>%
  unnest(data) 

# reverse code 
shp_pers <- shp_pers %>%
  mutate(value = ifelse(tolower(reverse_code) == "no" | is.na(reverse_code), 
                        value, reverse.code(-1, value, mini = mini, maxi = maxi)))

# alpha's
shp_alpha <- shp_pers %>%
  select(name, itemname, year, SID, value, comp_rule) %>%
  group_by(name, year, SID, comp_rule, itemname) %>%
  summarize(value = mean(value, na.rm = T)) %>%
  group_by(name, year, comp_rule) %>%
  nest() %>%
  mutate(data = map(data, ~(.) %>% pivot_wider(names_from = itemname, values_from = value, values_fn = list(value = mean))),
         alpha = map(data, possibly(~psych::alpha((.) %>% select(-SID)), NA_real_)))

# create composites
comp_fun <- function(df, rule){
  df %>%
    group_by(SID) %>%
    summarize(value = fun_call(value, rule)) %>%
    ungroup()
}

# create composites
shp_pers <- shp_pers %>%
  group_by(name, comp_rule, year) %>%
  nest() %>%
  ungroup() %>%
  mutate(comp_rule = ifelse(is.na(comp_rule), "average", comp_rule),
         data = map2(data, comp_rule, comp_fun)) %>%
  unnest()
## # A tibble: 187,730 x 5
##    name   year comp_rule   SID value
##    <chr> <dbl> <chr>     <dbl> <dbl>
##  1 A      2009 average    4101   7  
##  2 A      2009 average    4102   5  
##  3 A      2009 average    4103   1.5
##  4 A      2009 average    4104   5  
##  5 A      2009 average    5101   5.5
##  6 A      2009 average    5104   5  
##  7 A      2009 average   21101   7.5
##  8 A      2009 average   26101  10  
##  9 A      2009 average   27101   6  
## 10 A      2009 average   27102   5.5
## # … with 187,720 more rows

2.10.4 Outcome Variables

shp_out <- shp_codebook %>%
  filter(category == "out" & year != "0") %>%
  group_by(name) %>% 
  nest() %>%
  ungroup() %>%
  mutate(data = map2(data, name, rename_fun))  %>%
  unnest(data) %>%
  distinct() %>%
  full_join(crossing(p_year = shp_waves$Used, name = unique((.)$name))) 

old.names <- (shp_codebook %>% filter(category == "out" & year == "0"))$orig_itemname 
shp_out_stp <- shp %>%
  select(SID = IDPERS, one_of(old.names)) %>%
  gather(key = orig_itemname, value = value, -SID) %>%
  left_join(shp_codebook %>% select(name:year, orig_itemname, recode, comp_rule))

# recode
recode_fun <- function(rule, y, p_year, year){
  yearBrth <- y$yearBrth
  x <- y$value
  if(!is.na(rule)){y$value <- eval(parse(text = rule))}
  return(y)
}

shp_out <- shp_out %>%
  left_join(yrBrth) %>%
  select(-orig_itemname) %>%
  group_by(recode, year, p_year) %>%
  nest() %>%
  ungroup() %>%
  mutate(year = as.numeric(year),
         data = pmap(list(recode, data, p_year, year), recode_fun)) %>%
  unnest(data) %>%
  distinct()

shp_waves <- p_waves %>% filter(Study == "SHP") %>% select(Used) %>% distinct()

shp_out_stp <- shp_out_stp %>%
  full_join(crossing(p_year = shp_waves$Used, year = 0)) %>%
  group_by(recode, p_year, year) %>%
  nest() %>% 
  ungroup() %>%
  mutate(data = pmap(list(recode, data, p_year), recode_fun)) %>%
  unnest(data) %>%
  left_join(yrBrth)

# composite within years
shp_out <- shp_out %>%
  select(year, p_year:itemname, yearBrth) %>%
  distinct() %>%
  # filter(year <= max(gsoep_waves$Used)) %>%
  group_by(SID, name, year, yearBrth, p_year) %>%
  summarize(value = max(value, na.rm = T)) %>% 
  ungroup() %>%
  mutate(value = ifelse(is.nan(value)|is.infinite(value), NA, value))

shp_out <- shp_out %>% 
  filter(name == "chldmvout") %>%
  group_by(SID, p_year) %>%
  mutate(max = max(value, na.rm = T)) %>%
  ungroup() %>%
  mutate(value = ifelse(value < max, 1, 0)) %>%
  select(-max) %>%
  full_join(shp_out %>% filter(name != "chldmvout"))

# composite across years
comp_fun <- function(P_year){
  shp_out %>%
    mutate(group = ifelse(year > P_year, "future", "past")) %>%
    group_by(SID, name, group) %>%
    summarize(value = max(value, na.rm = T)) %>%
    ungroup() %>%
    mutate(value = ifelse(is.infinite(value), NA, value)) %>%
    pivot_wider(names_from = group, values_from = value) %>%
    group_by(SID, name) %>%
    mutate(value = ifelse(is.na(past) | (past == 0 & !is.na(future)), future,
                   ifelse(past == 0 & is.na(future), past, 
                   ifelse(past == 1, NA, NA)))) 
}

shp_out <- tibble(p_year = shp_waves$Used) %>%
  mutate(data = map(p_year, comp_fun)) %>%
  unnest(data) %>%
  distinct() %>%
  full_join(shp_out_stp %>% select(p_year, name, SID, value)) %>%
  distinct() %>%  left_join(yrBrth)
## # A tibble: 1,323,120 x 7
##    p_year   SID name         future  past value yearBrth
##     <dbl> <dbl> <chr>         <dbl> <dbl> <dbl>    <dbl>
##  1   2009  4101 chldbrth          1     1    NA     1965
##  2   2009  4101 chldmvout         1     1    NA     1965
##  3   2009  4101 divorced          0     0     0     1965
##  4   2009  4101 edu               1     0     1     1965
##  5   2009  4101 frstjob           0     0     0     1965
##  6   2009  4101 married           0     0     0     1965
##  7   2009  4101 mortality         0     0     0     1965
##  8   2009  4101 mvInPrtnr         1     1    NA     1965
##  9   2009  4101 physhlthevnt      0     0     0     1965
## 10   2009  4101 vlntred           0     1    NA     1965
## # … with 1,323,110 more rows

2.10.5 Covariates

## # A tibble: 139,867 x 18
##      SID p_year   age education gender grsWages  race physhlthevnt SRhealth exercise   BMI married numKids parDivorce PhysFunc religion parEdu
##    <dbl>  <dbl> <dbl>     <dbl>  <dbl>    <dbl> <dbl>        <dbl>    <dbl>    <dbl> <dbl>   <dbl>   <dbl>      <dbl>    <dbl>    <dbl>  <dbl>
##  1  4101   2000    35         0      0    57750     0            0      1          1    NA       1       3          0        0        1      0
##  2  4102   2000    32         0      1    57750     0            1      3          2    NA       1       3          0        1        1      0
##  3  5101   2000    39         1      0    86700     0            1      2          2    NA       1       2          0        1        0      0
##  4  6101   2000    36         0      1    49400     0            0      3.5        0    NA       0       0          0        1        1      0
##  5 13102   2000    27         0      1    34270     0            0      2          2    NA       0       2          0        1        1      0
##  6 14101   2000    66         0      0    72130     0            0      2          2    NA       1       0          0        1        1      0
##  7 26101   2000    78         0      1    57400     0            0      3          4    NA       1       0          0        1        1      0
##  8 26102   2000    75         0      0    57400     0            0      2          7    NA       1       0          0        1        1      0
##  9 27101   2000    32         0      1    37050     0            0      3          7    NA       1       3          0        1        1      0
## 10 27102   2000    32         0      0    37050     0            1      1          3    NA       1       3          0        1        1      0
## # … with 139,857 more rows, and 1 more variable: parOccPrstg <dbl>

2.10.6 Imputation

# short helper functions to (1) identify and create factors, save col names of factors, remove columns with all missing values, and setup amelia
factor_fun <- function(x){if(is.numeric(x)){diff(range(x, na.rm = T)) %in% 1:2 & length(unique(x)) <= 4} else{F}}
not_all_id <- function(x) if(is.numeric(x)) sd(x, na.rm = T) != 0
mice_fun <- function(df){
  mice(df, m = 5, maxit=5, method = "cart", printFlag=TRUE)
  }
shp_match_imp <- shp_match %>%
  rename(NegAff = `NA`) %>%
  group_by(SID, year) %>%
  mutate(parOccPrstg = max(cbind(momOccPrstg, dadOccPrstg), na.rm = T)) %>%
  select(-momOccPrstg, -dadOccPrstg) %>%
  ungroup() %>%
  mutate_all(~ifelse(is.infinite(.), NA, .)) %>%
  filter(!is.na(year) & !is.na(SID)) %>%
  group_by(year) %>% 
  nest() %>%
  ungroup() %>%
  mutate(data = map(data, ~(.) %>% select_if(not_all_na) %>% select_if(not_all_id)),
         data = map(data, ~(.) %>% mutate_if(factor_fun, as.factor)),
         imp = map(data, mice_fun))
beepr::beep(sound = 8)

shp_match_imp <- shp_match_imp %>%
  mutate(imp_df = map(imp, ~mice::complete(., "long")),
         imp_df = map(imp_df, ~(.) %>% 
    group_by(SID) %>%
    mutate(parEdu = max(cbind(momEdu, dadEdu), na.rm = T)) %>%
    select(-momEdu, -dadEdu) %>%
    ungroup()),
         imp_sca = map(imp_df, ~(.) %>% filter(.imp == 1) %>% select(SID,  one_of(unique(specifications$name)))))

shp_match_imp_long <- shp_match_imp %>%
  mutate(imp_df = map(imp_df, ~(.) %>% mutate_if(is.factor, ~as.numeric(as.factor(.))))) %>%
  select(p_year = year, imp_df) %>%
  mutate(imp_df = map(imp_df, ~if(any(colnames(.) == "relSat")){(.) %>% mutate(relSat = as.numeric(as.character(relSat))); (.)}else{(.)})) %>%
  unnest(imp_df)

shp_SCA_imp <- shp_match_imp %>%
  select(p_year = year, imp_sca) %>%
  unnest(imp_sca) %>%
  left_join(shp_SCA %>% select(p_year=year, SID,
        colnames(shp_SCA)[!colnames(shp_SCA) %in% colnames(.)])) 
shp_SCA_imp <- shp_SCA_imp %>%
  select(-BMI) %>%
  left_join(shp_SCA_imp %>%
    select(SID, BMI) %>%
    distinct() %>%
    gather(item, value, -SID, na.rm =  T) %>%
    group_by(SID,item) %>%
    summarize(value = mean(value, na.rm = T)) %>%
    spread(item,value))
unique(specifications$name)[!unique(specifications$name) %in% colnames(shp_SCA_imp)]

2.11 WLS

The Wisconsin Longitudinal Study (WLS) is an ongoing longitudinal study of individuals who graduated from Wisconsin high schools in 1957 and were born between 1937 and 1940 as well as their siblings.

Graduates were randomly recruited from Wisconsin high schools in 1957 and born between 1937 and 1940. In 1977, at least one sibling of the original graduates from 2,100 families were also invited to participate in the study. As such, the study is representative of older, white Americans who have at least a high school education. Graduate data have been collected in in 1957, 1964, 1975, 1992, 2004, and 2011, and sibling data have been collected in 1977, 1994, 2005, and 2011. Personality data were initially collected in 1992 for graduates and 1994 for siblings. More documentation can be found at https://www.ssc.wisc.edu/wlsresearch/.

Sample sizes vary by wave, from 9,681 (2011) to 10,317 (1957). This provides 99/% power to detect zero-order correlation effect sizes of ~.06, two-tailed at alpha .05.

2.11.1 Load Data

## # A tibble: 834 x 19
##    study dataset category name  itemname year  new_itemname orig_itemname description scale reverse_code recode  mini  maxi comp_rule ...16 ...17 ...18
##    <chr> <chr>   <chr>    <chr> <chr>    <chr> <chr>        <chr>         <chr>       <chr> <chr>        <chr>  <dbl> <dbl> <chr>     <chr> <chr> <chr>
##  1 wlss  1992 -… match    ageF… ageFrst… 1992  wlss__match… ru020re       What age w… ".\t… <NA>         ifels…    NA    NA skip      <NA>  <NA>  <NA> 
##  2 wlss  1993 -… match    ageF… ageFrst… 1993  wlss__match… su020re       What age w… ".\t… <NA>         ifels…    NA    NA skip      <NA>  <NA>  <NA> 
##  3 wlss  1992 -… match    alco… alcohol  1992  wlss__match… ru026re       During the…  <NA> <NA>         ifels…    NA    NA skip      <NA>  <NA>  <NA> 
##  4 wlss  1993 -… match    alco… alcohol  1993  wlss__match… su026re       During the…  <NA> <NA>         ifels…    NA    NA skip      <NA>  <NA>  <NA> 
##  5 wlss  1992 -… match    anem… anemia   1992  wlss__match… mx083rer      Has a medi… ".\t… <NA>         ifels…    NA    NA skip      <NA>  <NA>  <NA> 
##  6 wlss  1993 -… match    anem… anemia   1992  wlss__match… nx103rer      Has a medi… "\".… <NA>         ifels…    NA    NA skip      <NA>  <NA>  <NA> 
##  7 wlss  1992 -… match    appr… apprncR… 1992  wlss__match… mx004rer      Compared w… ".\t… <NA>         ifels…    NA    NA skip      <NA>  <NA>  <NA> 
##  8 wlss  1993 -… match    appr… apprncR… 1993  wlss__match… nx004rer      Compared w… ".\t… <NA>         ifels…    NA    NA skip      <NA>  <NA>  <NA> 
##  9 wlss  1992-1… match    auth… afraidV… 1992  wlss__match… mn007rer      “I am not … "1 a… yes          ifels…     1     6 average   <NA>  <NA>  <NA> 
## 10 wlss  1992-1… match    auth… afraidV… 1992  wlss__match… np007rer      “I am not … "1 a… yes          ifels…     1     6 average   <NA>  <NA>  <NA> 
## # … with 824 more rows, and 1 more variable: sib <chr>

2.11.2 Matching Variables

rename_fun <- function(cb, var){
  old.names <- unique((wls_codebook %>% filter(name == var))$orig_itemname)
  df <- wls %>% 
    select(SID = idpub, HHID = familypub, one_of(old.names)) %>%
    gather(key = orig_itemname, value = value, -SID, -HHID, na.rm=T)
  if(length(old.names) > 1){
      df %>% left_join(cb %>% select(itemname, year, orig_itemname, reverse_code:comp_rule, sib))
  } else {
    df %>% left_join(cb %>% select(-(itemname:year),  -new_itemname, -dataset) %>% distinct()) %>% mutate(year = "0")
  }
}

# rename variables   
wls_match <- wls_codebook %>%
  filter(category == "match" & year != 0) %>%
  group_by(name) %>% 
  nest() %>%
  ungroup() %>%
  # filter(name != "yearBrth") %>%
  mutate(data = map2(data, name, rename_fun))  %>%
  unnest(data) %>%
  distinct() 

# single time point variables 
old.names <- (wls_codebook %>% filter(category == "match" & year == "0"))$orig_itemname 
wls_match_stp <- wls %>%
  select(SID = idpub, HHID = familypub, one_of(old.names)) %>%
  gather(key = orig_itemname, value = value, -SID, -HHID, na.rm=T) %>%
  left_join(wls_codebook %>% select(name:year, orig_itemname, recode, comp_rule, sib)) %>%
  unite(SID, SID, sib, sep = "")

# recoding
recode_fun <- function(rule, y, p_year){
  yearBrth <- y$yearBrth
  x <- y$value
  if(!is.na(rule)){y$value <- eval(parse(text = rule))}
  return(y)
}

yrBrth <- wls_match_stp %>% filter(name == "yearBrth") %>%
  mutate(yearBrth = ifelse(value < 0, NA, value +1900)) %>%
  select(SID, HHID, yearBrth) 

wls_match <- wls_match %>%
  unite(SID, SID, sib, sep = "") %>%
  left_join(yrBrth) %>%
  group_by(recode, year) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = pmap(list(recode, data, year), recode_fun)) %>%
  unnest(data)

wls_match_par <- wls_match %>% filter(grepl("mom", name) | 
        grepl("dad", name) | name == "parGrsWages") %>%
  group_by(year, HHID, name, comp_rule) %>%
  summarize(value = max(value, na.rm = T)) %>%
  ungroup() %>%
  mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value),
         comp_rule = ifelse(is.na(comp_rule), "skip", comp_rule)) %>%
  filter(!is.na(value)) %>%
  group_by(HHID, name) %>%
  summarize(value = fun_call(value, comp_rule[1])) %>%
  ungroup() %>%
  spread(name, value) %>%
  mutate(parGrsWages = parGrsWages*100)

wls_waves <- p_waves %>% filter(Study == "WLS") %>% select(Used) %>% distinct()

wls_match_stp <- wls_match_stp %>%
  full_join(crossing(p_year = wls_waves$Used, year = "0")) %>%
  left_join(yrBrth) %>%
  group_by(recode, p_year, year) %>%
  nest() %>% 
  ungroup() %>%
  mutate(data = pmap(list(recode, data, p_year), recode_fun)) %>%
  unnest(data) %>%
  filter(!is.na(value)) %>%
  mutate(comp_rule = ifelse(is.na(comp_rule), "skip", comp_rule)) %>%
  group_by(p_year, SID, HHID, name, yearBrth) %>%
  summarize(value = fun_call(value, comp_rule[1])) %>%
  ungroup() %>%
  mutate(value = ifelse(value < 0, NA, value)) %>%
  spread(name, value)

# reverse code 
wls_match <- wls_match %>%
  mutate(value = ifelse(reverse_code == "no" | is.na(reverse_code), value, 
                        reverse.code(-1, value, mini = mini, maxi = maxi)))

fun_call <- function(x, rule){
    switch(rule,
           average = mean(x, na.rm = T),
           mode = Mode(x)[1],
           sum = sum(x, na.rm = T),
           skip = unique(x)[1],
           max = max(x, na.rm = T),
           min = min(x, na.rm = T),
           multiply = prod(x, na.rm = T))
  }

# compositing within years
year_comp_fun <- function(df, rule){
  df %>%
    # group by person and item (collapse across age)
    group_by(SID, HHID, yearBrth, name, year) %>% 
    summarize(value = fun_call(value, rule)) %>%
    ungroup() %>% 
    mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
}

wls_match <- wls_match %>%
  filter(!(grepl("mom", name) | grepl("dad", name) | name == "parGrsWages")) %>%
  filter(year <= max(wls_waves$Used)) %>%
  group_by(comp_rule) %>%
  nest() %>%
  ungroup() %>%
  mutate(comp_rule = ifelse(is.na(comp_rule), "skip", comp_rule),
         data = map2(data, comp_rule, year_comp_fun)) %>%
  unnest(data)

comp_fun <- function(rule, p_year){
  wls_match %>%
    filter(year <= p_year  & comp_rule == rule) %>%
    group_by(SID, HHID, yearBrth, name) %>%
    summarize(value = fun_call(value, rule)) %>%
    ungroup()
}

wls_match <- crossing(
  p_year = wls_waves$Used, 
  comp_rule = unique(wls_match$comp_rule)
  ) %>%
  mutate(comp_rule = ifelse(is.na(comp_rule), "skip", comp_rule),
         data = map2(comp_rule, p_year, comp_fun)) %>%
  unnest(data) %>%
  mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value)) %>%
  select(-comp_rule) %>%
  pivot_wider(names_from = name, values_from = value, values_fn = list(value = max)) %>% full_join(wls_match_par) %>%
  full_join(wls_match_stp)
## # A tibble: 44,342 x 69
##     year SID       A     C   DEP     E    IQ    LOC     N   `NA`     O    OP    PA     SE   HHID yearBrth authonomy EnvMastery exercise grsWages
##    <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>  <dbl>  <dbl>    <dbl>     <dbl>      <dbl>    <dbl>    <dbl>
##  1  1992 9000…  4.2    5    1     4.20    NA   5.33  4.36  1.33    4.6    NA NA      4.67 1.21e6     1938      4.67       4.56      1.5   66000 
##  2  1992 9000…  6.4    6.4  0.55  6.8     NA   5.67  1.72  0       5.2    NA NA      5.44 1.20e6     1939      5.31       5.31      3.5   38500 
##  3  1992 9000…  6.2    4.8  0.7   3.8     NA   6     2.2   0.375   2.4    NA  6.75   3.86 1.20e6     1933     NA         NA         2.5      NA 
##  4  1992 9000… NA     NA   NA    NA       NA NaN    NA    NA      NA      NA NA    NaN    1.20e6     1938     NA         NA        NA     12500 
##  5  1992 9000…  4.4    5.8  0.25  4.6     NA   5.67  1.96  0       3.2    NA NA      6.09 1.21e6     1939      5.09       5.64      2.5   48800 
##  6  1992 9000… NA     NA   NA    NA        9  NA    NA    NA      NA      NA NA      5.5  1.21e6     1944     NA         NA        NA        NA 
##  7  1992 9000… NA     NA   NA    NA       NA   1    NA    NA      NA      NA NA      4.55 1.21e6     1938      4.27       4        NA     19451.
##  8  1992 9000…  5.4    6    1.05  4        4   3.5   2.92  0       4.4    NA  7      4.33 1.21e6     1941     NA         NA         4        NA 
##  9  1992 9000…  5.40   6.4  0.6   5.2     NA   5     3.16  0       5.2    NA NA      5.78 1.20e6     1939      5.22       5.44      3     65350 
## 10  1992 9000…  5.2    6.8  0.6   5.4     NA   6.33  3.4   0.333   5.8    NA NA      6.22 1.21e6     1939      5.33       6         3     44250 
## # … with 44,332 more rows, and 49 more variables: hospitalized <dbl>, PersGrowth <dbl>, Purpose <dbl>, hlthcare <dbl>, PhysFunc <dbl>, welfare <dbl>,
## #   ageMarried <dbl>, disability <dbl>, urban <dbl>, ageFrstDep <dbl>, alcohol <dbl>, anemia <dbl>, apprncRel10yrs <dbl>, backPain <dbl>, BMI <dbl>,
## #   depIntUnit <dbl>, depLngthUnit <dbl>, dizziness <dbl>, education <dbl>, employed <dbl>, everSmoked <dbl>, fatigue <dbl>, headaches <dbl>,
## #   height <dbl>, HHsize <dbl>, hlthRel10yrs <dbl>, hlthRelOthrs <dbl>, married <dbl>, numKids <dbl>, parDivorce <dbl>, religion <dbl>, satFam <dbl>,
## #   satJob <dbl>, sleepProb <dbl>, smokePacks <dbl>, smokes <dbl>, smokeYears <dbl>, SRhealth <dbl>, weight <dbl>, dadAlive <dbl>, dadEdu <dbl>,
## #   dadJob <dbl>, dadOccPrstg <dbl>, momAlive <dbl>, momEdu <dbl>, momOccPrstg <dbl>, parGrsWages <dbl>, gender <dbl>, physhlthevnt <dbl>

2.11.3 Personality Variables

wls_pers <- wls_codebook %>%
  filter(category == "pers") %>%
  group_by(name) %>% 
  nest() %>%
  ungroup() %>%
  filter(name != "yearBrth") %>%
  mutate(data = map2(data, name, rename_fun))  %>%
  unnest(data) %>%
  distinct() %>%
  mutate(year = ifelse(sib == "sib", as.numeric(year) - 1, year)) %>%
  left_join(p_waves %>% filter(Study == "WLS") %>% select(name = p_item, Used)) %>%
  filter(year %in% Used) %>%
  unite(SID, SID, sib, sep = "")

recode_fun <- function(rule, y){
  x <- y$value
  if(!is.na(rule)){y$value <- eval(parse(text = rule))}
  return(y)
}

# recode 
wls_pers <- wls_pers %>%
  select(-orig_itemname) %>%
  group_by(recode) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = map2(recode, data, recode_fun)) %>%
  unnest(data) 

# reverse code 
wls_pers <- wls_pers %>%
  mutate(value = ifelse(tolower(reverse_code) == "no" | is.na(reverse_code), 
                        value, reverse.code(-1, value, mini = mini, maxi = maxi)))

# alpha's
wls_alpha <- wls_pers %>%
  select(name, itemname, year, SID, value, comp_rule) %>%
  group_by(name, year, SID, comp_rule, itemname) %>%
  summarize(value = mean(value, na.rm = T)) %>%
  group_by(name, year, comp_rule) %>%
  nest() %>%
  mutate(data = map(data, ~(.) %>% pivot_wider(names_from = itemname, values_from = value, values_fn = list(value = mean))),
         alpha = map(data, possibly(~psych::alpha((.) %>% select(-SID)), NA_real_)))

# create composites
comp_fun <- function(df, rule){
  df %>%
    group_by(SID) %>%
    summarize(value = fun_call(value, rule)) %>%
    ungroup()
}

# create composites
wls_pers <- wls_pers %>%
  group_by(name, comp_rule, year) %>%
  nest() %>%
  ungroup() %>%
  mutate(comp_rule = ifelse(is.na(comp_rule), "average", comp_rule),
         data = map2(data, comp_rule, comp_fun)) %>%
  unnest()
## # A tibble: 243,434 x 5
##    name  year  comp_rule SID        value
##    <chr> <chr> <chr>     <chr>      <dbl>
##  1 A     1992  average   900021grad  4.2 
##  2 A     1992  average   900026grad  6.4 
##  3 A     1992  average   900026sib   6.2 
##  4 A     1992  average   900042grad  4.4 
##  5 A     1992  average   900043sib   5.4 
##  6 A     1992  average   900054grad  5.40
##  7 A     1992  average   900069grad  5.2 
##  8 A     1992  average   900074sib   6.8 
##  9 A     1992  average   900075grad  5.4 
## 10 A     1992  average   900078grad  6.6 
## # … with 243,424 more rows

2.11.4 Outcome Variables

wls_out <- wls_codebook %>%
  filter(category == "out" & year != "0") %>%
  group_by(name) %>% 
  nest() %>%
  ungroup() %>%
  mutate(data = map2(data, name, rename_fun))  %>%
  unnest(data) %>%
  distinct() %>%
  full_join(crossing(p_year = wls_waves$Used, name = unique((.)$name))) %>%
  unite(SID, SID, sib, sep = "")

old.names <- (wls_codebook %>% filter(category == "out" & year == "0"))$orig_itemname 
wls_out_stp <- wls %>%
  select(SID = idpub, HHID = familypub, one_of(old.names)) %>%
  gather(key = orig_itemname, value = value, -SID, -HHID, na.rm=T) %>%
  left_join(wls_codebook %>% select(name:year, orig_itemname, recode, comp_rule, sib)) %>%
  unite(SID, SID, sib, sep = "")

# recode
recode_fun <- function(rule, y, p_year, year){
  yearBrth <- y$yearBrth
  x <- y$value
  if(!is.na(rule)){y$value <- eval(parse(text = rule))}
  return(y)
}

wls_out <- wls_out %>%
  left_join(yrBrth) %>%
  select(-orig_itemname) %>%
  group_by(recode, year, p_year) %>%
  nest() %>%
  ungroup() %>%
  mutate(recode = str_replace_all(recode, "ifesle", "ifelse"),
         year = as.numeric(year),
         data = pmap(list(recode, data, p_year, year), recode_fun)) %>%
  unnest(data) %>%
  distinct()

wls_waves <- p_waves %>% filter(Study == "WLS") %>% select(Used) %>% distinct()

wls_out_stp <- wls_out_stp %>%
  full_join(crossing(p_year = wls_waves$Used, year = "0")) %>%
  group_by(recode, p_year, year) %>%
  nest() %>% 
  ungroup() %>%
  mutate(data = pmap(list(recode, data, p_year), recode_fun)) %>%
  unnest(data) %>%
  left_join(yrBrth)

# composite within years
wls_out <- wls_out %>%
  select(year, p_year:itemname, yearBrth) %>%
  distinct() %>%
  # filter(year <= max(gsoep_waves$Used)) %>%
  group_by(SID, name, HHID, year, yearBrth) %>%
  summarize(value = max(value, na.rm = T)) %>% 
  ungroup() %>%
  mutate(value = ifelse(is.nan(value)|is.infinite(value), NA, value))

# composite across years
comp_fun <- function(P_year){
  wls_out %>%
    mutate(group = ifelse(year > P_year, "future", "past")) %>%
    group_by(SID, HHID, name, group) %>%
    summarize(value = max(value, na.rm = T)) %>%
    ungroup() %>%
    mutate(value = ifelse(is.infinite(value), NA, value)) %>%
    pivot_wider(names_from = group, values_from = value) %>%
    group_by(SID, name) %>%
    mutate(value = ifelse(is.na(past) | (past == 0 & !is.na(future)), future,
                   ifelse(past == 0 & is.na(future), past, 
                   ifelse(past == 1, NA, NA)))) 
}

wls_out <- tibble(p_year = wls_waves$Used) %>%
  mutate(data = map(p_year, comp_fun)) %>%
  unnest(data) %>%
  full_join(wls_out_stp %>% select(p_year,name, SID, HHID, value, yearBrth)) %>%
  filter(name != "chldbrth")
## # A tibble: 347,140 x 8
##    p_year SID           HHID name        past future value yearBrth
##     <dbl> <chr>        <dbl> <chr>      <dbl>  <dbl> <dbl>    <dbl>
##  1   1992 900018grad 1201627 divorced       0     NA     0       NA
##  2   1992 900018grad 1201627 married       NA     NA    NA       NA
##  3   1992 900018grad 1201627 unemployed    NA     NA    NA       NA
##  4   1992 900018grad 1201627 vlntred       NA     NA    NA       NA
##  5   1992 900018sib  1201627 divorced       0     NA     0       NA
##  6   1992 900018sib  1201627 married       NA     NA    NA       NA
##  7   1992 900018sib  1201627 unemployed    NA     NA    NA       NA
##  8   1992 900018sib  1201627 vlntred       NA     NA    NA       NA
##  9   1992 900021grad 1205260 crim          NA      0     0       NA
## 10   1992 900021grad 1205260 divorced       0      1     1       NA
## # … with 347,130 more rows

2.11.5 Covariates

## # A tibble: 44,342 x 19
##    SID   p_year   age education gender grsWages physhlthevnt SRhealth smokes alcohol exercise   BMI married numKids parDivorce PhysFunc religion parEdu
##    <chr>  <dbl> <dbl>     <dbl>  <dbl>    <dbl>        <dbl>    <dbl>  <dbl> <fct>      <dbl> <dbl>   <dbl>   <dbl>      <dbl> <fct>       <dbl>  <dbl>
##  1 9000…   1992    54         2      0   66000             1        4     NA 1            1.5    29       1       4          0 0               1      0
##  2 9000…   1992    53         2      0   38500             0        5     NA 1            3.5    27       1       3          0 0               1      2
##  3 9000…   1992    59        NA      0      NA            NA       NA     NA <NA>         2.5    24      NA      NA         NA <NA>           NA      2
##  4 9000…   1992    54        NA      0   12500            NA       NA     NA <NA>        NA      NA       1       2          0 <NA>            1      0
##  5 9000…   1992    53        NA      0   48800             0        5      0 0            2.5    27       1       3          0 0               1      2
##  6 9000…   1992    48        NA      0      NA            NA       NA     NA <NA>        NA      NA      NA      NA         NA <NA>           NA      2
##  7 9000…   1992    54        NA      0   19451.           NA       NA     NA 1           NA      NA       1       2          0 <NA>            1      0
##  8 9000…   1992    51        NA      0      NA            NA       NA     NA <NA>         4      25      NA      NA         NA <NA>           NA      0
##  9 9000…   1992    53        NA      0   65350             0        5      0 1            3      27       1       3          0 0               1      0
## 10 9000…   1992    53         2      0   44250             0        5      0 1            3      25       1       3          0 0               0      1
## # … with 44,332 more rows, and 1 more variable: parOccPrstg <dbl>

2.11.6 Imputation

# short helper functions to (1) identify and create factors, save col names of factors, remove columns with all missing values, and setup amelia
factor_fun <- function(x){if(is.numeric(x)){diff(range(x, na.rm = T)) %in% 1:2 & length(unique(x)) <= 4} else{F}}
not_all_id <- function(x) if(is.numeric(x)) sd(x, na.rm = T) != 0
mice_fun <- function(df){
  mice(df, m = 5, maxit=5,  printFlag=TRUE)
  }
wls_match_imp <- wls_match %>%
  rename(NegAff = `NA`) %>%
  group_by(SID, year) %>%
  mutate(parOccPrstg = max(cbind(momOccPrstg, dadOccPrstg), na.rm = T)) %>%
  select(-momOccPrstg, -dadOccPrstg, -HHID) %>%
  ungroup() %>%
  mutate_all(~ifelse(is.infinite(.) | is.nan(.), NA, .)) %>%
  filter(!is.na(year) & !is.na(SID)) %>%
  group_by(year) %>% 
  nest() %>%
  ungroup() %>%
  mutate(data = map(data, ~(.) %>% select_if(not_all_na)),
         data = map(data, ~(.) %>% mutate_if(factor_fun, as.factor)),
         imp = map(data, mice_fun))
beepr::beep(sound = 8)

wls_match_imp <- wls_match_imp %>%
  mutate(imp_df = map(imp, ~mice::complete(., "long")),
         imp_df = map(imp_df, ~(.) %>% 
    group_by(SID) %>%
    mutate(parEdu = max(cbind(momEdu, dadEdu), na.rm = T)) %>%
    select(-momEdu, -dadEdu) %>%
    ungroup()),
         imp_sca = map(imp_df, ~(.) %>% filter(.imp == 1) %>% select(SID,  one_of(unique(specifications$name)))))

wls_match_imp_long <- wls_match_imp %>%
  select(p_year = year, imp_df) %>%
  mutate(imp_df = map(imp_df, ~(.) %>% mutate_if(is.factor, ~as.numeric(as.character(.)))),
         imp_df = map(imp_df, function(x){if(any(colnames(x) == "relSat")){(x) %>% mutate(relSat = as.numeric(as.character(relSat)))} else{x}})) %>%
  unnest(imp_df)

wls_SCA_imp <- wls_match_imp %>%
  select(p_year = year, imp_sca) %>%
  unnest(imp_sca) %>%
  left_join(wls_SCA %>% select(p_year=year, SID,
        colnames(wls_SCA)[!colnames(wls_SCA) %in% colnames(.)])) %>%
  mutate(race = factor(0)) %>%
  mutate_at(vars(alcohol, PhysFunc), ~factor(ifelse(. > 0, 1, .)))