Chapter 2 Data Cleaning

In this section, we will clean the data for each study. Raw data cannot be shared directly, but for each study, we include instrucitons on how to access the data.

Mode <- function(x) {
  ux <- unique(x)
  ux <- ux[!is.na(ux)]
  ux[which.max(tabulate(match(x, ux)))]
}
pomp <- function(x) (x - min(x, na.rm = T))/(max(x, na.rm = T) - min(x, na.rm = T))*10

2.1 Health and Retirement Study (HRS)

The Health and Retirement Study [HRS; (juster1995overview?)] is an ongoing longitudinal study of households in the United States. These data are available at https://hrs.isr.umich.edu by creating a free account.

Participants were recruited from more than 35,000 individuals from the financial households of individuals born between 1931 and 1941 in the US. Data have been collected biannually since 1992. The latest data release includes data up to 2016. On average, 10,000 individuals are sampled each wave More information on the HRS can be found at https://hrs.isr.umich.edu/documentation/survey-design, but, in short, the HRS is a nationally representative sample of adults over 50 in the US. It is critical to note that the HRS samples households of the original cohort and follows individuals and their spouses or partners until their death.

Sample size varies by year, ranging from approximately 7,500 (2014) to 15,500 (1992). (https://hrs.isr.umich.edu/sites/default/files/biblio/ResponseRates_2017.pdf). This provides 99% power to detect a zero-order correlation effect size of ~.04, two-tailed at alpha .05.

2.1.1 Load Data

hrs_read_fun <- function(year) {
  read_da <- function(da, dct, Year){
    print(paste(da, dct, year, sep = " "))
    data.file <- sprintf("%s/hrs/%s/%s", data_path, Year, da)
    # Set path to the dictionary file "*.DCT"
    dict.file <- sprintf("%s/hrs/%s/%s", data_path, Year, dct)
    # Read the dictionary file
    df.dict <- read.table(dict.file, skip = 1, fill = TRUE, stringsAsFactors = FALSE)
    # Set column names for dictionary dataframe
    colnames(df.dict) <- c("col.num","col.type","col.name","col.width","col.lbl")
    # Remove last row which only contains a closing }
    row <- which(df.dict$col.name == "HHID")
    df.dict <- df.dict[-nrow(df.dict),]
    if(row == 2){df.dict <- df.dict[-1,]}
    # Extract numeric value from column width field
    df.dict$col.width <- as.integer(sapply(df.dict$col.width, gsub, pattern = "[^0-9\\.]", replacement = ""))
    # Convert column types to format to be used with read_fwf function
    df.dict$col.type <- sapply(df.dict$col.type, function(x) ifelse(x %in% c("int","byte","long"), "i", ifelse(x == "float", "n", ifelse(x == "double", "d", "c"))))
    # Read the data file into a dataframe
    df <- read_fwf(file = data.file, fwf_widths(widths = df.dict$col.width, col_names = df.dict$col.name), col_types = paste(df.dict$col.type, collapse = ""))
    # Add column labels to headers
    attributes(df)$variable.labels <- df.dict$col.lbl
    old.names <- (hrs_codebook %>% filter(year == Year))$orig_itemname
    if(any(c("PN", "HHID") %in% colnames(df)) & any(old.names %in% colnames(df))){
    # if(any(c("PN", "HHID") %in% colnames(df))){
      df <- df %>%
      mutate(hhidpn = 1000*as.numeric(HHID) + as.numeric(PN)) %>%
      select(one_of(c("PN", "HHID")), one_of(old.names)) %>%
        distinct()
      # gather(key = item, value = value, -hhidpn)
    } else {df <- NA}
    return(df)
  }
    # Set path to the data file "*.DA"
  files <- list.files(sprintf("%s/hrs/%s", data_path, year))
  df2 <- tibble(
    da = files[grepl(".da",  files) | grepl(".DA", files)],
    dct = files[grepl(".dct", files) | grepl(".DCT", files)]
  ) %>%
    mutate(data = map2(da, dct, possibly(~read_da(.x, .y, year), NA_real_))) %>%
    filter(!is.na(data)) %>%
    select(-da, -dct) 
  if(nrow(df2) != 0){df2$data %>% reduce(full_join) %>% distinct()} else {NA}
}

hrs_codebook <- (codebook %>% filter(study == "HRS"))$codebook[[1]] %>% 
  mutate(orig_itemname = str_to_upper(orig_itemname)) %>%
  mutate_at(vars(orig_itemname, name, itemname), ~str_remove_all(., "[[:space:]]"))
hrs_codebook

## # A tibble: 702 × 17
##    study dataset category name  itemname  wave waveletter  year orig_itemname description scale reverse_code
##    <chr> <chr>   <chr>    <chr> <chr>    <dbl> <chr>      <dbl> <chr>         <chr>       <chr> <chr>       
##  1 hrs   Rand    cogniti… cogn… digitsB…     3 <NA>        1996 R3BWC20       BACKWARDS … "0.I… no          
##  2 hrs   Rand    cogniti… cogn… digitsB…     4 <NA>        1998 R4BWC20       BACKWARDS … "0.I… no          
##  3 hrs   Rand    cogniti… cogn… digitsB…     5 <NA>        2000 R5BWC20       BACKWARDS … "0.I… no          
##  4 hrs   Rand    cogniti… cogn… digitsB…     6 <NA>        2002 R6BWC20       BACKWARDS … "0.I… no          
##  5 hrs   Rand    cogniti… cogn… digitsB…     7 <NA>        2004 R7BWC20       BACKWARDS … "0.I… no          
##  6 hrs   Rand    cogniti… cogn… digitsB…     8 <NA>        2006 R8BWC20       BACKWARDS … "0.I… no          
##  7 hrs   Rand    cogniti… cogn… digitsB…     9 <NA>        2008 R9BWC20       BACKWARDS … "0.I… no          
##  8 hrs   Rand    cogniti… cogn… digitsB…    10 <NA>        2010 R10BWC20      BACKWARDS … "0.I… no          
##  9 hrs   Rand    cogniti… cogn… digitsB…    11 <NA>        2012 R11BWC20      BACKWARDS … "0.I… no          
## 10 hrs   Rand    cogniti… cogn… digitsB…    12 <NA>        2014 R12BWC20      BACKWARDS … "0.I… no          
## # ℹ 692 more rows
## # ℹ 5 more variables: recode <chr>, mini <dbl>, maxi <dbl>, comp_rule <chr>, long_rule <chr>

old.names <- unique(hrs_codebook$orig_itemname)
hrs.paq <- tibble(
  year = sprintf("%s/hrs", data_path) %>% list.files(., pattern = "^[0-9]")
  , data = map(year, hrs_read_fun)
  , names = map(data, colnames)
  ) %>%
  filter(!is.na(data))

old.names <- unique((hrs_codebook %>% filter(dataset == "Rand"))$orig_itemname)
hrs.rand <- sprintf("%s/hrs/randhrs1992_2016v1.sav", data_path) %>% 
  haven::read_sav(.) %>%
  haven::zap_labels(.) %>%
  select(SID = HHIDPN, one_of(old.names)) %>%
  gather(key = orig_itemname, value = value, -SID, na.rm = T)


hrs_long <- hrs.paq %>%
  mutate(data = map(data, ~(.) %>% 
      pivot_longer(cols = c(-HHID, -PN)
                   , names_to = "orig_itemname"
                   , values_to = "value"
                   , values_drop_na = TRUE))) %>%
  select(-names, -year) %>%
  unnest(data) %>%
  mutate(SID = 1000*as.numeric(HHID) + as.numeric(PN)) %>%
  select(-PN, -HHID) 

hrs_dem <- sprintf("%s/hrs/pdem_withvarnames.sas7bdat", data_path) %>%
  haven::read_sas(.) %>%
  select(SID = hhidpn, year = prediction_year, value = prob_dementia) %>%
  mutate(orig_itemname = "PROB_DEMENTIA")

hrs.subs <- unique(hrs_long$SID)[unique(hrs_long$SID) %in% unique(hrs.rand$SID)]
hrs_long <- hrs_long %>%
  bind_rows(hrs.rand %>% select(orig_itemname, value, SID)) %>%
  filter(SID %in% hrs.subs)

save(hrs.rand, hrs.paq, file = sprintf("%s/data/clean/hrs_raw.RData", local_path))
rm(list = c("hrs.paq", "hrs.rand"))

2.1.2 Recoding & Reverse Scoring

hrs_waves <- p_waves %>% filter(Study == "HRS") %>% select(Used) %>% distinct()

# join data with recoding info 
hrs_recode <- hrs_codebook %>%
  filter(category %in% c("pers", "outcome", "covariates", "cognition") & orig_itemname != "prob_dementia") %>%
  select(category, name, itemname, wave, year, orig_itemname, reverse_code:long_rule) %>%
  group_by(category, name) %>% 
  nest() %>%
  ungroup() %>%
  mutate(data = map(data, ~(.) %>% left_join(hrs_long))) 

hrs_recode <- hrs_dem %>%
  left_join(
    hrs_codebook %>% 
      select(category, name, itemname, orig_itemname, reverse_code:long_rule)
    ) %>%
  group_by(category, name) %>%
  nest() %>%
  ungroup() %>%
  bind_rows(hrs_recode)

# recode 
recode_fun <- function(rule, y, year, p_year){
  x <- y$value
  if(!is.na(rule)){y$value <- eval(parse(text = rule))}
  return(y)
}

hrs_recode <- hrs_recode %>% 
  mutate(data = map(data, ~(.) %>% 
    mutate(year = mapvalues(year, seq(2006, 2016, 2), rep(c(2006, 2010, 2014), each = 2)),
           p_year = 2006) %>%
    group_by(recode, year, p_year) %>%
    nest() %>%
    ungroup() %>%
    mutate(data = pmap(list(recode, data, year, p_year), recode_fun)) %>%
    unnest(data) %>%
    mutate(value = ifelse(value < 0 | is.nan(value) | is.infinite(value), NA, value))))


# reverse code 
hrs_recode <- hrs_recode %>%
  mutate(data = map(data, ~(.) %>%
    mutate(value = ifelse(reverse_code == "no" | is.na(reverse_code), value, 
                        reverse.code(-1, value, mini = mini, maxi = maxi)))))

fun_call <- function(x, rule){
    switch(rule,
           average = mean(x, na.rm = T),
           mode = Mode(x)[1],
           sum = sum(x, na.rm = T),
           skip = unique(x)[1],
           select = unique(x)[1],
           max = max(x, na.rm = T),
           min = min(x, na.rm = T))
  }

2.1.3 Covariates

# compositing within years
year_comp_fun <- function(df, rule){
  df %>%
    # group by person and item (collapse across age)
    group_by(SID, name, year, p_year, long_rule) %>% 
    summarize(value = fun_call(value, rule)) %>%
    ungroup() %>% 
    mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
}

# composite WITHIN years 
hrs_cov <- hrs_recode %>%
  filter(category == "covariates") %>%
  select(-category) %>%
  unnest(data) %>%
  mutate(comp_rule = ifelse(is.na(comp_rule), "skip", comp_rule)) %>%
  group_by(comp_rule) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = map2(data, comp_rule, year_comp_fun)) %>%
  unnest(data)

comp_fun <- function(d, rule, p_year){
  d %>%
    filter(year <= p_year) %>%
    group_by(SID, name) %>%
    summarize(value = fun_call(value, rule)) %>%
    ungroup() %>%
    distinct()
}

# composite ACROSS years
hrs_cov <- hrs_cov %>%
  mutate(long_rule = ifelse(is.na(long_rule), "skip", long_rule)) %>%
  group_by(p_year, long_rule) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = pmap(list(data, long_rule, p_year), comp_fun)) %>%
  unnest(data) %>%
  mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value)) %>%
  select(-long_rule) %>%
  spread(name, value) %>%
  filter(!is.na(SID))

hrs_cov

## # A tibble: 28,726 × 19
##    p_year      SID alcohol   BMI cancer diabetes education exercise gender heartProb height married  race
##     <dbl>    <dbl>   <dbl> <dbl>  <dbl>    <dbl>     <dbl>    <dbl>  <dbl>     <dbl>  <dbl>   <dbl> <dbl>
##  1   2006     3010       1  28.8      0        0        12     2.02      0         1   165.       1     0
##  2   2006     3020       1  28.8      1        0        16     1.77      1         0   165.       1     0
##  3   2006 10001010       0  23.5      0        0        12     1.88      0         0   183.       0     0
##  4   2006 10003030       0  28.5      0        0        16     1.25      1         1   157.       1     0
##  5   2006 10004010       1  27.0      1        0        16     1.75      0         0   185.       1     0
##  6   2006 10004040       1  27.0      0        0        12     2         1         0   165.       1     0
##  7   2006 10013010       0  28.2      0        1        12     1.57      0         1   177.       1     0
##  8   2006 10013040       1  24.5      0        0        13     1.5       1         0   160.       1     0
##  9   2006 10038010       1  23.3      0        0        16     1.75      0         1   177.       1     0
## 10   2006 10038040       1  23.3      0        0        16     1.78      1         0   170.       1     0
## # ℹ 28,716 more rows
## # ℹ 6 more variables: respDis <dbl>, smokes <dbl>, SRhealth <dbl>, stroke <dbl>, weight <dbl>,
## #   yearBrth <dbl>

2.1.4 Personality Variables

hrs_pers <- hrs_recode %>%
  filter(category == "pers") %>%
  select(-category) %>% 
  unnest(data) %>%
  filter(year == "2006" & !is.na(value)) %>%
  group_by(SID, p_year, year, name, itemname, comp_rule) %>%
  summarize(value = mean(value, na.rm = T)) %>%
  ungroup()

# alpha's
hrs_alpha <- hrs_pers %>%
  filter(!is.na(value)) %>%
  select(name, itemname, year, SID, value) %>%
  group_by(name, year) %>%
  nest() %>%
  mutate(data = map(data, ~(.) %>% distinct() %>% pivot_wider(names_from = itemname, values_from = value, values_fn = list(mean))),
         alpha = map(data, possibly(~psych::alpha((.) %>% select(-SID)), NA_real_)))

comp_fun <- function(df, rule){
  df %>%
    group_by(SID) %>%
    summarize(value = fun_call(value, rule)) %>%
    ungroup() %>%
    mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
}

# create composites
hrs_pers <- hrs_pers %>%
  group_by(name, comp_rule, year) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = map2(data, comp_rule, comp_fun)) %>%
  unnest(data) %>%
  select(-comp_rule)

hrs_pers

## # A tibble: 116,915 × 4
##     year name       SID value
##    <dbl> <chr>    <dbl> <dbl>
##  1  2006 A         3010   3  
##  2  2006 A         3020   3  
##  3  2006 A     10001010   4  
##  4  2006 A     10003030   3.4
##  5  2006 A     10004010   3.4
##  6  2006 A     10004040   3.6
##  7  2006 A     10013010   2  
##  8  2006 A     10013040   2.6
##  9  2006 A     10038010   3.4
## 10  2006 A     10038040   2.6
## # ℹ 116,905 more rows

2.1.5 Cognition Variables

hrs_cog <- hrs_recode %>%
  filter(category == "cognition") %>%
  select(-category) %>% 
  unnest(data) %>%
  filter(year == p_year) %>%
  filter(!is.na(value)) %>%
  group_by(name, itemname, year) %>%
  mutate(value = pomp(value)) %>%
  group_by(SID, name) %>%
  summarize(value = mean(value)) %>%
  ungroup() %>%
  pivot_wider(names_from = "name", values_from = "value")

hrs_cog

## # A tibble: 16,778 × 2
##         SID cognition
##       <dbl>     <dbl>
##  1     3010      7.5 
##  2     3020      8.08
##  3 10001010      8.75
##  4 10003030      6   
##  5 10004010      7.92
##  6 10004040      8.83
##  7 10013010      7.25
##  8 10013040      8.67
##  9 10038010      8.58
## 10 10038040      8.25
## # ℹ 16,768 more rows

2.1.6 Outcome Variables

# composite within years 
# compositing within years
hrs_out <- hrs_recode %>%
  filter(category == "outcome") %>%
  unnest(data) %>%
  group_by(SID, name, year, p_year) %>%
  summarize(value = max(value, na.rm = T)) %>%
  ungroup() %>%
  mutate(value = ifelse(is.nan(value) | is.infinite(value), NA, value),
         group = ifelse(year > p_year, "future", "past")) %>%
  filter(!is.na(value)) %>%
  group_by(SID, p_year, year, name, group) %>%
  mutate(value = ifelse(value < .5, 0, 1)
         , o_year = max(year[!is.na(value)])) %>%
  group_by(SID, p_year, name, group, o_year) %>%
  summarize(value = max(value, na.rm = T)) %>%
  ungroup() %>%
  mutate(value = ifelse(is.infinite(value), NA, value)) %>%
  pivot_wider(names_from = group, values_from = value) %>%
  group_by(SID, name, o_year) %>%
  mutate(value = ifelse(is.na(past) | (past == 0 & !is.na(future)), future,
                 ifelse(past == 0 & is.na(future), past, 
                 ifelse(past == 1, NA, NA)))) %>%
  group_by(SID, p_year, name) %>% 
  mutate(o_year = max(o_year)) %>% 
  group_by(SID, p_year, o_year, name) %>% 
  summarize(value = max(value, na.rm = T)) %>%
  ungroup() %>%
  mutate(value = ifelse(is.infinite(value), NA, value)) %>%
  ungroup()

hrs_out

## # A tibble: 27,187 × 5
##         SID p_year o_year name     value
##       <dbl>  <dbl>  <dbl> <chr>    <dbl>
##  1     3010   2006   2010 dementia     1
##  2     3020   2006   2014 dementia     0
##  3 10001010   2006   2014 dementia     0
##  4 10003030   2006   2014 dementia     0
##  5 10004010   2006   2010 dementia     0
##  6 10004040   2006   2014 dementia     0
##  7 10013010   2006   2014 dementia     1
##  8 10013040   2006   2014 dementia     0
##  9 10038010   2006   2014 dementia     0
## 10 10038040   2006   2014 dementia     0
## # ℹ 27,177 more rows

2.1.7 Combine Data

hrs_combined <- hrs_pers %>% 
  rename(Trait = name, p_value = value, p_year = year) %>%
  full_join(hrs_out %>% select(SID, Outcome = name, o_value = value, o_year)) %>%
  full_join(hrs_cov) %>%
  left_join(
    hrs_out %>%
      filter(name == "dementia") %>%
      pivot_wider(names_from = "name", values_from = "value")
    ) %>%
  full_join(hrs_cog) %>%
  filter(!is.na(p_value) & !is.na(o_value)) %>%
  mutate(age = p_year - yearBrth)

hrs_combined

## # A tibble: 112,097 × 27
##    p_year Trait      SID p_value Outcome  o_value o_year alcohol   BMI cancer diabetes education exercise
##     <dbl> <chr>    <dbl>   <dbl> <chr>      <dbl>  <dbl>   <dbl> <dbl>  <dbl>    <dbl>     <dbl>    <dbl>
##  1   2006 A         3010     3   dementia       1   2010       1  28.8      0        0        12     2.02
##  2   2006 A         3020     3   dementia       0   2014       1  28.8      1        0        16     1.77
##  3   2006 A     10001010     4   dementia       0   2014       0  23.5      0        0        12     1.88
##  4   2006 A     10003030     3.4 dementia       0   2014       0  28.5      0        0        16     1.25
##  5   2006 A     10004010     3.4 dementia       0   2010       1  27.0      1        0        16     1.75
##  6   2006 A     10004040     3.6 dementia       0   2014       1  27.0      0        0        12     2   
##  7   2006 A     10013010     2   dementia       1   2014       0  28.2      0        1        12     1.57
##  8   2006 A     10013040     2.6 dementia       0   2014       1  24.5      0        0        13     1.5 
##  9   2006 A     10038010     3.4 dementia       0   2014       1  23.3      0        0        16     1.75
## 10   2006 A     10038040     2.6 dementia       0   2014       1  23.3      0        0        16     1.78
## # ℹ 112,087 more rows
## # ℹ 14 more variables: gender <dbl>, heartProb <dbl>, height <dbl>, married <dbl>, race <dbl>,
## #   respDis <dbl>, smokes <dbl>, SRhealth <dbl>, stroke <dbl>, weight <dbl>, yearBrth <dbl>,
## #   dementia <dbl>, cognition <dbl>, age <dbl>

save(hrs_cov, hrs_alpha, hrs_pers, hrs_out, hrs_combined, hrs_cog,
     file = sprintf("%s/data/clean/hrs_cleaned.RData", local_path))

rm(list =ls()[grepl("hrs", ls())])

2.2 RUSH Memory and and Aging Project (RUSH-MAP)

The RUSH Memory and Aging Project (RUSH-MAP) is an ongoing longitudinal study that began in 1997 (a2012overview?). These data are available, through application from https://www.radc.rush.edu/requests.htm.

Participants who were 65 and older were recruited from retirement communities and subsidized senior housing facilities throughout Chicagoland and northeastern Illinois beginning in 1997. Data are collected annually, and all participants are organ donors. Additional participants are recuited each year. Additional information and documentation on the data can be found at https://www.radc.rush.edu/docs/var/variables.htm.

Sample sizes vary by year, ranging from 52 (1997) to 2205 participants including 884 deceased participants with autopsy data (2019, 2020). This provides 99% power to detect a zero-order correlation effect size of ~.10, two-tailed at alpha .05.

2.2.1 Load Data

(map_codebook <- (codebook %>% filter(study == "RADC-MAP"))$codebook[[1]] %>%
  mutate(orig_itemname = str_to_lower(orig_itemname)))

## # A tibble: 44 × 15
##    study    dataset category  name  itemname year  orig_itemname description scale reverse_code recode  mini
##    <chr>    <lgl>   <chr>     <chr> <chr>    <chr> <chr>         <chr>       <chr> <chr>        <chr>  <dbl>
##  1 RADC-MAP NA      cognition cogn… freeRec… long… cts_wlii      word list …  <NA> no           ifels…    NA
##  2 RADC-MAP NA      cognition cogn… cuedRec… long… cts_wliii     word list …  <NA> no           ifels…    NA
##  3 RADC-MAP NA      cognition cogn… digitsF… long… cts_df        digits bac…  <NA> no           ifels…    NA
##  4 RADC-MAP NA      cognition cogn… digitsB… long… cts_db        digits for…  <NA> no           ifels…    NA
##  5 RADC-MAP NA      cognition cogn… catFlue… long… cts_catflu    category f…  <NA> no           ifels…    NA
##  6 RADC-MAP NA      cognition cogn… bosNami… long… cts_bname     Boston nam…  <NA> no           ifels…    NA
##  7 RADC-MAP NA      cognition cogn… progMat  long… cts_pmat      progressiv…  <NA> no           ifels…    NA
##  8 RADC-MAP NA      cognition cogn… digitSy… long… cts_sdmt      symbol dig…  <NA> no           ifels…    NA
##  9 RADC-MAP NA      covariat… age   ageBase… base… age_bl        The age at…  <NA> no           ifels…    NA
## 10 RADC-MAP NA      covariat… alco… alcohol  base… alcohol_g_bl  Grams of a… "\r\… no           ifels…    NA
## # ℹ 34 more rows
## # ℹ 3 more variables: maxi <dbl>, comp_rule <chr>, long_rule <chr>

map <- sprintf("%s/rush-radc/dataset_1033_long_03-24-2021.xlsx", data_path) %>% read_excel() %>%
  full_join(sprintf("%s/rush-radc/dataset_1033_basic_03-24-2021.xlsx", data_path) %>% read_excel()) %>%
  full_join(sprintf("%s/rush-radc/dataset_1034_long_03-25-2021.xlsx", data_path) %>% read_excel()) %>%
  full_join(sprintf("%s/rush-radc/dataset_1034_basic_03-25-2021.xlsx", data_path) %>% read_excel()) %>%
  filter(study == "MAP")

2.2.2 Recoding & Reverse Scoring

rename_fun <- function(cb, var){
  print(var)
  old.names <- unique((map_codebook %>% filter(name == var))$orig_itemname)
  df <- map %>% 
    select(SID = projid, wave = fu_year, one_of(old.names)) %>%
    gather(key = orig_itemname, value = value, -SID, -wave, na.rm = T) %>%
    left_join(cb %>% select(itemname, orig_itemname, reverse_code:long_rule))
}

# join data with recoding info 
map_recode <- map_codebook %>%
  select(category, name, itemname, year, orig_itemname, reverse_code:long_rule) %>%
  filter(category %in% c("pers", "outcome", "covariates", "cognition")) %>%
  group_by(category, name) %>%
  nest() %>% 
  ungroup() %>%
  mutate(data = map2(data, name, rename_fun))

# recode 
recode_fun <- function(rule, y, year){
  x <- y$value
  if(!is.na(rule)){y$value <- eval(parse(text = rule))}
  return(y)
}

map_recode <- map_recode %>% 
  mutate(data = map(data, ~(.) %>% 
    mutate(wave = as.numeric(wave)) %>%
    group_by(recode, wave) %>%
    nest() %>%
    ungroup() %>%
    mutate(data = pmap(list(recode, data, wave), recode_fun)) %>%
    unnest(data) %>%
    mutate(value = ifelse(value < 0 | is.nan(value) | is.infinite(value), NA, value))))

# reverse code 
map_recode <- map_recode %>%
  mutate(data = map(data, ~(.) %>%
    mutate(value = as.numeric(value), 
           value = ifelse(reverse_code == "no" | is.na(reverse_code), value, 
                        reverse.code(-1, value, mini = mini, maxi = maxi)))))

fun_call <- function(x, rule){
    switch(rule,
           average = mean(x, na.rm = T),
           mode = Mode(x)[1],
           sum = sum(x, na.rm = T),
           skip = unique(x)[1],
           select = unique(x)[1],
           max = max(x, na.rm = T),
           min = min(x, na.rm = T))
}

2.2.3 Covariates

# composite WITHIN years 
map_cov <- map_recode %>%
  filter(category == "covariates") %>%
  select(-category) %>%
  mutate(data = map(data, ~(.) %>% 
      filter(!is.na(value)) %>%
      mutate(comp_rule = ifelse(is.na(comp_rule), "skip", comp_rule)) %>%
      group_by(comp_rule, SID, wave, long_rule) %>%
      nest() %>%
      ungroup() %>%
      mutate(data = map2(data, comp_rule, ~fun_call((.x)$value, .y))) %>%
      unnest(data) %>%
      filter(wave == 0)
      ))

comp_fun <- function(d, rule){
  d %>%
    group_by(SID, name) %>%
    summarize(value = fun_call(data, rule)) %>%
    ungroup() %>%
    distinct()
}

# composite ACROSS years
map_cov <- map_cov %>%
  unnest(data) %>%
  mutate(long_rule = ifelse(is.na(long_rule), "skip", long_rule)) %>%
  group_by(long_rule) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = pmap(list(data, long_rule), comp_fun)) %>%
  unnest(data) %>%
  mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value)) %>%
  select(-long_rule) %>%
  spread(name, value) %>%
  filter(!is.na(SID))

map_cov

## # A tibble: 2,192 × 16
##    SID        age alcohol   BMI cancer diabetes education exercise gender heartProb married  mmse parkinsons
##    <chr>    <dbl>   <dbl> <dbl>  <dbl>    <dbl>     <dbl>    <dbl>  <dbl>     <dbl>   <dbl> <dbl>      <dbl>
##  1 00009121  80.0       1  26.8      0        0        12        1      0         0       1    29          0
##  2 00033027  81.0       0  32.6      0        1        14        0      0         0       1    29          0
##  3 00045071  87.5       1  28.2      0        1        16        1      1         1       1    18          0
##  4 00130005  89.8       0  27.8      0        0        15        1      0         1       1    29          0
##  5 00204228  65.2       0  36.6      0        1         8        1      1         0       1    27          0
##  6 00228190  73.5       1  23.0      1        0        22        1      0         0       1    29          0
##  7 00246264  90.0       0  24.0      0        0        16        1      0         0       1    27          0
##  8 00285563  84.7       0  27.0      0        0        12        1      0         0       1    28          0
##  9 00402800  78.7       0  17.2      0        0        16        1      0         0       1    17          0
## 10 00482428  81.4       1  NA        1        0        12        1      0         0       1    30          0
## # ℹ 2,182 more rows
## # ℹ 3 more variables: race <dbl>, smokes <dbl>, stroke <dbl>

2.2.4 Personality Variables

map_pers <- map_recode %>%
  filter(category == "pers") %>%
  select(-category) %>% 
  unnest(data) %>% 
  distinct() %>%
  group_by(SID, name) %>%
  filter(wave == min(wave)) %>%
  ungroup() %>%
  select(SID, name, wave, value)

map_pers

## # A tibble: 9,115 × 4
##    SID      name   wave value
##    <chr>    <chr> <dbl> <dbl>
##  1 00009121 C         0    35
##  2 00045071 C         0    38
##  3 00130005 C         0    31
##  4 00246264 C         0    33
##  5 00402800 C         0    24
##  6 00582981 C         0    37
##  7 00617643 C         0    34
##  8 00696418 C         0    27
##  9 00701662 C         0    31
## 10 00709354 C         0    40
## # ℹ 9,105 more rows

2.2.5 Outcome Variables

map_out_waves <- map_recode %>%
  filter(category == "outcome") %>%
  unnest(data) %>%
  group_by(SID, name) %>%
  summarize(o_year = max(wave)) %>%
  ungroup()

map_out <- map_recode %>%
  filter(category == "outcome") %>%
  select(-category) %>%
  unnest(data) 

map_dem <- map_out %>% 
  filter(name == "dementia") %>%
  group_by(SID, name, wave) %>%
  summarize(value = max(value, na.rm = T)) %>%
  ungroup() %>%
  mutate(value = ifelse(is.nan(value) | is.infinite(value), NA, value),
         group = ifelse(wave > 0, "future", "past")) %>%
    group_by(SID, name, group) %>%
    summarize(value = max(value, na.rm = T)) %>%
    ungroup() %>%
    mutate(value = ifelse(is.infinite(value), NA, value)) %>%
    pivot_wider(names_from = group, values_from = value) %>%
    group_by(SID, name) %>%
    mutate(value = ifelse(is.na(past) | (past == 0 & !is.na(future)), future,
                   ifelse(past == 0 & is.na(future), past, 
                   ifelse(past == 1, NA, NA)))) %>%
  ungroup()

comp_fun <- function(df, rule){
  df %>%
    group_by(SID) %>%
    summarize(value = fun_call(value, rule)) %>%
    ungroup() %>%
    mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
}

# create composites
map_out <- map_out %>%
  filter(wave > 0 & name != "dementia") %>%
  group_by(name, long_rule) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = map2(data, long_rule, comp_fun)) %>%
  unnest(data) %>%
  select(-long_rule) %>%
  full_join(map_dem) %>%
  left_join(map_out_waves)

map_out

## # A tibble: 11,128 × 6
##    name        SID      value future  past o_year
##    <chr>       <chr>    <dbl>  <dbl> <dbl>  <dbl>
##  1 ageDementia 00045071  88.5     NA    NA      1
##  2 ageDementia 00285563  90.5     NA    NA     13
##  3 ageDementia 00668310  88.6     NA    NA      5
##  4 ageDementia 01243685  89.7     NA    NA      3
##  5 ageDementia 01797756  86.4     NA    NA     11
##  6 ageDementia 02108769  90.9     NA    NA      8
##  7 ageDementia 03227207  97.8     NA    NA     17
##  8 ageDementia 03380931  85.7     NA    NA      7
##  9 ageDementia 03806878  92.4     NA    NA      5
## 10 ageDementia 04330337 101.      NA    NA     15
## # ℹ 11,118 more rows

2.2.6 Cognition Variables

# composite within years
map_cog <- map_recode %>%
  filter(category == "cognition") %>%
  select(-category) %>%
  unnest(data) %>%
  filter(wave == 0) %>%
  filter(!is.na(value)) %>%
  group_by(name, itemname, wave) %>%
  mutate(value = pomp(value)) %>%
  group_by(name, SID) %>%
  summarize(value = mean(value, na.rm = T)) %>%
  ungroup() %>%
  pivot_wider(names_from = "name"
              , values_from = "value")

map_cog

## # A tibble: 2,189 × 2
##    SID      cognition
##    <chr>        <dbl>
##  1 00009121      7.55
##  2 00033027      6.17
##  3 00045071      4.84
##  4 00130005      6.59
##  5 00204228      5.81
##  6 00228190      7.38
##  7 00246264      6.16
##  8 00285563      5.79
##  9 00402800      4.14
## 10 00482428      6.58
## # ℹ 2,179 more rows

2.2.7 Combine Data

map_combined <- map_pers %>% 
  rename(Trait = name, p_value = value, p_year = wave) %>%
  full_join(
    map_out %>% 
      select(Outcome = name, SID, o_value = value, o_year)
    ) %>% 
  full_join(map_cov) %>%
  left_join(
    map_out %>%
      filter(name == "dementia") %>%
      mutate(value = ifelse(rowSums(cbind(future, past), na.rm = T) >= 1, 1, 0)) %>%
      select(-future, -past) %>%
      pivot_wider(names_from = "name", values_from = "value")
    ) %>%
  full_join(map_cog) %>%
  filter(!is.na(p_value) & !is.na(o_value))

map_combined

## # A tibble: 42,999 × 24
##    SID    Trait p_year p_value Outcome o_value o_year   age alcohol   BMI cancer diabetes education exercise
##    <chr>  <chr>  <dbl>   <dbl> <chr>     <dbl>  <dbl> <dbl>   <dbl> <dbl>  <dbl>    <dbl>     <dbl>    <dbl>
##  1 00009… C          0      35 dement…     0       10  80.0       1  26.8      0        0        12        1
##  2 00045… C          0      38 ageDem…    88.5      1  87.5       1  28.2      0        1        16        1
##  3 00045… C          0      38 dement…     1        1  87.5       1  28.2      0        1        16        1
##  4 00130… C          0      31 dement…     0        4  89.8       0  27.8      0        0        15        1
##  5 00246… C          0      33 angiop…     2        8  90.0       0  24.0      0        0        16        1
##  6 00246… C          0      33 arteri…     2        8  90.0       0  24.0      0        0        16        1
##  7 00246… C          0      33 athero…     1        8  90.0       0  24.0      0        0        16        1
##  8 00246… C          0      33 braak       3        8  90.0       0  24.0      0        0        16        1
##  9 00246… C          0      33 cerad       2        8  90.0       0  24.0      0        0        16        1
## 10 00246… C          0      33 hipScl…     0        8  90.0       0  24.0      0        0        16        1
## # ℹ 42,989 more rows
## # ℹ 10 more variables: gender <dbl>, heartProb <dbl>, married <dbl>, mmse <dbl>, parkinsons <dbl>,
## #   race <dbl>, smokes <dbl>, stroke <dbl>, dementia <dbl>, cognition <dbl>

save(map_cov, map_pers, map_out, map_combined, map_cog,
     file = sprintf("%s/data/clean/radc-map_cleaned.RData", load_path))

rm(list =ls()[grepl("map", ls())])

2.3 RUSH Religious Orders Study (ROS)

The RUSH Religious Orders Study (ROS) is an ongoing longitudinal study that began in 1994 (a2012overview?). These data are available, through application from https://www.radc.rush.edu/requests.htm.

Older (65 and above) Catholic nuns, priests, and brothers with no prior dementia diagnosis and who agreed to annual evaluations and eventual organ donation were recruited from more than 40 groups across the United States. Additional participants are recuited each year. Additional information and documentation on the data can be found at https://www.radc.rush.edu/docs/var/variables.htm.

Sample sizes vary bt year from 353 participants (1994) to 1487 participants, including 797 deceased participants with autopsy data (2019, 2020). This provides 99% power to detect a zero-order correlation effect size of ~.11, two-tailed at alpha .05.

(ros_codebook <- (codebook %>% filter(study == "ROS"))$codebook[[1]] %>%
  mutate(orig_itemname = str_to_lower(orig_itemname)))

## # A tibble: 42 × 15
##    study dataset category   name    itemname year  orig_itemname description scale reverse_code recode  mini
##    <chr> <lgl>   <chr>      <chr>   <chr>    <chr> <chr>         <chr>       <chr> <chr>        <chr>  <dbl>
##  1 ROS   NA      cognition  cognit… freeRec… long… cts_wlii      word list …  <NA> no           ifels…    NA
##  2 ROS   NA      cognition  cognit… cuedRec… long… cts_wliii     word list …  <NA> no           ifels…    NA
##  3 ROS   NA      cognition  cognit… digitsF… long… cts_df        digits bac…  <NA> no           ifels…    NA
##  4 ROS   NA      cognition  cognit… digitsB… long… cts_db        digits for…  <NA> no           ifels…    NA
##  5 ROS   NA      cognition  cognit… catFlue… long… cts_catflu    category f…  <NA> no           ifels…    NA
##  6 ROS   NA      cognition  cognit… bosNami… long… cts_bname     Boston nam…  <NA> no           ifels…    NA
##  7 ROS   NA      cognition  cognit… progMat  long… cts_pmat      progressiv…  <NA> no           ifels…    NA
##  8 ROS   NA      cognition  cognit… digitSy… long… cts_sdmt      symbol dig…  <NA> no           ifels…    NA
##  9 ROS   NA      covariates age     ageBase… base… age_bl        The age at…  <NA> no           ifels…    NA
## 10 ROS   NA      covariates alcohol alcohol  base… alcohol_g_bl  Grams of a… "\r\… no           ifels…    NA
## # ℹ 32 more rows
## # ℹ 3 more variables: maxi <dbl>, comp_rule <chr>, long_rule <chr>

ros <- sprintf("%s/rush-radc/dataset_1033_long_03-24-2021.xlsx", data_path) %>% read_excel() %>%
  full_join(sprintf("%s/rush-radc/dataset_1033_basic_03-24-2021.xlsx", data_path) %>% read_excel()) %>%
  full_join(sprintf("%s/rush-radc/dataset_1034_long_03-25-2021.xlsx", data_path) %>% read_excel()) %>%
  full_join(sprintf("%s/rush-radc/dataset_1034_basic_03-25-2021.xlsx", data_path) %>% read_excel()) %>%
  filter(study == "ROS")

2.3.1 Recoding & Reverse Scoring

rename_fun <- function(cb, var){
  print(var)
  old.names <- unique((ros_codebook %>% filter(name == var))$orig_itemname)
  df <- ros %>% 
    select(SID = projid, wave = fu_year, one_of(old.names)) %>%
    gather(key = orig_itemname, value = value, -SID, -wave, na.rm = T) %>%
    left_join(cb %>% select(itemname, orig_itemname, reverse_code:long_rule))
}

# join data with recoding info 
ros_recode <- ros_codebook %>%
  select(category, name, itemname, year, orig_itemname, reverse_code:long_rule) %>%
  filter(category %in% c("pers", "outcome", "covariates", "cognition")) %>%
  group_by(category, name) %>%
  nest() %>% 
  ungroup() %>%
  mutate(data = map2(data, name, rename_fun))

# recode 
recode_fun <- function(rule, y, year){
  x <- y$value
  if(!is.na(rule)){y$value <- eval(parse(text = rule))}
  return(y)
}

ros_recode <- ros_recode %>% 
  mutate(data = map(data, ~(.) %>% 
    mutate(wave = as.numeric(wave)) %>%
    group_by(recode, wave) %>%
    nest() %>%
    ungroup() %>%
    mutate(data = pmap(list(recode, data, wave), recode_fun)) %>%
    unnest(data) %>%
    mutate(value = ifelse(value < 0 | is.nan(value) | is.infinite(value), NA, value))))

# reverse code 
ros_recode <- ros_recode %>%
  mutate(data = map(data, ~(.) %>%
    mutate(value = as.numeric(value), 
           value = ifelse(reverse_code == "no" | is.na(reverse_code), value, 
                        reverse.code(-1, value, mini = mini, maxi = maxi)))))

fun_call <- function(x, rule){
    switch(rule,
           average = mean(x, na.rm = T),
           mode = Mode(x)[1],
           sum = sum(x, na.rm = T),
           skip = unique(x)[1],
           select = unique(x)[1],
           max = max(x, na.rm = T),
           min = min(x, na.rm = T))
}

2.3.2 Covariates

# composite WITHIN years 
ros_cov <- ros_recode %>%
  filter(category == "covariates") %>%
  select(-category) %>%
  mutate(data = map(data, ~(.) %>% 
      filter(!is.na(value)) %>%
      mutate(comp_rule = ifelse(is.na(comp_rule), "skip", comp_rule)) %>%
      group_by(comp_rule, SID, wave, long_rule) %>%
      nest() %>%
      ungroup() %>%
      mutate(data = map2(data, comp_rule, ~fun_call((.x)$value, .y))) %>%
      unnest(data) %>%
      filter(wave == 0)
      ))

comp_fun <- function(d, rule){
  d %>%
    group_by(SID, name) %>%
    summarize(value = fun_call(data, rule)) %>%
    ungroup() %>%
    distinct()
}

# composite ACROSS years
ros_cov <- ros_cov %>%
  unnest(data) %>%
  mutate(long_rule = ifelse(is.na(long_rule), "skip", long_rule)) %>%
  group_by(long_rule) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = pmap(list(data, long_rule), comp_fun)) %>%
  unnest(data) %>%
  mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value)) %>%
  select(-long_rule) %>%
  spread(name, value) %>%
  filter(!is.na(SID))

ros_cov

## # A tibble: 1,485 × 14
##    SID       age alcohol   BMI cancer diabetes education exercise gender heartProb  mmse  race smokes stroke
##    <chr>   <dbl>   <dbl> <dbl>  <dbl>    <dbl>     <dbl>    <dbl>  <dbl>     <dbl> <dbl> <dbl>  <dbl>  <dbl>
##  1 000210…  80.0       0  19.9      0        0        22        1      0         1    18     0      0      0
##  2 003377…  82.2       0  42.8      0        0        22        1      1         0    27     0      0      0
##  3 003811…  70.3       0  29.7      0        0        20        0      0         0    30     0      0     NA
##  4 004702…  72.6       1  20.0      1        0        18        1      0         0    30     0      0      0
##  5 007567…  87.3       0  33.2      0        1        20        0      1         0    23     1      1      0
##  6 009850…  78.6       0  23.7      1        0        21        1      0         0    28     0      0      0
##  7 012114…  85.2       0  25.1      0        0        12        0      1         0    24     0      1      0
##  8 012370…  72.0       0  18.8      0        0        16        1      0         0    29     0      0      0
##  9 016795…  65.2       1  36.4      0        0        22        1      0         0    30     0      0      0
## 10 021057…  85.2       1  21.5      0        0        18        1      0         0    29     0      0      0
## # ℹ 1,475 more rows

2.3.3 Personality Variables

ros_pers <- ros_recode %>%
  filter(category == "pers") %>%
  select(-category) %>% 
  unnest(data) %>% 
  distinct() %>%
  group_by(SID, name) %>%
  filter(wave == min(wave)) %>%
  ungroup() %>%
  select(SID, name, wave, value)

ros_pers

## # A tibble: 7,569 × 4
##    SID      name   wave value
##    <chr>    <chr> <dbl> <dbl>
##  1 00021073 A         0    32
##  2 00337708 A         0    30
##  3 00381112 A         0    36
##  4 00470212 A         0    41
##  5 00756793 A         0    27
##  6 00985084 A         0    33
##  7 01211411 A         0    27
##  8 01237015 A         0    34
##  9 01679543 A         0    43
## 10 02105734 A         0    36
## # ℹ 7,559 more rows

2.3.4 Outcome Variables

ros_out_waves <- ros_recode %>%
  filter(category == "outcome") %>%
  unnest(data) %>%
  group_by(SID, name) %>%
  summarize(o_year = max(wave)) %>%
  ungroup()

ros_out <- ros_recode %>%
  filter(category == "outcome") %>%
  select(-category) %>%
  unnest(data) 

ros_dem <- ros_out %>% 
  filter(name == "dementia") %>%
  group_by(SID, name, wave) %>%
  summarize(value = max(value, na.rm = T)) %>%
  ungroup() %>%
  mutate(value = ifelse(is.nan(value) | is.infinite(value), NA, value),
         group = ifelse(wave > 0, "future", "past")) %>%
    group_by(SID, name, group) %>%
    summarize(value = max(value, na.rm = T)) %>%
    ungroup() %>%
    mutate(value = ifelse(is.infinite(value), NA, value)) %>%
    pivot_wider(names_from = group, values_from = value) %>%
    group_by(SID, name) %>%
    mutate(value = ifelse(is.na(past) | (past == 0 & !is.na(future)), future,
                   ifelse(past == 0 & is.na(future), past, 
                   ifelse(past == 1, NA, NA)))) %>%
  ungroup()

comp_fun <- function(df, rule){
  df %>%
    group_by(SID) %>%
    summarize(value = fun_call(value, rule)) %>%
    ungroup() %>%
    mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
}

# create composites
ros_out <- ros_out %>%
  filter(wave > 0 & name != "dementia") %>%
  group_by(name, long_rule) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = map2(data, long_rule, comp_fun)) %>%
  unnest(data) %>%
  select(-long_rule) %>%
  full_join(ros_dem) %>%
  left_join(ros_out_waves)

ros_out

## # A tibble: 9,581 × 6
##    name        SID      value future  past o_year
##    <chr>       <chr>    <dbl>  <dbl> <dbl>  <dbl>
##  1 ageDementia 02495739  92.5     NA    NA     13
##  2 ageDementia 07299965  80.0     NA    NA     13
##  3 ageDementia 07660182  83.7     NA    NA      4
##  4 ageDementia 10100150  84.5     NA    NA     12
##  5 ageDementia 10100286  80.7     NA    NA     18
##  6 ageDementia 10101039  90.4     NA    NA      8
##  7 ageDementia 10101589 107.      NA    NA      6
##  8 ageDementia 10101741  92.4     NA    NA     12
##  9 ageDementia 10116694  82.2     NA    NA      5
## 10 ageDementia 10200901  96.2     NA    NA     23
## # ℹ 9,571 more rows

2.3.5 Cognition Variables

# composite within years
ros_cog <- ros_recode %>%
  filter(category == "cognition") %>%
  select(-category) %>%
  unnest(data) %>%
  filter(wave == 0) %>%
  filter(!is.na(value)) %>%
  group_by(name, itemname, wave) %>%
  mutate(value = pomp(value)) %>%
  group_by(name, SID) %>%
  summarize(value = mean(value, na.rm = T)) %>%
  ungroup() %>%
  pivot_wider(names_from = "name"
              , values_from = "value")

ros_cog

## # A tibble: 1,485 × 2
##    SID      cognition
##    <chr>        <dbl>
##  1 00021073      4.19
##  2 00337708      5.89
##  3 00381112      7.02
##  4 00470212      6.68
##  5 00756793      4.35
##  6 00985084      6.83
##  7 01211411      3.83
##  8 01237015      5.64
##  9 01679543      7.23
## 10 02105734      6.64
## # ℹ 1,475 more rows

2.3.6 Combine Data

ros_combined <- ros_pers %>% 
  rename(Trait = name, p_value = value, p_year = wave) %>%
  full_join(
    ros_out %>% 
      select(Outcome = name, SID, o_value = value, o_year)
    ) %>%
  full_join(ros_cov) %>%
  left_join(
    ros_out %>%
      filter(name == "dementia") %>%
      mutate(value = ifelse(rowSums(cbind(future, past), na.rm = T) >= 1, 1, 0)) %>%
      select(-future, -past) %>%
      pivot_wider(names_from = "name", values_from = "value")
    ) %>%
  full_join(ros_cog) %>%
  filter(!is.na(p_value) & !is.na(o_value))

ros_combined

## # A tibble: 48,074 × 22
##    SID    Trait p_year p_value Outcome o_value o_year   age alcohol   BMI cancer diabetes education exercise
##    <chr>  <chr>  <dbl>   <dbl> <chr>     <dbl>  <dbl> <dbl>   <dbl> <dbl>  <dbl>    <dbl>     <dbl>    <dbl>
##  1 00021… A          0      32 angiop…       3      2  80.0       0  19.9      0        0        22        1
##  2 00021… A          0      32 arteri…       0      2  80.0       0  19.9      0        0        22        1
##  3 00021… A          0      32 athero…       2      2  80.0       0  19.9      0        0        22        1
##  4 00021… A          0      32 braak         6      2  80.0       0  19.9      0        0        22        1
##  5 00021… A          0      32 cerad         1      2  80.0       0  19.9      0        0        22        1
##  6 00021… A          0      32 hipScl…       0      2  80.0       0  19.9      0        0        22        1
##  7 00021… A          0      32 lewyBo…       0      2  80.0       0  19.9      0        0        22        1
##  8 00021… A          0      32 vsclrI…       0      2  80.0       0  19.9      0        0        22        1
##  9 00021… A          0      32 vsclrM…       1      2  80.0       0  19.9      0        0        22        1
## 10 00021… A          0      32 tdp43         1      2  80.0       0  19.9      0        0        22        1
## # ℹ 48,064 more rows
## # ℹ 8 more variables: gender <dbl>, heartProb <dbl>, mmse <dbl>, race <dbl>, smokes <dbl>, stroke <dbl>,
## #   dementia <dbl>, cognition <dbl>

save(ros_cov, ros_pers, ros_out, ros_combined, ros_cog,
     file = sprintf("%s/data/clean/ros_cleaned.RData", load_path))

rm(list =ls()[grepl("ros", ls())])

2.4 Swedish Adoption Twin Study of Aging (SATSA)

The Swedish Adoption Twin Study of Aging (SATSA) is a longitudinal study of twin pairs from the Swedish Twin Registry that began in 1984. Data are available through the ICPSR database at https://www.icpsr.umich.edu/web/ICPSR/studies/3843.

All twin-pairs on the Swedish Twin Registry who were separated at an early age were invited to be a part of the study in 1984. A control sample of twins reared together were also included. Additional waves of all participants were collected in 1987, 1990, 1993, 2004, 2007, 2010, 2012, and 2014. More information, including codebooks, scales, and variable search functions can be found at https://www.maelstrom-research.org/mica/individual-study/satsa/#.

Sample sizes vary by wave, ranging from 2018 participants at baseline (1984) to 379 participants (IPT7). Given that the target measures were collected at baseline, this provides 99% power to detect a zero-order correlation effect size of ~.10, two-tailed at alpha .05.

2.4.1 Load Data

satsa_read_fun <- function(x){
  prob_vars <- c("FHEART", "FPARKIN", "FSTROKE")
  y <- sprintf("%s/satsa/%s", data_path, x) %>% haven::read_sav(.) %>% 
    select(SID = TWINNR, one_of(old.names)) %>%
    as_tibble() %>%
    haven::zap_labels(.)
  if(any(prob_vars %in% colnames(y))){
    y <- y %>% mutate_at(vars(one_of(prob_vars)), ~as.numeric(as.character(.)))
    }
  return(y)
}

satsa_codebook <- (codebook %>% filter(study == "SATSA"))$codebook[[1]] %>%
  mutate_at(vars(orig_itemname), str_to_upper)
satsa_codebook

## # A tibble: 840 × 17
##    study dataset    category  name      itemname wave_letter  year item_stem orig_itemname description scale
##    <chr> <chr>      <chr>     <chr>     <chr>    <chr>       <dbl> <chr>     <chr>         <chr>       <chr>
##  1 satsa SATSA_IPT1 cognition cognition blockDe… <NA>         1985 <NA>      IBLOC_R1      Block Desi… inte…
##  2 satsa SATSA_IPT2 cognition cognition blockDe… <NA>         1989 <NA>      IBLOC_R2      Block Desi… inte…
##  3 satsa SATSA_IPT3 cognition cognition blockDe… <NA>         1992 <NA>      IBLOC_R3      Block Desi… inte…
##  4 satsa SATSA_IPT4 cognition cognition blockDe… <NA>         1995 <NA>      IBLOC_R4      Block Desi… inte…
##  5 satsa SATSA_IPT5 cognition cognition blockDe… <NA>         1999 <NA>      IBLOC_R5      Block Desi… inte…
##  6 satsa SATSA_IPT6 cognition cognition blockDe… <NA>         2002 <NA>      IBLOC_R6      Block Desi… inte…
##  7 satsa SATSA_IPT7 cognition cognition blockDe… <NA>         2005 <NA>      IBLOC_R7      Block Desi… inte…
##  8 satsa SATSA_IPT1 cognition cognition digitSp… <NA>         1985 <NA>      IDGSP_R1      Digit Span… inte…
##  9 satsa SATSA_IPT2 cognition cognition digitSp… <NA>         1989 <NA>      IDGSP_R2      Digit Span… inte…
## 10 satsa SATSA_IPT3 cognition cognition digitSp… <NA>         1992 <NA>      IDGSP_R3      Digit Span… inte…
## # ℹ 830 more rows
## # ℹ 6 more variables: reverse_code <chr>, recode <chr>, mini <dbl>, maxi <dbl>, comp_rule <chr>,
## #   long_rule <chr>

old.names <- unique(satsa_codebook$orig_itemname) %>% str_to_upper
datasets <- sprintf("%s/satsa", data_path) %>% list.files(., pattern = ".sav")
satsa <- tibble(datasets = datasets) %>%
  mutate(data = map(datasets, satsa_read_fun),
         ncol = map_dbl(data, ncol)) %>%
  filter(ncol != 0)

satsa <- reduce(satsa$data, full_join)
satsa <- satsa %>%
  mutate_if(is.factor, ~as.numeric(sub("^\\(0*([0-9]+)\\).+$", "\\1", .))) 
satsa_long <- satsa %>% 
  pivot_longer(
    names_to = "orig_itemname"
    , values_to = "value"
    , cols = -SID
    # , values_drop_na = T
    )
save(satsa, file = sprintf("%s/data/clean/satsa_raw.RData", load_path))
rm(satsa)

2.4.2 Recoding & Reverse Scoring

satsa_waves <- p_waves %>% filter(Study == "SATSA") %>% select(Used) %>% distinct()

# join data with recoding info 
satsa_recode <- satsa_codebook %>%
  select(category, name, itemname, year, orig_itemname, reverse_code:long_rule) %>%
  filter(category %in% c("pers", "outcome", "covariates", "cognition")) %>%
  group_by(category, name) %>%
  nest() %>% 
  ungroup() %>%
  mutate(data = map(data, ~(.) %>% left_join(satsa_long)))

# recode 
recode_fun <- function(rule, y, year){
  x <- y$value
  if(!is.na(rule)){y$value <- eval(parse(text = rule))}
  return(y)
}

satsa_recode <- satsa_recode %>% 
  mutate(data = map(data, ~(.) %>% 
    group_by(recode, year) %>%
    nest() %>%
    ungroup() %>%
    mutate(data = pmap(list(recode, data, year), recode_fun)) %>%
    unnest(data) %>%
    mutate(value = ifelse(value < 0 | is.nan(value) | is.infinite(value), NA, value)) %>%
    filter(!is.na(value))))


# reverse code 
satsa_recode <- satsa_recode %>%
  mutate(data = map(data, ~(.) %>%
    mutate(value = ifelse(reverse_code == "no" | is.na(reverse_code), value, 
                        reverse.code(-1, value, mini = mini, maxi = maxi)))))

fun_call <- function(x, rule){
    switch(rule,
           average = mean(x, na.rm = T),
           mode = Mode(x)[1],
           sum = sum(x, na.rm = T),
           skip = unique(x)[1],
           select = unique(x)[1],
           max = max(x, na.rm = T),
           min = min(x, na.rm = T))
  }

2.4.3 Covariates

load(sprintf("%s/data/clean/satsa_cleaned.RData", local_path))

satsa_cov <- satsa_recode %>% filter(category == "covariates")
# bring in year or birth for cleaning
yrBrth <- satsa_cov %>% 
  filter(name == "yearBrth") %>% 
  unnest(data) %>%
  group_by(SID) %>%
  summarize(yearBrth = max(value, na.rm = T)) %>%
  ungroup() %>%
  mutate(yearBrth = ifelse(is.infinite(yearBrth), NA, yearBrth))

# compositing within years
year_comp_fun <- function(df, rule){
  df %>%
    # group by person and item (collapse across age)
    group_by(SID, name, year, long_rule) %>% 
    summarize(value = fun_call(value, rule)) %>%
    ungroup() %>% 
    mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
}

satsa_waves <- p_waves %>% filter(Study == "SATSA") %>% select(Used) %>% distinct()

satsa_cov <- satsa_cov %>%
  unnest(data) %>%
  filter(year <= max(satsa_waves$Used) + 1) %>%
  mutate(comp_rule = ifelse(is.na(comp_rule) | comp_rule == "none", "skip", comp_rule)) %>%
  group_by(comp_rule) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = map2(data, comp_rule, year_comp_fun)) %>%
  unnest(data)

comp_fun <- function(d, rule){
  d %>%
    group_by(SID) %>%
    summarize(value = fun_call(value, rule)) %>%
    ungroup()
}

satsa_cov <- satsa_cov %>%
  group_by(name, long_rule) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = map2(data, long_rule, comp_fun)) %>%
  unnest(data) %>%
  mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value)) %>%
  select(-long_rule) %>%
  pivot_wider(names_from = name, values_from = value, values_fn = list(value = max))  %>%
  mutate(BMI = weight/((height/100)^2))

satsa_cov

## # A tibble: 3,840 × 18
##      SID smokes alcohol cancer diabetes heartProb married parkinsons respProb stroke gender education  mmse
##    <dbl>  <dbl>   <dbl>  <dbl>    <dbl>     <dbl>   <dbl>      <dbl>    <dbl>  <dbl>  <dbl>     <dbl> <dbl>
##  1   181      0      NA     NA       NA        NA      NA         NA       NA     NA      0        NA    NA
##  2   182      0      NA     NA       NA        NA      NA         NA       NA     NA      0        NA    NA
##  3  1101      0      NA     NA       NA        NA      NA         NA       NA     NA      0        NA    NA
##  4  1102      0      NA     NA       NA        NA      NA         NA       NA     NA      0        NA    NA
##  5  1121      0      NA     NA       NA        NA      NA         NA       NA     NA      1        NA    NA
##  6  1122      0      NA     NA       NA        NA      NA         NA       NA     NA      1        NA    NA
##  7  1151      0      NA     NA       NA        NA      NA         NA       NA     NA      1        NA    NA
##  8  1152      0      NA     NA       NA        NA      NA         NA       NA     NA      1        NA    NA
##  9  1211      0      NA     NA       NA        NA      NA         NA       NA     NA      1        NA    NA
## 10  1212      0      NA     NA       NA        NA      NA         NA       NA     NA      1        NA    NA
## # ℹ 3,830 more rows
## # ℹ 5 more variables: yearBrth <dbl>, SRhealth <dbl>, height <dbl>, weight <dbl>, BMI <dbl>

2.4.4 Personality Variables

satsa_pers <- satsa_recode %>%
  filter(category == "pers") %>%
  unnest(data) %>%
  left_join(p_waves %>% filter(Study == "SATSA") %>% select(name = p_item, Used)) %>%
  filter(year %in% Used & !is.na(value)) %>%
  distinct() 

# alpha's
satsa_alpha <- satsa_pers %>%
  select(name, itemname, year, SID, value) %>%
  group_by(name, year) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = map(data, ~(.) %>% spread(itemname, value)),
         alpha = map(data, possibly(~psych::alpha((.) %>% select(-SID)), NA_real_)))

# create composites
satsa_pers <- satsa_pers %>%
  group_by(SID, name, year) %>%
  summarize(value = mean(value, na.rm = T)) %>%
  ungroup()

satsa_pers

## # A tibble: 14,109 × 4
##      SID name   year value
##    <dbl> <chr> <dbl> <dbl>
##  1  1292 A      1984  4   
##  2  1292 C      1984  4   
##  3  1292 E      1984  3.67
##  4  1292 N      1984  4.33
##  5  1292 O      1984  3.5 
##  6  1292 SWL    1984  2.54
##  7  1701 C      1984  5   
##  8  1701 E      1984  3.89
##  9  1701 N      1984  4   
## 10  1701 O      1984  1   
## # ℹ 14,099 more rows

2.4.5 Outcome Variables

satsa_out <- satsa_recode %>% 
  filter(category == "outcome") %>%
  unnest(data) %>%
  distinct() %>%
  full_join(crossing(p_year = satsa_waves$Used, name = unique((.)$name)))

satsa_out_waves <- satsa_out %>%
  group_by(SID, name) %>%
  summarize(o_year = max(year[!is.na(value)])) %>%
  ungroup()

satsa_waves <- p_waves %>% filter(Study == "SATSA") %>% select(Used) %>% distinct()

satsa_out <- satsa_out %>%
  filter(year > p_year) %>%
  group_by(SID, name, year, p_year) %>%
  summarize(value = max(value, na.rm = T)) %>% 
  ungroup() %>%
  mutate(value = ifelse(is.nan(value)|is.infinite(value), NA, value)) %>%
  group_by(SID, p_year, name) %>%
  summarize(value = max(value, na.rm = T)) %>%
  ungroup() %>%
  mutate(value = ifelse(is.infinite(value), NA, value)) %>%
  left_join(satsa_out_waves)

satsa_out

## # A tibble: 3,840 × 5
##      SID p_year name     value o_year
##    <dbl>  <dbl> <chr>    <dbl>  <dbl>
##  1   181   1984 dementia     0   2005
##  2   182   1984 dementia     0   2005
##  3  1101   1984 dementia     0   2005
##  4  1102   1984 dementia     0   2005
##  5  1121   1984 dementia     0   2005
##  6  1122   1984 dementia     0   2005
##  7  1151   1984 dementia     0   2005
##  8  1152   1984 dementia     0   2005
##  9  1211   1984 dementia     0   2005
## 10  1212   1984 dementia     0   2005
## # ℹ 3,830 more rows

2.4.6 Cognition Variables

# composite within years
satsa_cog <- satsa_recode %>%
  filter(category == "cognition") %>%
  select(-category) %>%
  unnest(data) %>%
  filter(year == 1985) %>%
  filter(!is.na(value)) %>%
  group_by(name, itemname, year) %>%
  mutate(value = pomp(value)) %>%
  group_by(name, SID) %>%
  summarize(value = mean(value, na.rm = T)) %>%
  ungroup() %>%
  pivot_wider(names_from = "name"
              , values_from = "value")

2.4.7 Combine Data

satsa_combined <- satsa_pers %>% 
  rename(Trait = name, p_value = value, p_year = year) %>%
  full_join(satsa_out %>% rename(Outcome = name, o_value = value)) %>%
  full_join(satsa_cov) %>%
  left_join(
    satsa_out %>%
      filter(name == "dementia") %>%
      pivot_wider(names_from = "name", values_from = "value")
    ) %>%
  full_join(satsa_cog) %>%
  filter(!is.na(p_value) & !is.na(o_value)) %>%
  mutate(age = p_year - yearBrth
         , o_year = 2005)

satsa_combined

## # A tibble: 14,109 × 27
##      SID Trait p_year p_value Outcome  o_value o_year smokes alcohol cancer diabetes heartProb married
##    <dbl> <chr>  <dbl>   <dbl> <chr>      <dbl>  <dbl>  <dbl>   <dbl>  <dbl>    <dbl>     <dbl>   <dbl>
##  1  1292 A       1984    4    dementia       0   2005      0       0      1        0         1       1
##  2  1292 C       1984    4    dementia       0   2005      0       0      1        0         1       1
##  3  1292 E       1984    3.67 dementia       0   2005      0       0      1        0         1       1
##  4  1292 N       1984    4.33 dementia       0   2005      0       0      1        0         1       1
##  5  1292 O       1984    3.5  dementia       0   2005      0       0      1        0         1       1
##  6  1292 SWL     1984    2.54 dementia       0   2005      0       0      1        0         1       1
##  7  1701 C       1984    5    dementia       0   2005      0       0      0        0         1      NA
##  8  1701 E       1984    3.89 dementia       0   2005      0       0      0        0         1      NA
##  9  1701 N       1984    4    dementia       0   2005      0       0      0        0         1      NA
## 10  1701 O       1984    1    dementia       0   2005      0       0      0        0         1      NA
## # ℹ 14,099 more rows
## # ℹ 14 more variables: parkinsons <dbl>, respProb <dbl>, stroke <dbl>, gender <dbl>, education <dbl>,
## #   mmse <dbl>, yearBrth <dbl>, SRhealth <dbl>, height <dbl>, weight <dbl>, BMI <dbl>, dementia <dbl>,
## #   cognition <dbl>, age <dbl>

save(satsa_cov, satsa_alpha, satsa_pers, satsa_out, satsa_combined, satsa_cog,
     file = sprintf("%s/data/clean/satsa_cleaned.RData", load_path))

rm(list =ls()[grepl("satsa", ls())])

2.5 ADRC Memory and Aging Project (ADRC-MAP)

The Alzheimer Disease Research Center Memory and Aging Project (ADRC-MAP) is an ongoing longitudinal study of memory and Alzheimer’s Disease that began in 1979. Data are available on a study-by-study basis through application from https://knightadrc.wustl.edu/Research/ResourceRequest.htm.

Participants were recruited from the Charles and Joanne F. Knight Alzheimer’s Disease Research Center at Washington University in St. Louis as part of an ongoing study of disease progression. The current study uses a subset of approximately 1200 of these participants who completed personality surveys as part of a substudy (see Duchek et al., 2019). More information on the study can be found at https://knightadrc.wustl.edu/Research/PDFs/Clinical%20Core%20list%20of%20measures.pdf.

Sample sizes vary over time, from approximately 400 to 1200. This provides 99% power to detect a zero-order correlation effect size of ~.15, two-tailed at alpha .05.

2.5.1 Load Data

(adrc_codebook <- (codebook %>% filter(study == "ADRC"))$codebook[[1]])

## # A tibble: 106 × 15
##    study dataset category  name     itemname year  orig_itemname description scale reverse_code recode  mini
##    <chr> <lgl>   <chr>     <chr>    <chr>    <chr> <chr>         <chr>       <chr> <chr>        <chr>  <dbl>
##  1 ADRC  NA      cognition cogniti… waisBlc… long… PSY021        "WAIS BLOC… Rang… no           ifels…    NA
##  2 ADRC  NA      cognition cogniti… waisInfo long… PSY019        "WAIS INFO… Rang… no           ifels…    NA
##  3 ADRC  NA      cognition cogniti… waisRDi… long… DIGSYM        "WAIS-R DI… Rang… no           ifels…    NA
##  4 ADRC  NA      cognition cogniti… digFrwd… long… DIGFORCT      "NUMBER SP… Rang… no           ifels…    NA
##  5 ADRC  NA      cognition cogniti… digBckw… long… DIGBACCT      "NUMBER SP… Rang… no           ifels…    NA
##  6 ADRC  NA      cognition cogniti… trailMa… long… TRAILA        "The score… Rang… no           ifels…    NA
##  7 ADRC  NA      cognition cogniti… trailMa… long… TRAILB        "The score… Rang… no           ifels…    NA
##  8 ADRC  NA      cognition cogniti… cuedRec… long… SRT1C         "Free & Cu… Rang… no           ifels…    NA
##  9 ADRC  NA      cognition cogniti… cuedRec… long… SRT2C         "Free & Cu… Rang… no           ifels…    NA
## 10 ADRC  NA      cognition cogniti… cuedRec… long… SRT3C         "Free & Cu… Rang… no           ifels…    NA
## # ℹ 96 more rows
## # ℹ 3 more variables: maxi <dbl>, comp_rule <chr>, long_rule <chr>

adrc_read_fun <- function(file){
  print(file)
  d <- sprintf("%s/adrc-map/%s", data_path, file) %>% read_excel(.) %>%
    select(SID = id, one_of(c("TESTDATE", old.names)), contains("NEO Date"))
  if("TESTDATE" %in% colnames(d)){ if(any(class(d$TESTDATE) != "numeric")){d$TESTDATE <- lubridate::year(d$TESTDATE)}}
  d
}

old.names <- unique(adrc_codebook$orig_itemname)
adrc <- tibble(file = list.files(sprintf("%s/adrc-map", data_path), pattern = ".xlsx")) %>%
  filter(!grepl("NEO", file)) %>%
  mutate(data = map(file, adrc_read_fun)) %>%
  filter(map_dbl(data, ncol) > 1)

waves <- adrc %>%
  mutate(data = map(data, ~(.) %>% select(SID, one_of("TESTDATE")))) %>%
  select(-file) %>%
  unnest(data) %>%
  filter(complete.cases(.)) %>%
  # mutate(year = lubridate::year(TESTDATE)) %>%
  distinct() %>%
  arrange(SID, TESTDATE) %>%
  group_by(SID) %>%
  mutate(frstyear = min(TESTDATE)) %>%
  ungroup() %>%
  mutate(year = TESTDATE, 
         wave = year - frstyear + 1)

adrc_long <- reduce(adrc$data, full_join) %>%
  distinct() %>%
  select(SID, year = TESTDATE, everything()) %>%
  mutate(BIRTH = lubridate::year(BIRTH)) %>%
  pivot_longer(cols = c(-SID, -year)
               , names_to = "orig_itemname"
               , values_to = "value")

# load personality data separately
old.names <- (adrc_codebook %>% filter(category == "pers"))$orig_itemname
adrc_pers <- sprintf("%s/adrc-map/NEO - Raw Scores_FINAL.xlsx", data_path) %>% 
  read_xlsx() %>%
  select(SID = id, date = `NEO Date 1`, one_of(old.names)) %>%
  mutate(date = ifelse(grepl("[//]", date), as.numeric(as.Date(date, format = "%m/%d/%Y")), date),
         date = as.Date(as.numeric(date), origin="1899-12-30")) %>%
  filter(!is.na(date))
# get waves for participants
adrc_pers_waves <- adrc_pers %>%
  select(SID, p_year = date) %>%
  distinct() %>%
  mutate(p_year = lubridate::year(p_year))

2.5.2 Recode & Reverse-Scoring

adrc_recode <- adrc_codebook %>%
  filter(category %in%  c("covariates", "outcome", "cognition") & !is.na(orig_itemname)) %>%
  select(category, name, itemname, orig_itemname, reverse_code:long_rule) %>%
  right_join(adrc_long) %>%
  left_join(adrc_pers_waves)

# recode 

recode_fun <- function(rule, y, year){
  # print(rule)
  x <- y$value
  if(!is.na(rule)){y$value <- eval(parse(text = rule))}
  return(y)
}

adrc_recode <- adrc_recode %>%
  group_by(recode, year) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = pmap(list(recode, data, year), recode_fun)) %>%
  unnest(data)

# reverse code 
adrc_recode <- adrc_recode %>%
  mutate(value = ifelse(reverse_code == "no" | is.na(reverse_code), value, 
                        reverse.code(-1, value, mini = mini, maxi = maxi))) %>%
  group_by(category, name) %>%
  nest() %>% 
  ungroup()

fun_call <- function(x, rule){
    switch(rule,
           average = mean(x, na.rm = T),
           mode = Mode(x)[1],
           sum = sum(x, na.rm = T),
           skip = unique(x)[1],
           select = unique(x)[1],
           max = max(x, na.rm = T),
           min = min(x, na.rm = T))
  }

2.5.3 Covariates

load(sprintf("%s/data/clean/adrc_cleaned.RData", local_path))

# compositing within years
year_comp_fun <- function(df, rule){
  df %>%
    # group by person and item (collapse across age)
    group_by(SID, name, year, p_year, long_rule) %>% 
    summarize(value = fun_call(value, rule)) %>%
    ungroup() %>% 
    mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
}

adrc_cov <- adrc_recode %>%
  filter(category == "covariates") %>%
  unnest(data) %>%
  filter(year <= p_year) %>%
  mutate(comp_rule = ifelse(is.na(comp_rule) | comp_rule == "none", "skip", comp_rule)) %>%
  group_by(comp_rule) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = map2(data, comp_rule, year_comp_fun)) %>%
  unnest(data) %>%
  select(-comp_rule)

comp_fun <- function(d, rule){
  d %>%
    group_by(SID, name) %>%
    summarize(value = fun_call(value, rule)) %>%
    ungroup()
}

adrc_cov <- adrc_cov %>%
  filter(!is.na(value) & !is.na(name)) %>%
  group_by(long_rule) %>%
  nest() %>%
  ungroup() %>% 
  mutate(data = map2(data, long_rule, comp_fun)) %>%
  unnest(data) %>% 
  select(-long_rule) %>%
  mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value)) %>%
  pivot_wider(names_from = name, values_from = value, values_fn = list(value = max)) %>%
  right_join(adrc_pers_waves) %>%
  mutate(age = p_year - yearBrth)

adrc_cov

## # A tibble: 1,162 × 15
##      SID gender yearBrth alcohol cancer diabetes education heartProb married  race smokes stroke weight
##    <dbl>  <dbl>    <dbl>   <dbl>  <dbl>    <dbl>     <dbl>     <dbl>   <dbl> <dbl>  <dbl>  <dbl>  <dbl>
##  1  1070      0     1918       1      1        0        17         1       1     0      0      0    187
##  2  1088      0     1916       1      1        0        16         1       1     0      0      0    217
##  3  1380      1     1913       1      1        0        16         1       0     0      0      0    129
##  4 10018      0     1929       0      0        0        14         1       1     0      1      0    156
##  5 10034      0     1930       0      1        0        16         0       1     0      0      0    193
##  6 10037      1     1923       1      1        0        13         1       1     0      1      0    179
##  7 10038      0     1927       1      0        0        18         0       1     0      0      0    205
##  8 10045      1     1931       1      1        0        16         0       1     0      1      0    160
##  9 10054      1     1930       0      1        0        13         1       1     0      1      0    173
## 10 10064      0     1928       1      1        0        20         1       1     0      1      0    199
## # ℹ 1,152 more rows
## # ℹ 2 more variables: p_year <dbl>, age <dbl>

2.5.4 Personality Variables

# bring in codebook info
adrc_pers <- adrc_pers %>%
  pivot_longer(`1S1`:`1S60`
               , names_to = "orig_itemname"
               , values_to = "value"
               , values_drop_na = T) %>%
  left_join(adrc_codebook %>% select(name:orig_itemname, reverse_code:maxi))

recode_fun <- function(rule, y){
  x <- y$value
  if(!is.na(rule)){y$value <- eval(parse(text = rule))}
  return(y)
}

# recode 
adrc_pers <- adrc_pers %>%
  group_by(recode) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = map2(recode, data, recode_fun)) %>%
  unnest(data) 

# reverse code 
adrc_pers <- adrc_pers %>%
  mutate(value = ifelse(tolower(reverse_code) == "no" | is.na(reverse_code), value, 
                        reverse.code(-1, value, mini = mini, maxi = maxi)))

# alpha's
adrc_alpha <- adrc_pers %>%
  select(name, itemname, date, SID, value) %>%
  group_by(name) %>%
  nest() %>%
  mutate(data = map(data, ~(.) %>% pivot_wider(names_from = itemname, values_from = value)),
         alpha = map(data, possibly(~psych::alpha((.) %>% select(-SID, -date)), NA_real_)))

# create composites
adrc_pers <- adrc_pers %>%
  group_by(SID, name, date) %>%
  summarize(value = mean(value, na.rm = T))  %>%
  ungroup() %>%
  left_join(adrc_pers_waves)

adrc_pers

## # A tibble: 5,730 × 5
##      SID name  date       value p_year
##    <dbl> <chr> <date>     <dbl>  <dbl>
##  1  1070 A     2003-10-28  4.08   2003
##  2  1070 C     2003-10-28  4.83   2003
##  3  1070 E     2003-10-28  4.08   2003
##  4  1070 N     2003-10-28  1.08   2003
##  5  1070 O     2003-10-28  3.17   2003
##  6  1088 A     2005-04-18  4.5    2005
##  7  1088 C     2005-04-18  3.42   2005
##  8  1088 E     2005-04-18  3.08   2005
##  9  1088 N     2005-04-18  1.25   2005
## 10  1088 O     2005-04-18  3.08   2005
## # ℹ 5,720 more rows

2.5.5 Outcome Variables

adrc_out <- adrc_recode %>%
  filter(category == "outcome") %>%
  unnest(data)

adrc_out_waves <- adrc_out %>%
  select(year, SID, name) %>%
  group_by(SID, name) %>%
  summarize(o_year = max(year)) %>% 
  ungroup()

# compositing within years
year_comp_fun <- function(df, rule){
  df %>%
    # group by person and item (collapse across age)
    group_by(SID, name, year, long_rule, p_year) %>% 
    summarize(value = fun_call(value, rule)) %>%
    ungroup() %>% 
    mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
}

adrc_out <- adrc_out %>%
  filter(!is.na(value)) %>%
  mutate(comp_rule = ifelse(is.na(comp_rule) | comp_rule == "none", "skip", comp_rule)) %>%
  group_by(comp_rule) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = map2(data, comp_rule, year_comp_fun)) %>%
  unnest(data)

comp_fun <- function(d, rule){
  d %>%
    group_by(SID, name) %>%
    summarize(value = fun_call(value, rule)) %>%
    ungroup()
}

adrc_out <- adrc_out %>%
  filter(name == "dementia") %>%
  mutate(group = ifelse(year <= p_year, "past", "future")) %>%
  group_by(SID, name, group, p_year) %>%
  summarize(value = max(value, na.rm = T)) %>%
  ungroup() %>%
  mutate(value = ifelse(is.infinite(value), NA, value)) %>%
  pivot_wider(names_from = group, values_from = value) %>%
  group_by(SID, name) %>%
  mutate(value = ifelse(is.na(past) | (past == 0 & !is.na(future)), future,
                 ifelse(past == 0 & is.na(future), past, 
                 ifelse(past == 1, NA, NA)))) %>%
  ungroup() %>%
  full_join(
    adrc_out %>%
      filter(name != "dementia" & !is.na(value) & year >= p_year) %>% 
      group_by(long_rule, p_year) %>%
      nest() %>%
      ungroup() %>% 
      mutate(data = map2(data, long_rule, comp_fun)) %>%
      select(-long_rule) %>%
      unnest(data) %>%
      mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
  ) %>%
  left_join(adrc_out_waves)

adrc_out

## # A tibble: 2,575 × 8
##      SID name     p_year future  past  `NA` value o_year
##    <dbl> <chr>     <dbl>  <dbl> <dbl> <dbl> <dbl>  <dbl>
##  1  1070 dementia   2003      1     0    NA     1   2019
##  2  1088 dementia   2005      1     1    NA    NA   2015
##  3  1380 dementia   2003      1     0    NA     1   2013
##  4 10018 dementia   2005      0     0    NA     0   2018
##  5 10034 dementia   2004      0     0    NA     0   2020
##  6 10037 dementia   2006      1     0    NA     1   2019
##  7 10038 dementia   2003      1     0    NA     1   2016
##  8 10045 dementia   2003      1     1    NA    NA   2018
##  9 10054 dementia   2008      0     0    NA     0   2012
## 10 10064 dementia   2003      1     0    NA     1   2018
## # ℹ 2,565 more rows

2.5.6 Cognition Variables

# composite within years
adrc_cog <- adrc_recode %>%
  filter(category == "cognition") %>%
  select(-category) %>%
  unnest(data) %>%
  filter(!is.na(p_year)) %>%
  filter(!is.na(value)) %>%
  group_by(name, itemname, year) %>%
  mutate(value = pomp(value)) %>%
  group_by(SID, name, itemname) %>%
  filter(year %in% (p_year - 1):(p_year + 1)) %>%
  group_by(name, SID) %>%
  summarize(value = mean(value, na.rm = T)) %>%
  ungroup() %>%
  pivot_wider(names_from = "name"
              , values_from = "value")

adrc_cog

## # A tibble: 1,023 × 2
##      SID cognition
##    <dbl>     <dbl>
##  1  1070      4.62
##  2  1088      3.40
##  3  1380      2.36
##  4 10018      5.59
##  5 10034      5.30
##  6 10037      4.64
##  7 10038      6.60
##  8 10045      8.52
##  9 10054      2.41
## 10 10064      7.36
## # ℹ 1,013 more rows

2.5.7 Combine Data

adrc_combined <- adrc_pers %>% 
  select(SID, Trait = name, p_value = value, p_year) %>%
  full_join(
    adrc_out %>% 
      select(SID, o_year, Outcome = name, o_value = value)
    ) %>%
  filter(!is.na(o_value) & !is.na(p_value)) %>%
  distinct() %>%
  left_join(adrc_cov) %>%
  left_join(
    adrc_out %>%
      filter(name == "dementia") %>%
      mutate(value = ifelse(rowSums(cbind(future, past), na.rm = T) >= 1, 1, 0)) %>%
      select(-future, -past) %>%
      pivot_wider(names_from = "name", values_from = "value")
    ) %>%
  left_join(adrc_cog) %>%
  left_join(adrc_out_waves %>% select(SID, Outcome = name, o_year))

adrc_combined

## # A tibble: 11,145 × 23
##      SID Trait p_value p_year o_year Outcome       o_value gender yearBrth alcohol cancer diabetes education
##    <dbl> <chr>   <dbl>  <dbl>  <dbl> <chr>           <dbl>  <dbl>    <dbl>   <dbl>  <dbl>    <dbl>     <dbl>
##  1  1070 A        4.08   2003   2019 dementia            1      0     1918       1      1        0        17
##  2  1070 C        4.83   2003   2019 dementia            1      0     1918       1      1        0        17
##  3  1070 E        4.08   2003   2019 dementia            1      0     1918       1      1        0        17
##  4  1070 N        1.08   2003   2019 dementia            1      0     1918       1      1        0        17
##  5  1070 O        3.17   2003   2019 dementia            1      0     1918       1      1        0        17
##  6  1088 A        4.5    2005   2015 angiopathy          0      0     1916       1      1        0        16
##  7  1088 A        4.5    2005   2015 arterioloscl…       1      0     1916       1      1        0        16
##  8  1088 A        4.5    2005   2015 atherosclero…       2      0     1916       1      1        0        16
##  9  1088 A        4.5    2005   2015 braak               3      0     1916       1      1        0        16
## 10  1088 A        4.5    2005   2015 hipSclerosis        0      0     1916       1      1        0        16
## # ℹ 11,135 more rows
## # ℹ 10 more variables: heartProb <dbl>, married <dbl>, race <dbl>, smokes <dbl>, stroke <dbl>,
## #   weight <dbl>, age <dbl>, `NA` <dbl>, dementia <dbl>, cognition <dbl>

save(adrc_cov, adrc_alpha, adrc_pers, adrc_out, adrc_combined, adrc_cog,
     file = sprintf("%s/data/clean/adrc_cleaned.RData", load_path))

rm(list =ls()[grepl("adrc", ls())])

2.6 Einstein Aging Study

The Einstein Aging Study (EAS) is an ongoing longitudinal study of the aging brain. The EAS began in 1980 and has enrolled more than 2,600 participants since then. Data are available through application at http://www.einstein.yu.edu/departments/neurology/clinical-research-program/eas/data-sharing.aspx.

Since 1993, the EAS has systematically recruited a representative aging sample in the Bronx, New York, As of 2017, 2,600 participants were enrolled in the study. As of 2010, approximately 200 of the enrolled participants had autopsy data. More information on the study can be found at http://www.einstein.yu.edu/departments/neurology/clinical-research-program/EAS/.

Sample sizes vary over time, with ranges across waves not publically available. However, we suspect approximately 2,000 participants to have basic personality and dementia diagnoses, with between 150 and 300 participants having full autopsy data collected after personality was introduced into the study. This yields 99% power to detect a zero-order correlation effect size of .10 and .24, respectively, two-tailed at alpha .05.

2.6.1 Load Data

(eas_codebook <- (codebook %>% filter(study == "EAS"))$codebook[[1]])

## # A tibble: 78 × 15
##    study dataset  category   name   itemname year  orig_itemname description scale reverse_code recode  mini
##    <chr> <chr>    <chr>      <chr>  <chr>    <chr> <chr>         <chr>       <chr> <chr>        <chr>  <dbl>
##  1 EAS   behavior cognition  cogni… waisBlk… long… Blockraw      "WAIS III:… "int… <NA>         ifels…    NA
##  2 EAS   behavior cognition  cogni… trailma… long… Tr-A1         "Other Wor… "int… <NA>         ifels…    NA
##  3 EAS   behavior cognition  cogni… trailma… long… Tr-B1         "Other Wor… "int… <NA>         ifels…    NA
##  4 EAS   behavior cognition  cogni… digitSym long… Symraw        "Digit Sym… "int… <NA>         ifels…    NA
##  5 EAS   behavior cognition  cogni… recall   long… TotRecall     "total Rec… "int… <NA>         ifels…    NA
##  6 EAS   behavior cognition  cogni… SPN      long… Spnraw        "Digit Spa… "int… <NA>         ifels…    NA
##  7 EAS   behavior cognition  cogni… CAT      long… CAT            <NA>       "int… <NA>         ifels…    NA
##  8 EAS   behavior cognition  cogni… FAS      long… FAS            <NA>       "int… <NA>         ifels…    NA
##  9 EAS   behavior covariates alcoh… alcohol… base… SAB262        "What was … "1=N… no           ifels…    NA
## 10 EAS   behavior covariates alcoh… alcohol… base… SAB263        "What was … "1=N… no           ifels…    NA
## # ℹ 68 more rows
## # ℹ 3 more variables: maxi <dbl>, comp_rule <chr>, long_rule <chr>

old.names1 <- unique((eas_codebook %>% filter(dataset == "behavior"))$orig_itemname)
old.names2 <- unique((eas_codebook %>% filter(dataset == "neuropath"))$orig_itemname)
old.names3 <- unique((eas_codebook %>% filter(dataset == "activity"))$orig_itemname)
eas <- sprintf("%s/eas/Behavior_with_Master_Data_2021_10_25.xlsx", data_path) %>% 
  read_excel(., sheet = 1) %>% 
  mutate(year = lubridate::year(BehaviorDate)) %>%
  select(SID = Id, wave = Wave, year, one_of(old.names1)) %>%
  full_join(
    sprintf("%s/eas/Neuropath_and_Behavior_Data_2021_09_26 (3).xlsx", data_path) %>% 
      read_excel(.) %>% 
      select(SID = `Clin#`, year = DOD, wave = Wave, one_of(old.names2)) %>%
      mutate(comb_dx = ifelse(c("VaD", "AD", "AGD") %in% Dx1 | c("VaD", "AD", "AGD") %in% Dx2 | 
                                c("VaD", "AD", "AGD") %in% Dx3 | grepl("VaD", `OTHER Dx`) | 
                                grepl("AD", `OTHER Dx`) | grepl("AGD", `OTHER Dx`), 1, 0)
             , year = lubridate::year(year)) %>%
      select(-(Dx1:`OTHER Dx`))
  ) %>% 
  full_join(
    sprintf("%s/eas/Northwestern_supp_Physical_Activities_2021_10_25-1.xlsx", data_path) %>% 
      read_excel(.) %>% 
      select(SID = Id, wave = Wave, one_of(old.names3))
  ) %>% 
  mutate(Gender = ifelse(Gender == "F", 1, ifelse(Gender == "M", 0, NA)))

eas_waves <- eas %>% select(SID, wave, year) %>% distinct()

eas_long <- eas %>%
  pivot_longer(values_to = "value"
               , names_to = "orig_itemname"
               , cols = c(-SID, -wave, -year))

2.6.2 Recode & Reverse-Scoring

eas_recode <- eas_codebook %>%
  filter(category %in%  c("covariates", "outcome", "cognition", "pers") & !is.na(orig_itemname)) %>%
  select(category, name, itemname, orig_itemname, reverse_code:long_rule) %>%
  right_join(eas_long) 

# recode 

recode_fun <- function(rule, y, year){
  print(rule)
  x <- y$value
  if(!is.na(rule)){y$value <- eval(parse(text = rule))}
  return(y)
}

eas_recode <- eas_recode %>%
  group_by(recode, year) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = pmap(list(recode, data, year), recode_fun)) %>%
  unnest(data)

# reverse code 
eas_recode <- eas_recode %>%
  mutate(value = ifelse(reverse_code == "no" | is.na(reverse_code), value, 
                        reverse.code(-1, value, mini = mini, maxi = maxi))) %>%
  group_by(category, name) %>%
  nest() %>% 
  ungroup()

fun_call <- function(x, rule){
    switch(rule,
           average = mean(x, na.rm = T),
           mode = Mode(x)[1],
           sum = sum(x, na.rm = T),
           skip = unique(x)[1],
           select = unique(x)[1],
           max = max(x, na.rm = T),
           min = min(x, na.rm = T))
}

eas_p_waves <- eas_recode %>% 
  filter(category == "pers") %>% 
  unnest(data) %>%
  filter(!is.na(value)) %>%
  group_by(SID) %>%
  filter(year == min(year)) %>%
  ungroup() %>%
  select(SID, p_year = year) %>%
  distinct()

2.6.3 Covariates

# compositing within years
year_comp_fun <- function(df, rule){
  df %>%
    # group by person and item (collapse across age)
    group_by(SID, name, year, p_year, long_rule) %>% 
    summarize(value = fun_call(value, rule)) %>%
    ungroup() %>% 
    mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
}

eas_cov <- eas_recode %>%
  filter(category == "covariates") %>%
  unnest(data) %>%
  left_join(eas_p_waves) %>%
  filter(year <= p_year) %>%
  mutate(comp_rule = ifelse(is.na(comp_rule) | comp_rule == "none", "skip", comp_rule)) %>%
  group_by(comp_rule) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = map2(data, comp_rule, year_comp_fun)) %>%
  unnest(data) %>%
  select(-comp_rule)

comp_fun <- function(d, rule){
  d %>%
    group_by(SID, name) %>%
    summarize(value = fun_call(value, rule)) %>%
    ungroup()
}

eas_cov <- eas_cov %>%
  filter(!is.na(value) & !is.na(name)) %>%
  group_by(long_rule) %>%
  nest() %>%
  ungroup() %>% 
  mutate(data = map2(data, long_rule, comp_fun)) %>%
  unnest(data) %>% 
  select(-long_rule) %>%
  mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value)) %>%
  pivot_wider(names_from = name, values_from = value, values_fn = list(value = max)) %>%
  right_join(eas_p_waves)

eas_cov

## # A tibble: 799 × 17
##      SID alcohol cancer diabetes heartProb hypertension married  race smokes stroke exercise   BMI   age
##    <dbl>   <dbl>  <dbl>    <dbl>     <dbl>        <dbl>   <dbl> <dbl>  <dbl>  <dbl>    <dbl> <dbl> <dbl>
##  1  6075       0      0        0         0            0       1     0      0      0       NA  21.1  73.4
##  2  8024       1      0        0         0            0       1     0      1      0       NA  28.8  94.8
##  3  8027       1      0        0         0            0       1     0      0      0       NA  20.4  93.0
##  4  8265       1      1        1         0            1       1     0      1      0       NA  28.5  77.3
##  5  8291       0      0        0         0            0       1     0      0      0       NA  24    81.7
##  6  8296       1      0        0         0            0       1     0      0      0       NA  23    88.9
##  7  8310       1      0        1         0            0       1     0      1      0       NA  29.4  86.0
##  8  8313       1      0        0         0            0       1     1      1      0       NA  NA    87.3
##  9  8375       1      1        0         0           NA       1     0      0      0       NA  26.3  87.2
## 10  8512       1      0        0         0            1       1     1      1      0       NA  NA    78.9
## # ℹ 789 more rows
## # ℹ 4 more variables: education <dbl>, SRhealth <dbl>, gender <dbl>, p_year <dbl>

2.6.4 Personality Variables

# bring in codebook info
eas_pers <- eas_recode %>% 
  filter(category == "pers") %>%
  unnest(data) %>%
  left_join(eas_p_waves) %>% 
  filter(year == p_year)

# create composites
eas_pers <- eas_pers %>%
  group_by(SID, name, p_year) %>%
  summarize(value = mean(value, na.rm = T))  %>%
  ungroup()

eas_pers

## # A tibble: 3,995 × 4
##      SID name  p_year value
##    <dbl> <chr>  <dbl> <dbl>
##  1  6075 A       2016   3.6
##  2  6075 C       2016   3.4
##  3  6075 E       2016   2.5
##  4  6075 N       2016   2.8
##  5  6075 O       2016   3.1
##  6  8024 A       2006   3.7
##  7  8024 C       2006   4.1
##  8  8024 E       2006   2.7
##  9  8024 N       2006   2.7
## 10  8024 O       2006   2.9
## # ℹ 3,985 more rows

2.6.5 Outcome Variables

eas_out <- eas_recode %>%
  filter(category == "outcome") %>%
  unnest(data)

eas_out_waves <- eas_out %>%
  filter(!is.na(value)) %>%
  select(year, SID, name) %>%
  group_by(SID, name) %>%
  summarize(o_year = max(year)) %>% 
  ungroup()

# compositing within years
year_comp_fun <- function(df, rule){
  df %>%
    # group by person and item (collapse across age)
    group_by(SID, name, year, long_rule, p_year) %>% 
    summarize(value = fun_call(value, rule)) %>%
    ungroup() %>% 
    mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
}

eas_out <- eas_out %>%
  left_join(eas_p_waves) %>%
  filter(!is.na(value)) %>%
  mutate(comp_rule = ifelse(is.na(comp_rule) | comp_rule == "none", "skip", comp_rule)) %>%
  group_by(comp_rule) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = map2(data, comp_rule, year_comp_fun)) %>%
  unnest(data)

eas_out <- eas_out %>%
  filter(name == "dementia") %>%
  mutate(group = ifelse(year <= p_year, "past", "future")) %>%
  group_by(SID, name, group, p_year) %>%
  summarize(value = max(value, na.rm = T)) %>%
  ungroup() %>%
  mutate(value = ifelse(is.infinite(value), NA, value)) %>%
  pivot_wider(names_from = group, values_from = value) %>%
  group_by(SID, name) %>%
  mutate(value = ifelse(is.na(past) | (past == 0 & !is.na(future)), future,
                 ifelse(past == 0 & is.na(future), past, 
                 ifelse(past == 1, NA, NA)))) %>%
  ungroup() %>%
  full_join(eas_out %>%
    filter(name != "dementia") %>%
    group_by(SID, name, long_rule, p_year) %>%
    summarize(value = max(value, na.rm = T)) %>%
    ungroup() %>%
    select(-long_rule) %>%
    mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
  ) %>%
  left_join(eas_out_waves) %>%
  filter(!is.na(value))

eas_out

## # A tibble: 878 × 8
##      SID name     p_year  past future  `NA` value o_year
##    <dbl> <chr>     <dbl> <dbl>  <dbl> <dbl> <dbl>  <dbl>
##  1  8024 dementia   2006     0     NA    NA     0   2006
##  2  8027 dementia   2009     0      1    NA     1   2017
##  3  8265 dementia   2006     0      0    NA     0   2011
##  4  8291 dementia   2009     0      0    NA     0   2012
##  5  8296 dementia   2006     0     NA    NA     0   2006
##  6  8310 dementia   2007     0     NA    NA     0   2007
##  7  8313 dementia   2007     0     NA    NA     0   2007
##  8  8375 dementia   2006     0      0    NA     0   2010
##  9  8512 dementia   2010     0      0    NA     0   2013
## 10  8518 dementia   2011     0     NA    NA     0   2011
## # ℹ 868 more rows

2.6.6 Cognition Variables

# composite within years
eas_cog <- eas_recode %>%
  filter(category == "cognition") %>%
  select(-category) %>%
  unnest(data) %>%
  left_join(eas_p_waves) %>%
  filter(!is.na(p_year) & !is.na(value)) %>%
  group_by(name, itemname, year) %>%
  mutate(value = pomp(value)) %>%
  group_by(SID, name, itemname) %>%
  filter(year %in% (p_year - 1):(p_year + 1)) %>%
  group_by(name, SID) %>%
  summarize(value = mean(value, na.rm = T)) %>%
  ungroup() %>%
  pivot_wider(names_from = "name"
              , values_from = "value")

eas_cog

## # A tibble: 799 × 2
##      SID cognition
##    <dbl>     <dbl>
##  1  6075      5.78
##  2  8024      4.83
##  3  8027      4.55
##  4  8265      5.60
##  5  8291      4.64
##  6  8296      5.20
##  7  8310      4.16
##  8  8313      4.37
##  9  8375      6.10
## 10  8512      3.62
## # ℹ 789 more rows

2.6.7 Combine Data

eas_combined <- eas_pers %>% 
  select(SID, Trait = name, p_value = value, p_year) %>%
  full_join(
    eas_out %>% 
      select(SID, o_year, Outcome = name, o_value = value)
    ) %>%
  filter(!is.na(o_value) & !is.na(p_value)) %>%
  distinct() %>%
  left_join(
    eas_out %>%
      filter(name == "dementia") %>%
      mutate(value = ifelse(rowSums(cbind(future, past), na.rm = T) >= 1, 1, 0)) %>%
      select(-future, -past) %>%
      pivot_wider(names_from = "name", values_from = "value")
    ) %>%
  left_join(eas_cov) %>%
  left_join(eas_cog)

eas_combined

## # A tibble: 4,320 × 25
##      SID Trait p_value p_year o_year Outcome      o_value  `NA` dementia alcohol cancer diabetes heartProb
##    <dbl> <chr>   <dbl>  <dbl>  <dbl> <chr>          <dbl> <dbl>    <dbl>   <dbl>  <dbl>    <dbl>     <dbl>
##  1  8024 A         3.7   2006   2006 dementia         0      NA        0       1      0        0         0
##  2  8024 C         4.1   2006   2006 dementia         0      NA        0       1      0        0         0
##  3  8024 E         2.7   2006   2006 dementia         0      NA        0       1      0        0         0
##  4  8024 N         2.7   2006   2006 dementia         0      NA        0       1      0        0         0
##  5  8024 O         2.9   2006   2006 dementia         0      NA        0       1      0        0         0
##  6  8027 A         3.2   2009   2017 dementia         1      NA        1       1      0        0         0
##  7  8027 A         3.2   2009   2017 braak            5.5    NA        1       1      0        0         0
##  8  8027 A         3.2   2009   2017 hipSclerosis     0      NA        1       1      0        0         0
##  9  8027 A         3.2   2009   2017 lewyBodyDis      1      NA        1       1      0        0         0
## 10  8027 C         3.4   2009   2017 dementia         1      NA        1       1      0        0         0
## # ℹ 4,310 more rows
## # ℹ 12 more variables: hypertension <dbl>, married <dbl>, race <dbl>, smokes <dbl>, stroke <dbl>,
## #   exercise <dbl>, BMI <dbl>, age <dbl>, education <dbl>, SRhealth <dbl>, gender <dbl>, cognition <dbl>

save(eas_cov, eas_pers, eas_out, eas_combined, eas_cog,
     file = sprintf("%s/data/clean/eas_cleaned.RData", load_path))

rm(list =ls()[grepl("eas", ls())])

2.7 German Socioeconomic Panel Study (GSOEP)

The German Socioeconomic Panel Study (GSOEP; Socio-Economic Panel, 2017) is an ongoing longitudinal study of German collected by the German Institute of Economic Research (DIW Berlin). The data are freely available at https://www.diw.de/soep by application.

Data have been collected annually since 1984 (the latest data release includes data up to 2017). Participants have been recruited from more than 11,000 households, which are nationally representative of private German households. 20,000 individuals are sampled each year, on average. It is critical to note that the GSOEP samples households, not individuals, and the households consist of individuals living in both the “old” and “new” federal states (the former West and East Germany), foreigners, and recent immigrants to Germany.

Sample size varies by year, ranging from approximately 10,000 (1989) to 31,000 (2013). This provides 99% power to detect a zero-order correlation effect size of ~.06, two-tailed at alpha < .05.

2.7.1 Load Data

gsoep_read_fun <- function(Year, WL){
  old.names <- (gsoep_codebook %>% filter(year == Year | category == "proc"))$orig_itemname 
  p <- sprintf("%s/gsoep/%sp.sav", data_path, WL) %>% haven::read_sav(.) %>%
    full_join(sprintf("%s/gsoep/%skind.sav", data_path, WL) %>% haven::read_sav(.)) %>%
    full_join(sprintf("%s/gsoep/%spequiv.sav", data_path, WL) %>% haven::read_sav(.)) %>%
    full_join(sprintf("%s/gsoep/%spgen.sav", data_path, WL) %>% haven::read_sav(.)) %>%
    full_join(sprintf("%s/gsoep/%spkal.sav", data_path, WL) %>% haven::read_sav(.)) %>%
    select(one_of(old.names)) %>%
    gather(key = orig_itemname, value = value, -persnr, -hhnr, na.rm = T)
  
 sprintf("%s/gsoep/%shbrutto.sav", data_path, WL) %>% haven::read_sav(.) %>%
    full_join(sprintf("%s/gsoep/%sh.sav", data_path, WL) %>% haven::read_sav(.)) %>%
    select(one_of(old.names)) %>%
    gather(key = orig_itemname, value = value, -hhnr, na.rm = T) %>%
    full_join(p %>% select(persnr, hhnr) %>% distinct()) %>%
    full_join(p) 
}

gsoep_codebook <- (codebook %>% filter(study == "GSOEP"))$codebook[[1]] %>%
  mutate(orig_itemname = str_to_lower(orig_itemname))
gsoep_codebook

## # A tibble: 594 × 17
##    study dataset category name  itemname  wave waveletter  year orig_itemname description scale reverse_code
##    <chr> <chr>   <chr>    <chr> <chr>    <dbl> <chr>      <dbl> <chr>         <chr>       <chr> <chr>       
##  1 gosep cognit  cogniti… cogn… animals1    23 w           2006 f99z30r       Symbol Dig… inte… no          
##  2 gosep cognit  cogniti… cogn… animals1    29 bc          2012 f99z30r       Symbol Dig… inte… no          
##  3 gosep cognit  cogniti… cogn… animals2    23 w           2006 f99z60r       Symbol Dig… inte… no          
##  4 gosep cognit  cogniti… cogn… animals2    29 bc          2012 f99z60r       Symbol Dig… inte… no          
##  5 gosep cognit  cogniti… cogn… animals3    23 w           2006 f99z90r       Symbol Dig… inte… no          
##  6 gosep cognit  cogniti… cogn… animals3    29 bc          2012 f99z90r       Symbol Dig… inte… no          
##  7 gosep cognit  cogniti… cogn… symDig1     23 w           2006 f96t30g       Symbol Dig… inte… no          
##  8 gosep cognit  cogniti… cogn… symDig1     29 bc          2012 f96t30g       Symbol Dig… inte… no          
##  9 gosep cognit  cogniti… cogn… symDig2     23 w           2006 f96t60g       Symbol Dig… inte… no          
## 10 gosep cognit  cogniti… cogn… symDig2     29 bc          2012 f96t60g       Symbol Dig… inte… no          
## # ℹ 584 more rows
## # ℹ 5 more variables: recode <chr>, mini <dbl>, maxi <dbl>, comp_rule <chr>, long_rule <chr>

gsoep <- gsoep_codebook %>% 
  select(wave, waveletter, year) %>%
  filter(complete.cases(.)) %>%
  distinct() %>%
  arrange(year) %>%
  filter(year != "2018") %>%
  mutate(data = map2(year, waveletter, gsoep_read_fun)) 

old.names <- unique(gsoep_codebook$orig_itemname)
gsoep_cog <- sprintf("%s/gsoep/cognit.sav", data_path) %>% haven::read_sav(.) %>%
  select(persnr, hhnr, one_of(old.names)) %>%
  haven::zap_labels(.) %>%
  select(-hhnr) %>%
  gather(key = orig_itemname, value = value, -persnr, na.rm = T)

gsoep_long <- gsoep %>% unnest(data) %>%
  select(-hhnr, -wave, -waveletter, -year) %>%
  # filter(persnr %in% gsoep_cog_subs) %>%
  full_join(gsoep_cog) %>%
  rename(SID = persnr)

save(gsoep, file = sprintf("%s/data/clean/gsoep_raw.RData", load_path))
rm(gsoep)

2.7.2 Recoding & Reverse Scoring

gsoep_waves <- p_waves %>% filter(Study == "GSOEP") %>% select(Used) %>% distinct()

# join data with recoding info 
gsoep_recode <- gsoep_codebook %>%
  filter(category %in% c("pers", "outcome", "covariates", "cognition")) %>%
  select(category, name, itemname, wave, year, orig_itemname, reverse_code:long_rule) %>%
  group_by(category, name) %>% 
  nest() %>%
  ungroup() %>%
  mutate(data = map(data, ~(.) %>% left_join(gsoep_long))) 

# recode 
recode_fun <- function(rule, y, year){
  x <- y$value
  if(!is.na(rule)){y$value <- eval(parse(text = rule))}
  return(y)
}

gsoep_recode <- gsoep_recode %>% 
  mutate(data = map(data, ~(.) %>% 
    group_by(recode, year) %>%
    nest() %>%
    ungroup() %>%
    mutate(data = pmap(list(recode, data, year), recode_fun)) %>%
    unnest(data) %>%
    mutate(value = ifelse(value < 0 | is.nan(value) | is.infinite(value), NA, value))))

# reverse code 
gsoep_recode <- gsoep_recode %>%
  mutate(data = map(data, ~(.) %>%
    mutate(value = ifelse(reverse_code == "no" | is.na(reverse_code), value, 
                        reverse.code(-1, value, mini = mini, maxi = maxi)))))

fun_call <- function(x, rule){
    switch(rule,
           average = mean(x, na.rm = T),
           mode = Mode(x)[1],
           sum = sum(x, na.rm = T),
           skip = unique(x)[1],
           select = unique(x)[1],
           max = max(x, na.rm = T),
           min = min(x, na.rm = T))
  }

2.7.3 Covariates

load(sprintf("%s/data/clean/gsoep_cleaned.RData", local_path))

yrBrth <- gsoep_recode %>% 
  filter(name == "yearBrth") %>% 
  unnest(data) %>%
  group_by(SID) %>% 
  summarize(yearBrth = max(value, na.rm = T)) %>%
  ungroup() %>%
  mutate(yearBrth = ifelse(is.infinite(yearBrth), NA, yearBrth),
         yearBrth = ifelse(yearBrth < 1000, yearBrth + 1000, yearBrth)) 

# compositing within years
year_comp_fun <- function(df, rule, name){
  print(paste(rule, name))
  df %>%
    group_by(SID, yearBrth, year, long_rule) %>% # group by person and item (collapse across age)
    summarize(value = fun_call(value, rule)) %>%
    ungroup()
}

gsoep_waves <- p_waves %>% filter(Study == "GSOEP") %>% select(Used) %>% distinct()

gsoep_cov <- gsoep_recode %>%
  filter(category == "covariates") %>%
  mutate(data = ifelse(name == "alcohol", map(data, ~(.) %>% mutate(year = ifelse(year == 2006, 2005, year))), data)) %>%
  mutate(data = map(data, ~(.) %>%
    left_join(yrBrth) %>%
    filter(year <= max(gsoep_waves$Used) & !is.na(value)) %>%
    group_by(comp_rule) %>%
    nest() %>%
    ungroup() %>%
    mutate(comp_rule = ifelse(is.na(comp_rule), "skip", comp_rule)))) %>%
  filter(map(data, nrow) > 0) %>%
  unnest(data) %>%
  mutate(data = pmap(list(data, comp_rule, name), year_comp_fun)) %>%
  unnest(data)

comp_fun <- function(rule, p_year){
  gsoep_cov %>%
    filter(year <= p_year  & long_rule == rule) %>%
    group_by(SID, yearBrth, name) %>%
    summarize(value = fun_call(value, rule)) %>%
    ungroup()
}

gsoep_cov <- crossing(
  p_year = gsoep_waves$Used, 
  long_rule = unique(gsoep_cov$long_rule)
  ) %>%
  mutate(data = map2(long_rule, p_year, comp_fun)) %>%
  unnest(data) %>%
  mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value)) %>%
  select(-long_rule, -yearBrth) %>%
  pivot_wider(names_from = name, values_from = value) %>%
  mutate(yearBrth = ifelse(yearBrth < 1000, yearBrth + 1000, yearBrth))

gsoep_cov

## # A tibble: 114,106 × 16
##    p_year   SID exercise height weight cancer diabetes married stroke alcohol education smokes gender
##     <dbl> <dbl>    <dbl>  <dbl>  <dbl>  <dbl>    <dbl>   <dbl>  <dbl>   <dbl>     <dbl>  <dbl>  <dbl>
##  1   2005   101     1.67     NA     NA      0        0       1      0      NA        NA     NA      0
##  2   2005   102     1        NA     NA      0        0       1      0      NA        NA     NA      1
##  3   2005   103     2.67     NA     NA      0        0       0      0      NA        NA     NA      0
##  4   2005   201     2       157     61      0        0       0      0       1        11      0      1
##  5   2005   202     3        NA     NA      0        0       0      0      NA        NA     NA      1
##  6   2005   203     4       177     77      0        0       0      0       1        NA      0      0
##  7   2005   301     1        NA     NA      0        0       1      0      NA        NA     NA      0
##  8   2005   302     1        NA     NA      0        0       1      0      NA        NA     NA      1
##  9   2005   401     1        NA     NA      0        0       0      0      NA        NA     NA      0
## 10   2005   501     1        NA     NA      0        0       0      0      NA        NA     NA      1
## # ℹ 114,096 more rows
## # ℹ 3 more variables: yearBrth <dbl>, age <dbl>, SRhealth <dbl>

2.7.4 Personality Variables

gsoep_pers <- gsoep_recode %>%
  filter(category == "pers") %>%
  unnest(data) %>%
  filter(year %in% gsoep_waves$Used) %>%
  distinct()

# alpha's
gsoep_alpha <- gsoep_pers %>%
  select(name, itemname, year, SID, value) %>%
  group_by(name, year) %>%
  nest() %>%
  mutate(data = map(data, ~(.) %>% pivot_wider(names_from = itemname, values_from = value)),
         alpha = map(data, possibly(~psych::alpha((.) %>% select(-persnr)), NA_real_)))

comp_fun <- function(df, rule){
  df %>%
    group_by(SID) %>%
    summarize(value = fun_call(value, rule)) %>%
    ungroup()
}

# create composites
gsoep_pers <- gsoep_pers %>%
  group_by(name, comp_rule, year) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = map2(data, comp_rule, comp_fun)) %>%
  unnest(data) %>% 
  select(-comp_rule) %>%
  mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))

gsoep_pers

## # A tibble: 191,511 × 4
##    name   year   SID value
##    <chr> <dbl> <dbl> <dbl>
##  1 A      2005   201  7   
##  2 A      2005   203  4.67
##  3 A      2005   602  4.33
##  4 A      2005   901  4.67
##  5 A      2005  1202  7   
##  6 A      2005  1501  4.67
##  7 A      2005  1601  4   
##  8 A      2005  1602  6.67
##  9 A      2005  1603  4.33
## 10 A      2005  1701  7   
## # ℹ 191,501 more rows

2.7.5 Outcome Variables

gsoep_pers_subs <- unique(gsoep_pers$SID)
gsoep_waves <- p_waves %>% filter(Study == "GSOEP") %>% select(Used) %>% distinct()

# composite within years 
# compositing within years
year_comp_fun <- function(df, rule){
  df %>%
    group_by(SID, name, year) %>% # group by person and item (collapse across age)
    summarize(value = fun_call(value, rule)) %>%
    ungroup()
}


gsoep_out <- gsoep_recode %>%
  filter(category == "outcome") %>%
  unnest(data) %>%
  # filter(year <= max(gsoep_waves$Used)) %>%
  group_by(comp_rule) %>%
  nest() %>%
  ungroup() %>%
  mutate(comp_rule = ifelse(comp_rule == "select", "skip", comp_rule),
         data = map2(data, comp_rule, year_comp_fun)) %>%
  unnest(data)

# composite across years
comp_fun <- function(p_year){
  gsoep_out %>%
    group_by(SID, name) %>%
    summarize(value = max(value, na.rm = T)) %>%
    ungroup() %>%
    mutate(value = ifelse(is.infinite(value), NA, value)) 
}

gsoep_out <- tibble(p_year = gsoep_waves$Used) %>%
  mutate(data = map(p_year, comp_fun)) %>%
  unnest(data) %>%
  mutate(o_year = 2017)

gsoep_out

## # A tibble: 109,214 × 5
##    p_year   SID name     value o_year
##     <dbl> <dbl> <chr>    <dbl>  <dbl>
##  1   2005   602 dementia     0   2017
##  2   2005   604 dementia     0   2017
##  3   2005   901 dementia     0   2017
##  4   2005  1501 dementia     0   2017
##  5   2005  1601 dementia     0   2017
##  6   2005  1602 dementia     0   2017
##  7   2005  2301 dementia     0   2017
##  8   2005  2302 dementia     0   2017
##  9   2005  4701 dementia     0   2017
## 10   2005  4901 dementia     0   2017
## # ℹ 109,204 more rows

2.7.6 Cognition Variables

gsoep_cog <- gsoep_recode %>%
  filter(category == "cognition") %>%
  unnest(data) %>% 
  filter(!is.na(value))

gsoep_cog_waves <- gsoep_cog %>%
  select(itemname, SID, year) %>%
  group_by(SID, itemname) %>%
  summarize(o_year = max(year)) %>%
  ungroup()

gsoep_cog <- gsoep_cog %>%
  right_join(gsoep_cog_waves) %>%
  filter(year == o_year) %>%
  filter(!is.na(value)) %>%
  group_by(name, itemname, year) %>%
  mutate(value = pomp(value)) %>%
  group_by(name, o_year, SID) %>%
  summarize(value = mean(value)) %>%
  ungroup() %>%
  pivot_wider(names_from = "name", values_from = "value")

gsoep_cog

## # A tibble: 22,445 × 3
##    o_year   SID cognition
##     <dbl> <dbl>     <dbl>
##  1   2012   201      0   
##  2   2012  1501      3.81
##  3   2012  5601      2.06
##  4   2012  6002      2.46
##  5   2012  7302      1.44
##  6   2012  8603      3.75
##  7   2012  9801      3.88
##  8   2012 11301      3.45
##  9   2012 12303      2.66
## 10   2012 13401      1.98
## # ℹ 22,435 more rows

2.7.7 Combine Data

gsoep_combined <- gsoep_pers %>% 
  rename(Trait = name, p_value = value, p_year = year) %>%
  full_join(gsoep_out %>% select(p_year, SID, Outcome = name, o_year, o_value = value)) %>%
  full_join(gsoep_cov) %>%
  left_join(
    gsoep_out %>%
      filter(name == "dementia") %>%
      pivot_wider(names_from = "name", values_from = "value")
    ) %>%
  full_join(gsoep_cog) %>%
  filter(!is.na(p_value) & !is.na(o_value)) %>%
  mutate(age = p_year - yearBrth
         , BMI = weight/((height/100)^2))

gsoep_combined

## # A tibble: 136,465 × 24
##    Trait p_year   SID p_value Outcome  o_year o_value exercise height weight cancer diabetes married stroke
##    <chr>  <dbl> <dbl>   <dbl> <chr>     <dbl>   <dbl>    <dbl>  <dbl>  <dbl>  <dbl>    <dbl>   <dbl>  <dbl>
##  1 A       2005   602    4.33 dementia   2017       0     2.8    177    87        0        0       1      0
##  2 A       2005   901    4.67 dementia   2017       0     2.8    158    47        0        0       0      0
##  3 A       2005  1501    4.67 dementia   2017       0     1      168.   84        0        0       0      0
##  4 A       2005  1601    4    dementia   2017       0     2.67   174.   75        0        0       1      0
##  5 A       2005  1602    6.67 dementia   2017       0     2      160.   48        0        0       1      0
##  6 A       2005  2301    5    dementia   2017       0     1.8    180    75        0        0       1      0
##  7 A       2005  2302    3.33 dementia   2017       0     3      157    46.5      0        0       1      0
##  8 A       2005  4701    6    dementia   2017       0     2.2    168    70        0        0       0      0
##  9 A       2005  4901    5.33 dementia   2017       0     1      162.   66.5      0        0       0      0
## 10 A       2005  5201    6.33 dementia   2017       0     1.07   183    83        0        0       1      0
## # ℹ 136,455 more rows
## # ℹ 10 more variables: alcohol <dbl>, education <dbl>, smokes <dbl>, gender <dbl>, yearBrth <dbl>,
## #   age <dbl>, SRhealth <dbl>, dementia <dbl>, cognition <dbl>, BMI <dbl>

save(gsoep_cov, gsoep_alpha, gsoep_pers, gsoep_out, gsoep_combined, gsoep_cog,
     file = sprintf("%s/data/clean/gsoep_cleaned.RData", load_path))

rm(list =ls()[grepl("gsoep", ls())])

2.8 The Longitudinal Studies for the Social sciences (LISS)

The Longitudinal Studies for the Social sciences (LISS; Scherpenzeel, Das, Ester, & Kaczmirek, 2010) is an ongoing longitudinal study of households in the Netherlands. These data are online, through application, from https://statements.centerdata.nl/liss-panel-data-statement.

Participants were approximately 8,000 Dutch-speaking individuals permanently residing in the Netherlands from 5,000 households. Data have been collected annually since 2007. The latest data release includes 11 waves of data from 2008 to 2018. More documentation are available at https://www.dataarchive.lissdata.nl/study_units/view/1.

Sample sizes vary by year, ranging from 5,021 (2018) to 6808 (2008). This provides 99/% power to detect a correlation effect size of ~.04, two-tailed at alpha .05.

2.8.1 Load Data

liss_read_fun <- function(x){
  sprintf("%s/liss/%s", data_path, x) %>% haven::read_sav(.) %>% select(one_of(old.names))
}

liss_codebook <- (codebook %>% filter(study == "LISS"))$codebook[[1]]
liss_codebook

## # A tibble: 1,327 × 17
##    study dataset    category   name      itemname  wave wave_letter  year orig_itemname description    scale
##    <chr> <chr>      <chr>      <chr>     <chr>    <dbl> <chr>       <dbl> <chr>         <chr>          <chr>
##  1 liss  ai07a      cognition  cognition BSI          1 a            2008 ai07a031      BSI problems … "1\t…
##  2 liss  ai08b      cognition  cognition BSI          2 b            2009 ai08b031      BSI problems … "1\t…
##  3 liss  ai08c      cognition  cognition BSI          3 c            2010 ai08c031      BSI problems … "1\t…
##  4 liss  ai08d      cognition  cognition BSI          4 d            2011 ai08d031      BSI problems … "1\t…
##  5 liss  avars_2008 covariates yearBrth  YOB1         1 a            2008 gebjaar       Year of Birth  "num…
##  6 liss  avars_2009 covariates yearBrth  YOB1         2 b            2009 gebjaar       Year of Birth  "num…
##  7 liss  avars_2010 covariates yearBrth  YOB1         3 c            2010 gebjaar       Year of Birth  "num…
##  8 liss  avars_2011 covariates yearBrth  YOB1         4 d            2011 gebjaar       Year of Birth  "num…
##  9 liss  avars_2012 covariates yearBrth  YOB1         5 e            2012 gebjaar       Year of Birth  "num…
## 10 liss  avars_2013 covariates yearBrth  YOB1         6 f            2013 gebjaar       Year of Birth  "num…
## # ℹ 1,317 more rows
## # ℹ 6 more variables: reverse_code <chr>, recode <chr>, mini <dbl>, maxi <dbl>, comp_rule <chr>,
## #   long_rule <chr>

old.names <- unique(liss_codebook$orig_itemname) %>% str_to_lower
datasets <- sprintf("%s/liss", data_path) %>% list.files()
liss <- tibble(datasets = datasets) %>%
  mutate(data = map(datasets, liss_read_fun)) 

liss <- reduce(liss$data, full_join) %>% haven::zap_labels(.)
save(liss, file = sprintf("%s/data/clean/liss_raw.RData", load_path))

avars <- tibble(ds = datasets[grepl("avar", datasets)]) %>%
  mutate(data = map(ds, ~sprintf("%s/liss/%s", data_path, .) %>% 
                      haven::read_sav(.) %>% 
                      select(one_of(old.names)) %>% 
                      haven::zap_labels(.))) %>%
  separate(ds, c("ds", "year", "scrap1", "scrap2"), sep = "_") %>%
  separate(year, c("year", "month"), -2) %>%
  select(year, month, data) %>% 
  unnest(data)

2.8.2 Recoding & Reverse-Scoring

rename_fun <- function(cb, var){
  print(var)
  old.names <- unique((liss_codebook %>% filter(name == var))$orig_itemname)
  df <- liss %>% 
    select(SID = nomem_encr, HHID = nohouse_encr, one_of(old.names)) %>%
    gather(key = orig_itemname, value = value, 
           -SID, -HHID, na.rm=T)
  if(length(old.names) > 1){
      df <- df %>% left_join(cb %>% select(itemname, year, orig_itemname, reverse_code:long_rule))
  } else {
    df <- df %>% left_join(cb %>% select(-(itemname:year)) %>% distinct()) %>% mutate(year = 0)
  }
  if(var %in% c("yearBrth", "gender")) df <- df %>% left_join(avars %>% select(-category, -name))
  return(df)
}

avars <- avars %>%
  select(SID = nomem_encr, HHID = nohouse_encr, everything()) %>%
  group_by(SID, HHID, year) %>%
  summarize_at(vars(gebjaar, geslacht), Mode) %>%
  ungroup() %>%
  pivot_longer(cols = c("gebjaar", "geslacht")
               , names_to = "orig_itemname"
               , values_to = "value") %>%
  mutate(year = as.numeric(year)) %>%
  left_join(
    liss_codebook %>% 
      select(category, name, itemname, year, orig_itemname, reverse_code:long_rule)
    )

# rename variables   
liss_recode <- liss_codebook %>%
  filter(category %in% c("covariates", "pers", "outcome", "cognition")) %>%
  select(category, name:wave, year:orig_itemname, reverse_code:long_rule) %>%
  group_by(category, name) %>% 
  nest() %>%
  ungroup() %>%
  mutate(data = map2(data, name, rename_fun))


recode_fun <- function(rule, y){
  x <- y$value
  if(!is.na(rule)){y$value <- eval(parse(text = rule))}
  return(y)
}


liss_recode <- liss_recode %>% 
  mutate(data = map(data, ~(.) %>% 
    group_by(recode) %>%
    nest() %>%
    ungroup() %>%
    mutate(data = pmap(list(recode, data), recode_fun)) %>%
    unnest(data) %>%
    mutate(value = ifelse(value < 0 | is.nan(value) | is.infinite(value), NA, value))))


# reverse code 
liss_recode <- liss_recode %>%
  mutate(data = map(data, ~(.) %>%
          mutate(value = ifelse(tolower(reverse_code) == "no" | is.na(reverse_code), value, 
                        reverse.code(-1, value, mini = mini, maxi = maxi)))))

fun_call <- function(x, rule){
    switch(rule,
           average = mean(x, na.rm = T),
           mode = Mode(x)[1],
           sum = sum(x, na.rm = T),
           skip = unique(x)[1],
           select = unique(x)[1],
           max = max(x, na.rm = T),
           min = min(x, na.rm = T))
  }

2.8.3 Covariates

load(sprintf("%s/data/clean/liss_cleaned.RData", local_path))

# bring in year or birth for cleaning
liss_cov <- liss_recode %>% 
  filter(category == "covariates") %>% 
  unnest(data) %>%
  distinct()

# compositing within years
year_comp_fun <- function(df, rule){
  df %>%
    # group by person and item (collapse across age)
    group_by(SID, HHID, long_rule, name, year) %>% 
    summarize(value = fun_call(value, rule)) %>%
    ungroup() %>% 
    mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
}

liss_waves <- p_waves %>% filter(Study == "LISS") %>% select(Used) %>% distinct()

liss_cov <- liss_cov %>%
  filter(year <= max(liss_waves$Used)) %>%
  group_by(comp_rule) %>%
  nest() %>%
  ungroup() %>%
  mutate(data = map2(data, comp_rule, year_comp_fun)) %>%
  unnest(data)

comp_fun <- function(rule, p_year){
  liss_cov %>%
    filter(year <= p_year  & long_rule == rule) %>%
    group_by(SID, HHID, name) %>%
    summarize(value = fun_call(value, rule)) %>%
    ungroup()
}

liss_cov <- crossing(
  p_year = liss_waves$Used, 
  long_rule = unique(liss_cov$long_rule)
  ) %>%
  mutate(data = map2(long_rule, p_year, comp_fun)) %>%
  unnest(data) %>%
  mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value)) %>%
  select(-long_rule) %>%
  pivot_wider(names_from = name, values_from = value, values_fn = list(value = max)) %>%
  mutate(BMI = weight/((height/100)^2))

liss_cov

## # A tibble: 27,202 × 21
##    p_year    SID   HHID weight exercise alcohol cancer diabetes education ethnicity heartProb height
##     <dbl>  <dbl>  <dbl>  <dbl>    <dbl>   <dbl>  <dbl>    <dbl>     <dbl>     <dbl>     <dbl>  <dbl>
##  1   2008 800033 583404   58         NA       1      0        0        12         2         0    165
##  2   2008 800042 500277   74.5        0       1      0        0        16        NA         0    169
##  3   2008 800045 548654   84         NA       1      0        0        NA        NA         0    185
##  4   2008 800057 580532   85         NA       1      0        0        18        NA         0    198
##  5   2008 800076 578048   57          0       1      0        0        NA        NA         0    167
##  6   2008 800119 537783   84.5       NA       0      1        0        NA         2         0    174
##  7   2008 800125 582101   61         NA       1      0        0        16         2         0    158
##  8   2008 800134 549826   66         NA       1      0        0        NA        NA         0    162
##  9   2008 800158 519049   87.5       NA       1      0        0        14        NA         0    180
## 10   2008 800170 520571   56.5        0       1      0        0        14        NA         0    160
## # ℹ 27,192 more rows
## # ℹ 9 more variables: parkinsons <dbl>, respDis <dbl>, smokes <dbl>, stroke <dbl>, married <dbl>,
## #   gender <dbl>, yearBrth <dbl>, SRhealth <dbl>, BMI <dbl>

2.8.4 Personality Variables

liss_pers <- liss_recode %>%
  filter(category == "pers") %>%
  unnest(data) %>%
  filter(year == liss_waves$Used) %>%
  distinct()

# alpha's
liss_alpha <- liss_pers %>%
  select(name, itemname, year, SID, value) %>%
  group_by(name, year) %>%
  nest() %>%
  mutate(data = map(data, ~(.) %>% pivot_wider(names_from = itemname, values_from = value)),
         alpha = map(data, possibly(~psych::alpha((.) %>% select(-SID)), NA_real_)))

# create composites
liss_pers <- liss_pers %>%
  group_by(SID, HHID, name, year) %>%
  summarize(value = mean(value, na.rm = T))  %>%
  ungroup()

liss_pers

## # A tibble: 54,280 × 5
##       SID   HHID name   year value
##     <dbl>  <dbl> <chr> <dbl> <dbl>
##  1 800033 583404 A      2008   2.8
##  2 800033 583404 C      2008   2.3
##  3 800033 583404 E      2008   2.4
##  4 800033 583404 N      2008   2.7
##  5 800033 583404 NA     2008   3.3
##  6 800033 583404 O      2008   3.4
##  7 800033 583404 PA     2008   4.2
##  8 800033 583404 SWL    2008   6  
##  9 800042 500277 A      2008   3.7
## 10 800042 500277 C      2008   3.9
## # ℹ 54,270 more rows

2.8.5 Cognition Variables

liss_cog <- liss_recode %>%
  filter(category == "cognition") %>%
  unnest(data) %>%
  filter(year == liss_waves$Used) %>%
  filter(!is.na(value)) %>%
  group_by(name, itemname, year) %>%
  mutate(value = pomp(value)) %>%
  distinct() %>%
  group_by(name, SID) %>%
  summarize(value = mean(value)) %>%
  ungroup() %>%
  pivot_wider(names_from = "name", values_from = "value")

liss_cog

## # A tibble: 1,804 × 2
##       SID cognition
##     <dbl>     <dbl>
##  1 800076       2.5
##  2 800170       0  
##  3 800186       7.5
##  4 800231       2.5
##  5 800326       0  
##  6 800354       2.5
##  7 800424       7.5
##  8 800540       5  
##  9 800601       5  
## 10 800790       5  
## # ℹ 1,794 more rows

2.8.6 Outcome Variables

liss_out <- liss_recode %>%
  filter(category == "outcome") %>%
  unnest(data) %>%
  mutate(p_year = liss_waves$Used, 
         group = ifelse(year > p_year, "future", "past")) %>%
  group_by(SID, name, p_year) %>%
  mutate(o_year = max(year[!is.na(value)])) %>%
  group_by(SID, name, group, p_year, o_year) %>%
  summarize(value = max(value, na.rm = T)) %>%
  ungroup() %>%
  mutate(value = ifelse(is.infinite(value), NA, value)) %>%
  pivot_wider(names_from = group, values_from = value) %>%
  group_by(SID, p_year, name, o_year) %>%
  mutate(value = ifelse(is.na(past) | (past == 0 & !is.na(future)), future,
                 ifelse(past == 0 & is.na(future), past, 
                 ifelse(past == 1, NA, NA)))) %>% 
  ungroup() %>%
  filter(!is.na(value))

liss_out

## # A tibble: 13,452 × 7
##       SID name     p_year o_year future  past value
##     <dbl> <chr>     <dbl>  <dbl>  <dbl> <dbl> <dbl>
##  1 800009 dementia   2008   2018      0    NA     0
##  2 800012 dementia   2008   2015      0    NA     0
##  3 800015 dementia   2008   2018      0    NA     0
##  4 800018 dementia   2008   2012      0    NA     0
##  5 800033 dementia   2008   2012      0     0     0
##  6 800042 dementia   2008   2016      0     0     0
##  7 800045 dementia   2008   2007     NA     0     0
##  8 800054 dementia   2008   2018      0    NA     0
##  9 800057 dementia   2008   2018      0     0     0
## 10 800073 dementia   2008   2018      0    NA     0
## # ℹ 13,442 more rows

2.8.7 Combine Data

liss_combined <- liss_pers %>% 
  rename(Trait = name, p_value = value, p_year = year) %>%
  full_join(liss_out %>% rename(Outcome = name, o_value = value)) %>%
  full_join(liss_cov) %>%
  left_join(
    liss_out %>%
      filter(name == "dementia") %>%
      mutate(value = ifelse(rowSums(cbind(future, past), na.rm = T) >= 1, 1, 0)) %>%
      select(-future, -past) %>%
      pivot_wider(names_from = "name", values_from = "value")
    ) %>%
  full_join(liss_cog) %>%
  filter(!is.na(p_value) & !is.na(o_value)) %>%
  mutate(age = p_year - yearBrth)

liss_combined

## # A tibble: 52,267 × 31
##       SID   HHID Trait p_year p_value Outcome  o_year future  past o_value weight exercise alcohol cancer
##     <dbl>  <dbl> <chr>  <dbl>   <dbl> <chr>     <dbl>  <dbl> <dbl>   <dbl>  <dbl>    <dbl>   <dbl>  <dbl>
##  1 800033 583404 A       2008     2.8 dementia   2012      0     0       0   58         NA       1      0
##  2 800033 583404 C       2008     2.3 dementia   2012      0     0       0   58         NA       1      0
##  3 800033 583404 E       2008     2.4 dementia   2012      0     0       0   58         NA       1      0
##  4 800033 583404 N       2008     2.7 dementia   2012      0     0       0   58         NA       1      0
##  5 800033 583404 NA      2008     3.3 dementia   2012      0     0       0   58         NA       1      0
##  6 800033 583404 O       2008     3.4 dementia   2012      0     0       0   58         NA       1      0
##  7 800033 583404 PA      2008     4.2 dementia   2012      0     0       0   58         NA       1      0
##  8 800033 583404 SWL     2008     6   dementia   2012      0     0       0   58         NA       1      0
##  9 800042 500277 A       2008     3.7 dementia   2016      0     0       0   74.5        0       1      0
## 10 800042 500277 C       2008     3.9 dementia   2016      0     0       0   74.5        0       1      0
## # ℹ 52,257 more rows
## # ℹ 17 more variables: diabetes <dbl>, education <dbl>, ethnicity <dbl>, heartProb <dbl>, height <dbl>,
## #   parkinsons <dbl>, respDis <dbl>, smokes <dbl>, stroke <dbl>, married <dbl>, gender <dbl>,
## #   yearBrth <dbl>, SRhealth <dbl>, BMI <dbl>, dementia <dbl>, cognition <dbl>, age <dbl>

save(liss_cov, liss_alpha, liss_pers, liss_out, liss_combined, liss_cog, 
     file = sprintf("%s/data/clean/liss_cleaned.RData", local_path))

rm(list =ls()[grepl("liss", ls())])