Chapter 2 Data Cleaning
In this section, we will clean the data for each study. Raw data cannot be shared directly, but for each study, we include instrucitons on how to access the data.
Mode <- function(x) {
ux <- unique(x)
ux <- ux[!is.na(ux)]
ux[which.max(tabulate(match(x, ux)))]
}
pomp <- function(x) (x - min(x, na.rm = T))/(max(x, na.rm = T) - min(x, na.rm = T))*10
2.1 Health and Retirement Study (HRS)
The Health and Retirement Study [HRS; (juster1995overview?)] is an ongoing longitudinal study of households in the United States. These data are available at https://hrs.isr.umich.edu by creating a free account.
Participants were recruited from more than 35,000 individuals from the financial households of individuals born between 1931 and 1941 in the US. Data have been collected biannually since 1992. The latest data release includes data up to 2016. On average, 10,000 individuals are sampled each wave More information on the HRS can be found at https://hrs.isr.umich.edu/documentation/survey-design, but, in short, the HRS is a nationally representative sample of adults over 50 in the US. It is critical to note that the HRS samples households of the original cohort and follows individuals and their spouses or partners until their death.
Sample size varies by year, ranging from approximately 7,500 (2014) to 15,500 (1992). (https://hrs.isr.umich.edu/sites/default/files/biblio/ResponseRates_2017.pdf). This provides 99% power to detect a zero-order correlation effect size of ~.04, two-tailed at alpha .05.
2.1.1 Load Data
hrs_read_fun <- function(year) {
read_da <- function(da, dct, Year){
print(paste(da, dct, year, sep = " "))
data.file <- sprintf("%s/hrs/%s/%s", data_path, Year, da)
# Set path to the dictionary file "*.DCT"
dict.file <- sprintf("%s/hrs/%s/%s", data_path, Year, dct)
# Read the dictionary file
df.dict <- read.table(dict.file, skip = 1, fill = TRUE, stringsAsFactors = FALSE)
# Set column names for dictionary dataframe
colnames(df.dict) <- c("col.num","col.type","col.name","col.width","col.lbl")
# Remove last row which only contains a closing }
row <- which(df.dict$col.name == "HHID")
df.dict <- df.dict[-nrow(df.dict),]
if(row == 2){df.dict <- df.dict[-1,]}
# Extract numeric value from column width field
df.dict$col.width <- as.integer(sapply(df.dict$col.width, gsub, pattern = "[^0-9\\.]", replacement = ""))
# Convert column types to format to be used with read_fwf function
df.dict$col.type <- sapply(df.dict$col.type, function(x) ifelse(x %in% c("int","byte","long"), "i", ifelse(x == "float", "n", ifelse(x == "double", "d", "c"))))
# Read the data file into a dataframe
df <- read_fwf(file = data.file, fwf_widths(widths = df.dict$col.width, col_names = df.dict$col.name), col_types = paste(df.dict$col.type, collapse = ""))
# Add column labels to headers
attributes(df)$variable.labels <- df.dict$col.lbl
old.names <- (hrs_codebook %>% filter(year == Year))$orig_itemname
if(any(c("PN", "HHID") %in% colnames(df)) & any(old.names %in% colnames(df))){
# if(any(c("PN", "HHID") %in% colnames(df))){
df <- df %>%
mutate(hhidpn = 1000*as.numeric(HHID) + as.numeric(PN)) %>%
select(one_of(c("PN", "HHID")), one_of(old.names)) %>%
distinct()
# gather(key = item, value = value, -hhidpn)
} else {df <- NA}
return(df)
}
# Set path to the data file "*.DA"
files <- list.files(sprintf("%s/hrs/%s", data_path, year))
df2 <- tibble(
da = files[grepl(".da", files) | grepl(".DA", files)],
dct = files[grepl(".dct", files) | grepl(".DCT", files)]
) %>%
mutate(data = map2(da, dct, possibly(~read_da(.x, .y, year), NA_real_))) %>%
filter(!is.na(data)) %>%
select(-da, -dct)
if(nrow(df2) != 0){df2$data %>% reduce(full_join) %>% distinct()} else {NA}
}
hrs_codebook <- (codebook %>% filter(study == "HRS"))$codebook[[1]] %>%
mutate(orig_itemname = str_to_upper(orig_itemname)) %>%
mutate_at(vars(orig_itemname, name, itemname), ~str_remove_all(., "[[:space:]]"))
hrs_codebook
## # A tibble: 702 × 17
## study dataset category name itemname wave waveletter year orig_itemname description scale reverse_code
## <chr> <chr> <chr> <chr> <chr> <dbl> <chr> <dbl> <chr> <chr> <chr> <chr>
## 1 hrs Rand cogniti… cogn… digitsB… 3 <NA> 1996 R3BWC20 BACKWARDS … "0.I… no
## 2 hrs Rand cogniti… cogn… digitsB… 4 <NA> 1998 R4BWC20 BACKWARDS … "0.I… no
## 3 hrs Rand cogniti… cogn… digitsB… 5 <NA> 2000 R5BWC20 BACKWARDS … "0.I… no
## 4 hrs Rand cogniti… cogn… digitsB… 6 <NA> 2002 R6BWC20 BACKWARDS … "0.I… no
## 5 hrs Rand cogniti… cogn… digitsB… 7 <NA> 2004 R7BWC20 BACKWARDS … "0.I… no
## 6 hrs Rand cogniti… cogn… digitsB… 8 <NA> 2006 R8BWC20 BACKWARDS … "0.I… no
## 7 hrs Rand cogniti… cogn… digitsB… 9 <NA> 2008 R9BWC20 BACKWARDS … "0.I… no
## 8 hrs Rand cogniti… cogn… digitsB… 10 <NA> 2010 R10BWC20 BACKWARDS … "0.I… no
## 9 hrs Rand cogniti… cogn… digitsB… 11 <NA> 2012 R11BWC20 BACKWARDS … "0.I… no
## 10 hrs Rand cogniti… cogn… digitsB… 12 <NA> 2014 R12BWC20 BACKWARDS … "0.I… no
## # ℹ 692 more rows
## # ℹ 5 more variables: recode <chr>, mini <dbl>, maxi <dbl>, comp_rule <chr>, long_rule <chr>
old.names <- unique(hrs_codebook$orig_itemname)
hrs.paq <- tibble(
year = sprintf("%s/hrs", data_path) %>% list.files(., pattern = "^[0-9]")
, data = map(year, hrs_read_fun)
, names = map(data, colnames)
) %>%
filter(!is.na(data))
old.names <- unique((hrs_codebook %>% filter(dataset == "Rand"))$orig_itemname)
hrs.rand <- sprintf("%s/hrs/randhrs1992_2016v1.sav", data_path) %>%
haven::read_sav(.) %>%
haven::zap_labels(.) %>%
select(SID = HHIDPN, one_of(old.names)) %>%
gather(key = orig_itemname, value = value, -SID, na.rm = T)
hrs_long <- hrs.paq %>%
mutate(data = map(data, ~(.) %>%
pivot_longer(cols = c(-HHID, -PN)
, names_to = "orig_itemname"
, values_to = "value"
, values_drop_na = TRUE))) %>%
select(-names, -year) %>%
unnest(data) %>%
mutate(SID = 1000*as.numeric(HHID) + as.numeric(PN)) %>%
select(-PN, -HHID)
hrs_dem <- sprintf("%s/hrs/pdem_withvarnames.sas7bdat", data_path) %>%
haven::read_sas(.) %>%
select(SID = hhidpn, year = prediction_year, value = prob_dementia) %>%
mutate(orig_itemname = "PROB_DEMENTIA")
hrs.subs <- unique(hrs_long$SID)[unique(hrs_long$SID) %in% unique(hrs.rand$SID)]
hrs_long <- hrs_long %>%
bind_rows(hrs.rand %>% select(orig_itemname, value, SID)) %>%
filter(SID %in% hrs.subs)
save(hrs.rand, hrs.paq, file = sprintf("%s/data/clean/hrs_raw.RData", local_path))
rm(list = c("hrs.paq", "hrs.rand"))
2.1.2 Recoding & Reverse Scoring
hrs_waves <- p_waves %>% filter(Study == "HRS") %>% select(Used) %>% distinct()
# join data with recoding info
hrs_recode <- hrs_codebook %>%
filter(category %in% c("pers", "outcome", "covariates", "cognition") & orig_itemname != "prob_dementia") %>%
select(category, name, itemname, wave, year, orig_itemname, reverse_code:long_rule) %>%
group_by(category, name) %>%
nest() %>%
ungroup() %>%
mutate(data = map(data, ~(.) %>% left_join(hrs_long)))
hrs_recode <- hrs_dem %>%
left_join(
hrs_codebook %>%
select(category, name, itemname, orig_itemname, reverse_code:long_rule)
) %>%
group_by(category, name) %>%
nest() %>%
ungroup() %>%
bind_rows(hrs_recode)
# recode
recode_fun <- function(rule, y, year, p_year){
x <- y$value
if(!is.na(rule)){y$value <- eval(parse(text = rule))}
return(y)
}
hrs_recode <- hrs_recode %>%
mutate(data = map(data, ~(.) %>%
mutate(year = mapvalues(year, seq(2006, 2016, 2), rep(c(2006, 2010, 2014), each = 2)),
p_year = 2006) %>%
group_by(recode, year, p_year) %>%
nest() %>%
ungroup() %>%
mutate(data = pmap(list(recode, data, year, p_year), recode_fun)) %>%
unnest(data) %>%
mutate(value = ifelse(value < 0 | is.nan(value) | is.infinite(value), NA, value))))
# reverse code
hrs_recode <- hrs_recode %>%
mutate(data = map(data, ~(.) %>%
mutate(value = ifelse(reverse_code == "no" | is.na(reverse_code), value,
reverse.code(-1, value, mini = mini, maxi = maxi)))))
fun_call <- function(x, rule){
switch(rule,
average = mean(x, na.rm = T),
mode = Mode(x)[1],
sum = sum(x, na.rm = T),
skip = unique(x)[1],
select = unique(x)[1],
max = max(x, na.rm = T),
min = min(x, na.rm = T))
}
2.1.3 Covariates
# compositing within years
year_comp_fun <- function(df, rule){
df %>%
# group by person and item (collapse across age)
group_by(SID, name, year, p_year, long_rule) %>%
summarize(value = fun_call(value, rule)) %>%
ungroup() %>%
mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
}
# composite WITHIN years
hrs_cov <- hrs_recode %>%
filter(category == "covariates") %>%
select(-category) %>%
unnest(data) %>%
mutate(comp_rule = ifelse(is.na(comp_rule), "skip", comp_rule)) %>%
group_by(comp_rule) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(data, comp_rule, year_comp_fun)) %>%
unnest(data)
comp_fun <- function(d, rule, p_year){
d %>%
filter(year <= p_year) %>%
group_by(SID, name) %>%
summarize(value = fun_call(value, rule)) %>%
ungroup() %>%
distinct()
}
# composite ACROSS years
hrs_cov <- hrs_cov %>%
mutate(long_rule = ifelse(is.na(long_rule), "skip", long_rule)) %>%
group_by(p_year, long_rule) %>%
nest() %>%
ungroup() %>%
mutate(data = pmap(list(data, long_rule, p_year), comp_fun)) %>%
unnest(data) %>%
mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value)) %>%
select(-long_rule) %>%
spread(name, value) %>%
filter(!is.na(SID))
## # A tibble: 28,726 × 19
## p_year SID alcohol BMI cancer diabetes education exercise gender heartProb height married race
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2006 3010 1 28.8 0 0 12 2.02 0 1 165. 1 0
## 2 2006 3020 1 28.8 1 0 16 1.77 1 0 165. 1 0
## 3 2006 10001010 0 23.5 0 0 12 1.88 0 0 183. 0 0
## 4 2006 10003030 0 28.5 0 0 16 1.25 1 1 157. 1 0
## 5 2006 10004010 1 27.0 1 0 16 1.75 0 0 185. 1 0
## 6 2006 10004040 1 27.0 0 0 12 2 1 0 165. 1 0
## 7 2006 10013010 0 28.2 0 1 12 1.57 0 1 177. 1 0
## 8 2006 10013040 1 24.5 0 0 13 1.5 1 0 160. 1 0
## 9 2006 10038010 1 23.3 0 0 16 1.75 0 1 177. 1 0
## 10 2006 10038040 1 23.3 0 0 16 1.78 1 0 170. 1 0
## # ℹ 28,716 more rows
## # ℹ 6 more variables: respDis <dbl>, smokes <dbl>, SRhealth <dbl>, stroke <dbl>, weight <dbl>,
## # yearBrth <dbl>
2.1.4 Personality Variables
hrs_pers <- hrs_recode %>%
filter(category == "pers") %>%
select(-category) %>%
unnest(data) %>%
filter(year == "2006" & !is.na(value)) %>%
group_by(SID, p_year, year, name, itemname, comp_rule) %>%
summarize(value = mean(value, na.rm = T)) %>%
ungroup()
# alpha's
hrs_alpha <- hrs_pers %>%
filter(!is.na(value)) %>%
select(name, itemname, year, SID, value) %>%
group_by(name, year) %>%
nest() %>%
mutate(data = map(data, ~(.) %>% distinct() %>% pivot_wider(names_from = itemname, values_from = value, values_fn = list(mean))),
alpha = map(data, possibly(~psych::alpha((.) %>% select(-SID)), NA_real_)))
comp_fun <- function(df, rule){
df %>%
group_by(SID) %>%
summarize(value = fun_call(value, rule)) %>%
ungroup() %>%
mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
}
# create composites
hrs_pers <- hrs_pers %>%
group_by(name, comp_rule, year) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(data, comp_rule, comp_fun)) %>%
unnest(data) %>%
select(-comp_rule)
## # A tibble: 116,915 × 4
## year name SID value
## <dbl> <chr> <dbl> <dbl>
## 1 2006 A 3010 3
## 2 2006 A 3020 3
## 3 2006 A 10001010 4
## 4 2006 A 10003030 3.4
## 5 2006 A 10004010 3.4
## 6 2006 A 10004040 3.6
## 7 2006 A 10013010 2
## 8 2006 A 10013040 2.6
## 9 2006 A 10038010 3.4
## 10 2006 A 10038040 2.6
## # ℹ 116,905 more rows
2.1.5 Cognition Variables
hrs_cog <- hrs_recode %>%
filter(category == "cognition") %>%
select(-category) %>%
unnest(data) %>%
filter(year == p_year) %>%
filter(!is.na(value)) %>%
group_by(name, itemname, year) %>%
mutate(value = pomp(value)) %>%
group_by(SID, name) %>%
summarize(value = mean(value)) %>%
ungroup() %>%
pivot_wider(names_from = "name", values_from = "value")
## # A tibble: 16,778 × 2
## SID cognition
## <dbl> <dbl>
## 1 3010 7.5
## 2 3020 8.08
## 3 10001010 8.75
## 4 10003030 6
## 5 10004010 7.92
## 6 10004040 8.83
## 7 10013010 7.25
## 8 10013040 8.67
## 9 10038010 8.58
## 10 10038040 8.25
## # ℹ 16,768 more rows
2.1.6 Outcome Variables
# composite within years
# compositing within years
hrs_out <- hrs_recode %>%
filter(category == "outcome") %>%
unnest(data) %>%
group_by(SID, name, year, p_year) %>%
summarize(value = max(value, na.rm = T)) %>%
ungroup() %>%
mutate(value = ifelse(is.nan(value) | is.infinite(value), NA, value),
group = ifelse(year > p_year, "future", "past")) %>%
filter(!is.na(value)) %>%
group_by(SID, p_year, year, name, group) %>%
mutate(value = ifelse(value < .5, 0, 1)
, o_year = max(year[!is.na(value)])) %>%
group_by(SID, p_year, name, group, o_year) %>%
summarize(value = max(value, na.rm = T)) %>%
ungroup() %>%
mutate(value = ifelse(is.infinite(value), NA, value)) %>%
pivot_wider(names_from = group, values_from = value) %>%
group_by(SID, name, o_year) %>%
mutate(value = ifelse(is.na(past) | (past == 0 & !is.na(future)), future,
ifelse(past == 0 & is.na(future), past,
ifelse(past == 1, NA, NA)))) %>%
group_by(SID, p_year, name) %>%
mutate(o_year = max(o_year)) %>%
group_by(SID, p_year, o_year, name) %>%
summarize(value = max(value, na.rm = T)) %>%
ungroup() %>%
mutate(value = ifelse(is.infinite(value), NA, value)) %>%
ungroup()
## # A tibble: 27,187 × 5
## SID p_year o_year name value
## <dbl> <dbl> <dbl> <chr> <dbl>
## 1 3010 2006 2010 dementia 1
## 2 3020 2006 2014 dementia 0
## 3 10001010 2006 2014 dementia 0
## 4 10003030 2006 2014 dementia 0
## 5 10004010 2006 2010 dementia 0
## 6 10004040 2006 2014 dementia 0
## 7 10013010 2006 2014 dementia 1
## 8 10013040 2006 2014 dementia 0
## 9 10038010 2006 2014 dementia 0
## 10 10038040 2006 2014 dementia 0
## # ℹ 27,177 more rows
2.1.7 Combine Data
hrs_combined <- hrs_pers %>%
rename(Trait = name, p_value = value, p_year = year) %>%
full_join(hrs_out %>% select(SID, Outcome = name, o_value = value, o_year)) %>%
full_join(hrs_cov) %>%
left_join(
hrs_out %>%
filter(name == "dementia") %>%
pivot_wider(names_from = "name", values_from = "value")
) %>%
full_join(hrs_cog) %>%
filter(!is.na(p_value) & !is.na(o_value)) %>%
mutate(age = p_year - yearBrth)
## # A tibble: 112,097 × 27
## p_year Trait SID p_value Outcome o_value o_year alcohol BMI cancer diabetes education exercise
## <dbl> <chr> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2006 A 3010 3 dementia 1 2010 1 28.8 0 0 12 2.02
## 2 2006 A 3020 3 dementia 0 2014 1 28.8 1 0 16 1.77
## 3 2006 A 10001010 4 dementia 0 2014 0 23.5 0 0 12 1.88
## 4 2006 A 10003030 3.4 dementia 0 2014 0 28.5 0 0 16 1.25
## 5 2006 A 10004010 3.4 dementia 0 2010 1 27.0 1 0 16 1.75
## 6 2006 A 10004040 3.6 dementia 0 2014 1 27.0 0 0 12 2
## 7 2006 A 10013010 2 dementia 1 2014 0 28.2 0 1 12 1.57
## 8 2006 A 10013040 2.6 dementia 0 2014 1 24.5 0 0 13 1.5
## 9 2006 A 10038010 3.4 dementia 0 2014 1 23.3 0 0 16 1.75
## 10 2006 A 10038040 2.6 dementia 0 2014 1 23.3 0 0 16 1.78
## # ℹ 112,087 more rows
## # ℹ 14 more variables: gender <dbl>, heartProb <dbl>, height <dbl>, married <dbl>, race <dbl>,
## # respDis <dbl>, smokes <dbl>, SRhealth <dbl>, stroke <dbl>, weight <dbl>, yearBrth <dbl>,
## # dementia <dbl>, cognition <dbl>, age <dbl>
2.2 RUSH Memory and and Aging Project (RUSH-MAP)
The RUSH Memory and Aging Project (RUSH-MAP) is an ongoing longitudinal study that began in 1997 (a2012overview?). These data are available, through application from https://www.radc.rush.edu/requests.htm.
Participants who were 65 and older were recruited from retirement communities and subsidized senior housing facilities throughout Chicagoland and northeastern Illinois beginning in 1997. Data are collected annually, and all participants are organ donors. Additional participants are recuited each year. Additional information and documentation on the data can be found at https://www.radc.rush.edu/docs/var/variables.htm.
Sample sizes vary by year, ranging from 52 (1997) to 2205 participants including 884 deceased participants with autopsy data (2019, 2020). This provides 99% power to detect a zero-order correlation effect size of ~.10, two-tailed at alpha .05.
2.2.1 Load Data
(map_codebook <- (codebook %>% filter(study == "RADC-MAP"))$codebook[[1]] %>%
mutate(orig_itemname = str_to_lower(orig_itemname)))
## # A tibble: 44 × 15
## study dataset category name itemname year orig_itemname description scale reverse_code recode mini
## <chr> <lgl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 RADC-MAP NA cognition cogn… freeRec… long… cts_wlii word list … <NA> no ifels… NA
## 2 RADC-MAP NA cognition cogn… cuedRec… long… cts_wliii word list … <NA> no ifels… NA
## 3 RADC-MAP NA cognition cogn… digitsF… long… cts_df digits bac… <NA> no ifels… NA
## 4 RADC-MAP NA cognition cogn… digitsB… long… cts_db digits for… <NA> no ifels… NA
## 5 RADC-MAP NA cognition cogn… catFlue… long… cts_catflu category f… <NA> no ifels… NA
## 6 RADC-MAP NA cognition cogn… bosNami… long… cts_bname Boston nam… <NA> no ifels… NA
## 7 RADC-MAP NA cognition cogn… progMat long… cts_pmat progressiv… <NA> no ifels… NA
## 8 RADC-MAP NA cognition cogn… digitSy… long… cts_sdmt symbol dig… <NA> no ifels… NA
## 9 RADC-MAP NA covariat… age ageBase… base… age_bl The age at… <NA> no ifels… NA
## 10 RADC-MAP NA covariat… alco… alcohol base… alcohol_g_bl Grams of a… "\r\… no ifels… NA
## # ℹ 34 more rows
## # ℹ 3 more variables: maxi <dbl>, comp_rule <chr>, long_rule <chr>
map <- sprintf("%s/rush-radc/dataset_1033_long_03-24-2021.xlsx", data_path) %>% read_excel() %>%
full_join(sprintf("%s/rush-radc/dataset_1033_basic_03-24-2021.xlsx", data_path) %>% read_excel()) %>%
full_join(sprintf("%s/rush-radc/dataset_1034_long_03-25-2021.xlsx", data_path) %>% read_excel()) %>%
full_join(sprintf("%s/rush-radc/dataset_1034_basic_03-25-2021.xlsx", data_path) %>% read_excel()) %>%
filter(study == "MAP")
2.2.2 Recoding & Reverse Scoring
rename_fun <- function(cb, var){
print(var)
old.names <- unique((map_codebook %>% filter(name == var))$orig_itemname)
df <- map %>%
select(SID = projid, wave = fu_year, one_of(old.names)) %>%
gather(key = orig_itemname, value = value, -SID, -wave, na.rm = T) %>%
left_join(cb %>% select(itemname, orig_itemname, reverse_code:long_rule))
}
# join data with recoding info
map_recode <- map_codebook %>%
select(category, name, itemname, year, orig_itemname, reverse_code:long_rule) %>%
filter(category %in% c("pers", "outcome", "covariates", "cognition")) %>%
group_by(category, name) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(data, name, rename_fun))
# recode
recode_fun <- function(rule, y, year){
x <- y$value
if(!is.na(rule)){y$value <- eval(parse(text = rule))}
return(y)
}
map_recode <- map_recode %>%
mutate(data = map(data, ~(.) %>%
mutate(wave = as.numeric(wave)) %>%
group_by(recode, wave) %>%
nest() %>%
ungroup() %>%
mutate(data = pmap(list(recode, data, wave), recode_fun)) %>%
unnest(data) %>%
mutate(value = ifelse(value < 0 | is.nan(value) | is.infinite(value), NA, value))))
# reverse code
map_recode <- map_recode %>%
mutate(data = map(data, ~(.) %>%
mutate(value = as.numeric(value),
value = ifelse(reverse_code == "no" | is.na(reverse_code), value,
reverse.code(-1, value, mini = mini, maxi = maxi)))))
fun_call <- function(x, rule){
switch(rule,
average = mean(x, na.rm = T),
mode = Mode(x)[1],
sum = sum(x, na.rm = T),
skip = unique(x)[1],
select = unique(x)[1],
max = max(x, na.rm = T),
min = min(x, na.rm = T))
}
2.2.3 Covariates
# composite WITHIN years
map_cov <- map_recode %>%
filter(category == "covariates") %>%
select(-category) %>%
mutate(data = map(data, ~(.) %>%
filter(!is.na(value)) %>%
mutate(comp_rule = ifelse(is.na(comp_rule), "skip", comp_rule)) %>%
group_by(comp_rule, SID, wave, long_rule) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(data, comp_rule, ~fun_call((.x)$value, .y))) %>%
unnest(data) %>%
filter(wave == 0)
))
comp_fun <- function(d, rule){
d %>%
group_by(SID, name) %>%
summarize(value = fun_call(data, rule)) %>%
ungroup() %>%
distinct()
}
# composite ACROSS years
map_cov <- map_cov %>%
unnest(data) %>%
mutate(long_rule = ifelse(is.na(long_rule), "skip", long_rule)) %>%
group_by(long_rule) %>%
nest() %>%
ungroup() %>%
mutate(data = pmap(list(data, long_rule), comp_fun)) %>%
unnest(data) %>%
mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value)) %>%
select(-long_rule) %>%
spread(name, value) %>%
filter(!is.na(SID))
## # A tibble: 2,192 × 16
## SID age alcohol BMI cancer diabetes education exercise gender heartProb married mmse parkinsons
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 00009121 80.0 1 26.8 0 0 12 1 0 0 1 29 0
## 2 00033027 81.0 0 32.6 0 1 14 0 0 0 1 29 0
## 3 00045071 87.5 1 28.2 0 1 16 1 1 1 1 18 0
## 4 00130005 89.8 0 27.8 0 0 15 1 0 1 1 29 0
## 5 00204228 65.2 0 36.6 0 1 8 1 1 0 1 27 0
## 6 00228190 73.5 1 23.0 1 0 22 1 0 0 1 29 0
## 7 00246264 90.0 0 24.0 0 0 16 1 0 0 1 27 0
## 8 00285563 84.7 0 27.0 0 0 12 1 0 0 1 28 0
## 9 00402800 78.7 0 17.2 0 0 16 1 0 0 1 17 0
## 10 00482428 81.4 1 NA 1 0 12 1 0 0 1 30 0
## # ℹ 2,182 more rows
## # ℹ 3 more variables: race <dbl>, smokes <dbl>, stroke <dbl>
2.2.4 Personality Variables
map_pers <- map_recode %>%
filter(category == "pers") %>%
select(-category) %>%
unnest(data) %>%
distinct() %>%
group_by(SID, name) %>%
filter(wave == min(wave)) %>%
ungroup() %>%
select(SID, name, wave, value)
## # A tibble: 9,115 × 4
## SID name wave value
## <chr> <chr> <dbl> <dbl>
## 1 00009121 C 0 35
## 2 00045071 C 0 38
## 3 00130005 C 0 31
## 4 00246264 C 0 33
## 5 00402800 C 0 24
## 6 00582981 C 0 37
## 7 00617643 C 0 34
## 8 00696418 C 0 27
## 9 00701662 C 0 31
## 10 00709354 C 0 40
## # ℹ 9,105 more rows
2.2.5 Outcome Variables
map_out_waves <- map_recode %>%
filter(category == "outcome") %>%
unnest(data) %>%
group_by(SID, name) %>%
summarize(o_year = max(wave)) %>%
ungroup()
map_out <- map_recode %>%
filter(category == "outcome") %>%
select(-category) %>%
unnest(data)
map_dem <- map_out %>%
filter(name == "dementia") %>%
group_by(SID, name, wave) %>%
summarize(value = max(value, na.rm = T)) %>%
ungroup() %>%
mutate(value = ifelse(is.nan(value) | is.infinite(value), NA, value),
group = ifelse(wave > 0, "future", "past")) %>%
group_by(SID, name, group) %>%
summarize(value = max(value, na.rm = T)) %>%
ungroup() %>%
mutate(value = ifelse(is.infinite(value), NA, value)) %>%
pivot_wider(names_from = group, values_from = value) %>%
group_by(SID, name) %>%
mutate(value = ifelse(is.na(past) | (past == 0 & !is.na(future)), future,
ifelse(past == 0 & is.na(future), past,
ifelse(past == 1, NA, NA)))) %>%
ungroup()
comp_fun <- function(df, rule){
df %>%
group_by(SID) %>%
summarize(value = fun_call(value, rule)) %>%
ungroup() %>%
mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
}
# create composites
map_out <- map_out %>%
filter(wave > 0 & name != "dementia") %>%
group_by(name, long_rule) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(data, long_rule, comp_fun)) %>%
unnest(data) %>%
select(-long_rule) %>%
full_join(map_dem) %>%
left_join(map_out_waves)
## # A tibble: 11,128 × 6
## name SID value future past o_year
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 ageDementia 00045071 88.5 NA NA 1
## 2 ageDementia 00285563 90.5 NA NA 13
## 3 ageDementia 00668310 88.6 NA NA 5
## 4 ageDementia 01243685 89.7 NA NA 3
## 5 ageDementia 01797756 86.4 NA NA 11
## 6 ageDementia 02108769 90.9 NA NA 8
## 7 ageDementia 03227207 97.8 NA NA 17
## 8 ageDementia 03380931 85.7 NA NA 7
## 9 ageDementia 03806878 92.4 NA NA 5
## 10 ageDementia 04330337 101. NA NA 15
## # ℹ 11,118 more rows
2.2.6 Cognition Variables
# composite within years
map_cog <- map_recode %>%
filter(category == "cognition") %>%
select(-category) %>%
unnest(data) %>%
filter(wave == 0) %>%
filter(!is.na(value)) %>%
group_by(name, itemname, wave) %>%
mutate(value = pomp(value)) %>%
group_by(name, SID) %>%
summarize(value = mean(value, na.rm = T)) %>%
ungroup() %>%
pivot_wider(names_from = "name"
, values_from = "value")
## # A tibble: 2,189 × 2
## SID cognition
## <chr> <dbl>
## 1 00009121 7.55
## 2 00033027 6.17
## 3 00045071 4.84
## 4 00130005 6.59
## 5 00204228 5.81
## 6 00228190 7.38
## 7 00246264 6.16
## 8 00285563 5.79
## 9 00402800 4.14
## 10 00482428 6.58
## # ℹ 2,179 more rows
2.2.7 Combine Data
map_combined <- map_pers %>%
rename(Trait = name, p_value = value, p_year = wave) %>%
full_join(
map_out %>%
select(Outcome = name, SID, o_value = value, o_year)
) %>%
full_join(map_cov) %>%
left_join(
map_out %>%
filter(name == "dementia") %>%
mutate(value = ifelse(rowSums(cbind(future, past), na.rm = T) >= 1, 1, 0)) %>%
select(-future, -past) %>%
pivot_wider(names_from = "name", values_from = "value")
) %>%
full_join(map_cog) %>%
filter(!is.na(p_value) & !is.na(o_value))
## # A tibble: 42,999 × 24
## SID Trait p_year p_value Outcome o_value o_year age alcohol BMI cancer diabetes education exercise
## <chr> <chr> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 00009… C 0 35 dement… 0 10 80.0 1 26.8 0 0 12 1
## 2 00045… C 0 38 ageDem… 88.5 1 87.5 1 28.2 0 1 16 1
## 3 00045… C 0 38 dement… 1 1 87.5 1 28.2 0 1 16 1
## 4 00130… C 0 31 dement… 0 4 89.8 0 27.8 0 0 15 1
## 5 00246… C 0 33 angiop… 2 8 90.0 0 24.0 0 0 16 1
## 6 00246… C 0 33 arteri… 2 8 90.0 0 24.0 0 0 16 1
## 7 00246… C 0 33 athero… 1 8 90.0 0 24.0 0 0 16 1
## 8 00246… C 0 33 braak 3 8 90.0 0 24.0 0 0 16 1
## 9 00246… C 0 33 cerad 2 8 90.0 0 24.0 0 0 16 1
## 10 00246… C 0 33 hipScl… 0 8 90.0 0 24.0 0 0 16 1
## # ℹ 42,989 more rows
## # ℹ 10 more variables: gender <dbl>, heartProb <dbl>, married <dbl>, mmse <dbl>, parkinsons <dbl>,
## # race <dbl>, smokes <dbl>, stroke <dbl>, dementia <dbl>, cognition <dbl>
2.3 RUSH Religious Orders Study (ROS)
The RUSH Religious Orders Study (ROS) is an ongoing longitudinal study that began in 1994 (a2012overview?). These data are available, through application from https://www.radc.rush.edu/requests.htm.
Older (65 and above) Catholic nuns, priests, and brothers with no prior dementia diagnosis and who agreed to annual evaluations and eventual organ donation were recruited from more than 40 groups across the United States. Additional participants are recuited each year. Additional information and documentation on the data can be found at https://www.radc.rush.edu/docs/var/variables.htm.
Sample sizes vary bt year from 353 participants (1994) to 1487 participants, including 797 deceased participants with autopsy data (2019, 2020). This provides 99% power to detect a zero-order correlation effect size of ~.11, two-tailed at alpha .05.
(ros_codebook <- (codebook %>% filter(study == "ROS"))$codebook[[1]] %>%
mutate(orig_itemname = str_to_lower(orig_itemname)))
## # A tibble: 42 × 15
## study dataset category name itemname year orig_itemname description scale reverse_code recode mini
## <chr> <lgl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 ROS NA cognition cognit… freeRec… long… cts_wlii word list … <NA> no ifels… NA
## 2 ROS NA cognition cognit… cuedRec… long… cts_wliii word list … <NA> no ifels… NA
## 3 ROS NA cognition cognit… digitsF… long… cts_df digits bac… <NA> no ifels… NA
## 4 ROS NA cognition cognit… digitsB… long… cts_db digits for… <NA> no ifels… NA
## 5 ROS NA cognition cognit… catFlue… long… cts_catflu category f… <NA> no ifels… NA
## 6 ROS NA cognition cognit… bosNami… long… cts_bname Boston nam… <NA> no ifels… NA
## 7 ROS NA cognition cognit… progMat long… cts_pmat progressiv… <NA> no ifels… NA
## 8 ROS NA cognition cognit… digitSy… long… cts_sdmt symbol dig… <NA> no ifels… NA
## 9 ROS NA covariates age ageBase… base… age_bl The age at… <NA> no ifels… NA
## 10 ROS NA covariates alcohol alcohol base… alcohol_g_bl Grams of a… "\r\… no ifels… NA
## # ℹ 32 more rows
## # ℹ 3 more variables: maxi <dbl>, comp_rule <chr>, long_rule <chr>
ros <- sprintf("%s/rush-radc/dataset_1033_long_03-24-2021.xlsx", data_path) %>% read_excel() %>%
full_join(sprintf("%s/rush-radc/dataset_1033_basic_03-24-2021.xlsx", data_path) %>% read_excel()) %>%
full_join(sprintf("%s/rush-radc/dataset_1034_long_03-25-2021.xlsx", data_path) %>% read_excel()) %>%
full_join(sprintf("%s/rush-radc/dataset_1034_basic_03-25-2021.xlsx", data_path) %>% read_excel()) %>%
filter(study == "ROS")
2.3.1 Recoding & Reverse Scoring
rename_fun <- function(cb, var){
print(var)
old.names <- unique((ros_codebook %>% filter(name == var))$orig_itemname)
df <- ros %>%
select(SID = projid, wave = fu_year, one_of(old.names)) %>%
gather(key = orig_itemname, value = value, -SID, -wave, na.rm = T) %>%
left_join(cb %>% select(itemname, orig_itemname, reverse_code:long_rule))
}
# join data with recoding info
ros_recode <- ros_codebook %>%
select(category, name, itemname, year, orig_itemname, reverse_code:long_rule) %>%
filter(category %in% c("pers", "outcome", "covariates", "cognition")) %>%
group_by(category, name) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(data, name, rename_fun))
# recode
recode_fun <- function(rule, y, year){
x <- y$value
if(!is.na(rule)){y$value <- eval(parse(text = rule))}
return(y)
}
ros_recode <- ros_recode %>%
mutate(data = map(data, ~(.) %>%
mutate(wave = as.numeric(wave)) %>%
group_by(recode, wave) %>%
nest() %>%
ungroup() %>%
mutate(data = pmap(list(recode, data, wave), recode_fun)) %>%
unnest(data) %>%
mutate(value = ifelse(value < 0 | is.nan(value) | is.infinite(value), NA, value))))
# reverse code
ros_recode <- ros_recode %>%
mutate(data = map(data, ~(.) %>%
mutate(value = as.numeric(value),
value = ifelse(reverse_code == "no" | is.na(reverse_code), value,
reverse.code(-1, value, mini = mini, maxi = maxi)))))
fun_call <- function(x, rule){
switch(rule,
average = mean(x, na.rm = T),
mode = Mode(x)[1],
sum = sum(x, na.rm = T),
skip = unique(x)[1],
select = unique(x)[1],
max = max(x, na.rm = T),
min = min(x, na.rm = T))
}
2.3.2 Covariates
# composite WITHIN years
ros_cov <- ros_recode %>%
filter(category == "covariates") %>%
select(-category) %>%
mutate(data = map(data, ~(.) %>%
filter(!is.na(value)) %>%
mutate(comp_rule = ifelse(is.na(comp_rule), "skip", comp_rule)) %>%
group_by(comp_rule, SID, wave, long_rule) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(data, comp_rule, ~fun_call((.x)$value, .y))) %>%
unnest(data) %>%
filter(wave == 0)
))
comp_fun <- function(d, rule){
d %>%
group_by(SID, name) %>%
summarize(value = fun_call(data, rule)) %>%
ungroup() %>%
distinct()
}
# composite ACROSS years
ros_cov <- ros_cov %>%
unnest(data) %>%
mutate(long_rule = ifelse(is.na(long_rule), "skip", long_rule)) %>%
group_by(long_rule) %>%
nest() %>%
ungroup() %>%
mutate(data = pmap(list(data, long_rule), comp_fun)) %>%
unnest(data) %>%
mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value)) %>%
select(-long_rule) %>%
spread(name, value) %>%
filter(!is.na(SID))
## # A tibble: 1,485 × 14
## SID age alcohol BMI cancer diabetes education exercise gender heartProb mmse race smokes stroke
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 000210… 80.0 0 19.9 0 0 22 1 0 1 18 0 0 0
## 2 003377… 82.2 0 42.8 0 0 22 1 1 0 27 0 0 0
## 3 003811… 70.3 0 29.7 0 0 20 0 0 0 30 0 0 NA
## 4 004702… 72.6 1 20.0 1 0 18 1 0 0 30 0 0 0
## 5 007567… 87.3 0 33.2 0 1 20 0 1 0 23 1 1 0
## 6 009850… 78.6 0 23.7 1 0 21 1 0 0 28 0 0 0
## 7 012114… 85.2 0 25.1 0 0 12 0 1 0 24 0 1 0
## 8 012370… 72.0 0 18.8 0 0 16 1 0 0 29 0 0 0
## 9 016795… 65.2 1 36.4 0 0 22 1 0 0 30 0 0 0
## 10 021057… 85.2 1 21.5 0 0 18 1 0 0 29 0 0 0
## # ℹ 1,475 more rows
2.3.3 Personality Variables
ros_pers <- ros_recode %>%
filter(category == "pers") %>%
select(-category) %>%
unnest(data) %>%
distinct() %>%
group_by(SID, name) %>%
filter(wave == min(wave)) %>%
ungroup() %>%
select(SID, name, wave, value)
## # A tibble: 7,569 × 4
## SID name wave value
## <chr> <chr> <dbl> <dbl>
## 1 00021073 A 0 32
## 2 00337708 A 0 30
## 3 00381112 A 0 36
## 4 00470212 A 0 41
## 5 00756793 A 0 27
## 6 00985084 A 0 33
## 7 01211411 A 0 27
## 8 01237015 A 0 34
## 9 01679543 A 0 43
## 10 02105734 A 0 36
## # ℹ 7,559 more rows
2.3.4 Outcome Variables
ros_out_waves <- ros_recode %>%
filter(category == "outcome") %>%
unnest(data) %>%
group_by(SID, name) %>%
summarize(o_year = max(wave)) %>%
ungroup()
ros_out <- ros_recode %>%
filter(category == "outcome") %>%
select(-category) %>%
unnest(data)
ros_dem <- ros_out %>%
filter(name == "dementia") %>%
group_by(SID, name, wave) %>%
summarize(value = max(value, na.rm = T)) %>%
ungroup() %>%
mutate(value = ifelse(is.nan(value) | is.infinite(value), NA, value),
group = ifelse(wave > 0, "future", "past")) %>%
group_by(SID, name, group) %>%
summarize(value = max(value, na.rm = T)) %>%
ungroup() %>%
mutate(value = ifelse(is.infinite(value), NA, value)) %>%
pivot_wider(names_from = group, values_from = value) %>%
group_by(SID, name) %>%
mutate(value = ifelse(is.na(past) | (past == 0 & !is.na(future)), future,
ifelse(past == 0 & is.na(future), past,
ifelse(past == 1, NA, NA)))) %>%
ungroup()
comp_fun <- function(df, rule){
df %>%
group_by(SID) %>%
summarize(value = fun_call(value, rule)) %>%
ungroup() %>%
mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
}
# create composites
ros_out <- ros_out %>%
filter(wave > 0 & name != "dementia") %>%
group_by(name, long_rule) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(data, long_rule, comp_fun)) %>%
unnest(data) %>%
select(-long_rule) %>%
full_join(ros_dem) %>%
left_join(ros_out_waves)
## # A tibble: 9,581 × 6
## name SID value future past o_year
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 ageDementia 02495739 92.5 NA NA 13
## 2 ageDementia 07299965 80.0 NA NA 13
## 3 ageDementia 07660182 83.7 NA NA 4
## 4 ageDementia 10100150 84.5 NA NA 12
## 5 ageDementia 10100286 80.7 NA NA 18
## 6 ageDementia 10101039 90.4 NA NA 8
## 7 ageDementia 10101589 107. NA NA 6
## 8 ageDementia 10101741 92.4 NA NA 12
## 9 ageDementia 10116694 82.2 NA NA 5
## 10 ageDementia 10200901 96.2 NA NA 23
## # ℹ 9,571 more rows
2.3.5 Cognition Variables
# composite within years
ros_cog <- ros_recode %>%
filter(category == "cognition") %>%
select(-category) %>%
unnest(data) %>%
filter(wave == 0) %>%
filter(!is.na(value)) %>%
group_by(name, itemname, wave) %>%
mutate(value = pomp(value)) %>%
group_by(name, SID) %>%
summarize(value = mean(value, na.rm = T)) %>%
ungroup() %>%
pivot_wider(names_from = "name"
, values_from = "value")
## # A tibble: 1,485 × 2
## SID cognition
## <chr> <dbl>
## 1 00021073 4.19
## 2 00337708 5.89
## 3 00381112 7.02
## 4 00470212 6.68
## 5 00756793 4.35
## 6 00985084 6.83
## 7 01211411 3.83
## 8 01237015 5.64
## 9 01679543 7.23
## 10 02105734 6.64
## # ℹ 1,475 more rows
2.3.6 Combine Data
ros_combined <- ros_pers %>%
rename(Trait = name, p_value = value, p_year = wave) %>%
full_join(
ros_out %>%
select(Outcome = name, SID, o_value = value, o_year)
) %>%
full_join(ros_cov) %>%
left_join(
ros_out %>%
filter(name == "dementia") %>%
mutate(value = ifelse(rowSums(cbind(future, past), na.rm = T) >= 1, 1, 0)) %>%
select(-future, -past) %>%
pivot_wider(names_from = "name", values_from = "value")
) %>%
full_join(ros_cog) %>%
filter(!is.na(p_value) & !is.na(o_value))
## # A tibble: 48,074 × 22
## SID Trait p_year p_value Outcome o_value o_year age alcohol BMI cancer diabetes education exercise
## <chr> <chr> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 00021… A 0 32 angiop… 3 2 80.0 0 19.9 0 0 22 1
## 2 00021… A 0 32 arteri… 0 2 80.0 0 19.9 0 0 22 1
## 3 00021… A 0 32 athero… 2 2 80.0 0 19.9 0 0 22 1
## 4 00021… A 0 32 braak 6 2 80.0 0 19.9 0 0 22 1
## 5 00021… A 0 32 cerad 1 2 80.0 0 19.9 0 0 22 1
## 6 00021… A 0 32 hipScl… 0 2 80.0 0 19.9 0 0 22 1
## 7 00021… A 0 32 lewyBo… 0 2 80.0 0 19.9 0 0 22 1
## 8 00021… A 0 32 vsclrI… 0 2 80.0 0 19.9 0 0 22 1
## 9 00021… A 0 32 vsclrM… 1 2 80.0 0 19.9 0 0 22 1
## 10 00021… A 0 32 tdp43 1 2 80.0 0 19.9 0 0 22 1
## # ℹ 48,064 more rows
## # ℹ 8 more variables: gender <dbl>, heartProb <dbl>, mmse <dbl>, race <dbl>, smokes <dbl>, stroke <dbl>,
## # dementia <dbl>, cognition <dbl>
2.4 Swedish Adoption Twin Study of Aging (SATSA)
The Swedish Adoption Twin Study of Aging (SATSA) is a longitudinal study of twin pairs from the Swedish Twin Registry that began in 1984. Data are available through the ICPSR database at https://www.icpsr.umich.edu/web/ICPSR/studies/3843.
All twin-pairs on the Swedish Twin Registry who were separated at an early age were invited to be a part of the study in 1984. A control sample of twins reared together were also included. Additional waves of all participants were collected in 1987, 1990, 1993, 2004, 2007, 2010, 2012, and 2014. More information, including codebooks, scales, and variable search functions can be found at https://www.maelstrom-research.org/mica/individual-study/satsa/#.
Sample sizes vary by wave, ranging from 2018 participants at baseline (1984) to 379 participants (IPT7). Given that the target measures were collected at baseline, this provides 99% power to detect a zero-order correlation effect size of ~.10, two-tailed at alpha .05.
2.4.1 Load Data
satsa_read_fun <- function(x){
prob_vars <- c("FHEART", "FPARKIN", "FSTROKE")
y <- sprintf("%s/satsa/%s", data_path, x) %>% haven::read_sav(.) %>%
select(SID = TWINNR, one_of(old.names)) %>%
as_tibble() %>%
haven::zap_labels(.)
if(any(prob_vars %in% colnames(y))){
y <- y %>% mutate_at(vars(one_of(prob_vars)), ~as.numeric(as.character(.)))
}
return(y)
}
satsa_codebook <- (codebook %>% filter(study == "SATSA"))$codebook[[1]] %>%
mutate_at(vars(orig_itemname), str_to_upper)
satsa_codebook
## # A tibble: 840 × 17
## study dataset category name itemname wave_letter year item_stem orig_itemname description scale
## <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <chr> <chr> <chr> <chr>
## 1 satsa SATSA_IPT1 cognition cognition blockDe… <NA> 1985 <NA> IBLOC_R1 Block Desi… inte…
## 2 satsa SATSA_IPT2 cognition cognition blockDe… <NA> 1989 <NA> IBLOC_R2 Block Desi… inte…
## 3 satsa SATSA_IPT3 cognition cognition blockDe… <NA> 1992 <NA> IBLOC_R3 Block Desi… inte…
## 4 satsa SATSA_IPT4 cognition cognition blockDe… <NA> 1995 <NA> IBLOC_R4 Block Desi… inte…
## 5 satsa SATSA_IPT5 cognition cognition blockDe… <NA> 1999 <NA> IBLOC_R5 Block Desi… inte…
## 6 satsa SATSA_IPT6 cognition cognition blockDe… <NA> 2002 <NA> IBLOC_R6 Block Desi… inte…
## 7 satsa SATSA_IPT7 cognition cognition blockDe… <NA> 2005 <NA> IBLOC_R7 Block Desi… inte…
## 8 satsa SATSA_IPT1 cognition cognition digitSp… <NA> 1985 <NA> IDGSP_R1 Digit Span… inte…
## 9 satsa SATSA_IPT2 cognition cognition digitSp… <NA> 1989 <NA> IDGSP_R2 Digit Span… inte…
## 10 satsa SATSA_IPT3 cognition cognition digitSp… <NA> 1992 <NA> IDGSP_R3 Digit Span… inte…
## # ℹ 830 more rows
## # ℹ 6 more variables: reverse_code <chr>, recode <chr>, mini <dbl>, maxi <dbl>, comp_rule <chr>,
## # long_rule <chr>
old.names <- unique(satsa_codebook$orig_itemname) %>% str_to_upper
datasets <- sprintf("%s/satsa", data_path) %>% list.files(., pattern = ".sav")
satsa <- tibble(datasets = datasets) %>%
mutate(data = map(datasets, satsa_read_fun),
ncol = map_dbl(data, ncol)) %>%
filter(ncol != 0)
satsa <- reduce(satsa$data, full_join)
satsa <- satsa %>%
mutate_if(is.factor, ~as.numeric(sub("^\\(0*([0-9]+)\\).+$", "\\1", .)))
satsa_long <- satsa %>%
pivot_longer(
names_to = "orig_itemname"
, values_to = "value"
, cols = -SID
# , values_drop_na = T
)
save(satsa, file = sprintf("%s/data/clean/satsa_raw.RData", load_path))
rm(satsa)
2.4.2 Recoding & Reverse Scoring
satsa_waves <- p_waves %>% filter(Study == "SATSA") %>% select(Used) %>% distinct()
# join data with recoding info
satsa_recode <- satsa_codebook %>%
select(category, name, itemname, year, orig_itemname, reverse_code:long_rule) %>%
filter(category %in% c("pers", "outcome", "covariates", "cognition")) %>%
group_by(category, name) %>%
nest() %>%
ungroup() %>%
mutate(data = map(data, ~(.) %>% left_join(satsa_long)))
# recode
recode_fun <- function(rule, y, year){
x <- y$value
if(!is.na(rule)){y$value <- eval(parse(text = rule))}
return(y)
}
satsa_recode <- satsa_recode %>%
mutate(data = map(data, ~(.) %>%
group_by(recode, year) %>%
nest() %>%
ungroup() %>%
mutate(data = pmap(list(recode, data, year), recode_fun)) %>%
unnest(data) %>%
mutate(value = ifelse(value < 0 | is.nan(value) | is.infinite(value), NA, value)) %>%
filter(!is.na(value))))
# reverse code
satsa_recode <- satsa_recode %>%
mutate(data = map(data, ~(.) %>%
mutate(value = ifelse(reverse_code == "no" | is.na(reverse_code), value,
reverse.code(-1, value, mini = mini, maxi = maxi)))))
fun_call <- function(x, rule){
switch(rule,
average = mean(x, na.rm = T),
mode = Mode(x)[1],
sum = sum(x, na.rm = T),
skip = unique(x)[1],
select = unique(x)[1],
max = max(x, na.rm = T),
min = min(x, na.rm = T))
}
2.4.3 Covariates
satsa_cov <- satsa_recode %>% filter(category == "covariates")
# bring in year or birth for cleaning
yrBrth <- satsa_cov %>%
filter(name == "yearBrth") %>%
unnest(data) %>%
group_by(SID) %>%
summarize(yearBrth = max(value, na.rm = T)) %>%
ungroup() %>%
mutate(yearBrth = ifelse(is.infinite(yearBrth), NA, yearBrth))
# compositing within years
year_comp_fun <- function(df, rule){
df %>%
# group by person and item (collapse across age)
group_by(SID, name, year, long_rule) %>%
summarize(value = fun_call(value, rule)) %>%
ungroup() %>%
mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
}
satsa_waves <- p_waves %>% filter(Study == "SATSA") %>% select(Used) %>% distinct()
satsa_cov <- satsa_cov %>%
unnest(data) %>%
filter(year <= max(satsa_waves$Used) + 1) %>%
mutate(comp_rule = ifelse(is.na(comp_rule) | comp_rule == "none", "skip", comp_rule)) %>%
group_by(comp_rule) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(data, comp_rule, year_comp_fun)) %>%
unnest(data)
comp_fun <- function(d, rule){
d %>%
group_by(SID) %>%
summarize(value = fun_call(value, rule)) %>%
ungroup()
}
satsa_cov <- satsa_cov %>%
group_by(name, long_rule) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(data, long_rule, comp_fun)) %>%
unnest(data) %>%
mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value)) %>%
select(-long_rule) %>%
pivot_wider(names_from = name, values_from = value, values_fn = list(value = max)) %>%
mutate(BMI = weight/((height/100)^2))
## # A tibble: 3,840 × 18
## SID smokes alcohol cancer diabetes heartProb married parkinsons respProb stroke gender education mmse
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 181 0 NA NA NA NA NA NA NA NA 0 NA NA
## 2 182 0 NA NA NA NA NA NA NA NA 0 NA NA
## 3 1101 0 NA NA NA NA NA NA NA NA 0 NA NA
## 4 1102 0 NA NA NA NA NA NA NA NA 0 NA NA
## 5 1121 0 NA NA NA NA NA NA NA NA 1 NA NA
## 6 1122 0 NA NA NA NA NA NA NA NA 1 NA NA
## 7 1151 0 NA NA NA NA NA NA NA NA 1 NA NA
## 8 1152 0 NA NA NA NA NA NA NA NA 1 NA NA
## 9 1211 0 NA NA NA NA NA NA NA NA 1 NA NA
## 10 1212 0 NA NA NA NA NA NA NA NA 1 NA NA
## # ℹ 3,830 more rows
## # ℹ 5 more variables: yearBrth <dbl>, SRhealth <dbl>, height <dbl>, weight <dbl>, BMI <dbl>
2.4.4 Personality Variables
satsa_pers <- satsa_recode %>%
filter(category == "pers") %>%
unnest(data) %>%
left_join(p_waves %>% filter(Study == "SATSA") %>% select(name = p_item, Used)) %>%
filter(year %in% Used & !is.na(value)) %>%
distinct()
# alpha's
satsa_alpha <- satsa_pers %>%
select(name, itemname, year, SID, value) %>%
group_by(name, year) %>%
nest() %>%
ungroup() %>%
mutate(data = map(data, ~(.) %>% spread(itemname, value)),
alpha = map(data, possibly(~psych::alpha((.) %>% select(-SID)), NA_real_)))
# create composites
satsa_pers <- satsa_pers %>%
group_by(SID, name, year) %>%
summarize(value = mean(value, na.rm = T)) %>%
ungroup()
## # A tibble: 14,109 × 4
## SID name year value
## <dbl> <chr> <dbl> <dbl>
## 1 1292 A 1984 4
## 2 1292 C 1984 4
## 3 1292 E 1984 3.67
## 4 1292 N 1984 4.33
## 5 1292 O 1984 3.5
## 6 1292 SWL 1984 2.54
## 7 1701 C 1984 5
## 8 1701 E 1984 3.89
## 9 1701 N 1984 4
## 10 1701 O 1984 1
## # ℹ 14,099 more rows
2.4.5 Outcome Variables
satsa_out <- satsa_recode %>%
filter(category == "outcome") %>%
unnest(data) %>%
distinct() %>%
full_join(crossing(p_year = satsa_waves$Used, name = unique((.)$name)))
satsa_out_waves <- satsa_out %>%
group_by(SID, name) %>%
summarize(o_year = max(year[!is.na(value)])) %>%
ungroup()
satsa_waves <- p_waves %>% filter(Study == "SATSA") %>% select(Used) %>% distinct()
satsa_out <- satsa_out %>%
filter(year > p_year) %>%
group_by(SID, name, year, p_year) %>%
summarize(value = max(value, na.rm = T)) %>%
ungroup() %>%
mutate(value = ifelse(is.nan(value)|is.infinite(value), NA, value)) %>%
group_by(SID, p_year, name) %>%
summarize(value = max(value, na.rm = T)) %>%
ungroup() %>%
mutate(value = ifelse(is.infinite(value), NA, value)) %>%
left_join(satsa_out_waves)
## # A tibble: 3,840 × 5
## SID p_year name value o_year
## <dbl> <dbl> <chr> <dbl> <dbl>
## 1 181 1984 dementia 0 2005
## 2 182 1984 dementia 0 2005
## 3 1101 1984 dementia 0 2005
## 4 1102 1984 dementia 0 2005
## 5 1121 1984 dementia 0 2005
## 6 1122 1984 dementia 0 2005
## 7 1151 1984 dementia 0 2005
## 8 1152 1984 dementia 0 2005
## 9 1211 1984 dementia 0 2005
## 10 1212 1984 dementia 0 2005
## # ℹ 3,830 more rows
2.4.6 Cognition Variables
# composite within years
satsa_cog <- satsa_recode %>%
filter(category == "cognition") %>%
select(-category) %>%
unnest(data) %>%
filter(year == 1985) %>%
filter(!is.na(value)) %>%
group_by(name, itemname, year) %>%
mutate(value = pomp(value)) %>%
group_by(name, SID) %>%
summarize(value = mean(value, na.rm = T)) %>%
ungroup() %>%
pivot_wider(names_from = "name"
, values_from = "value")
2.4.7 Combine Data
satsa_combined <- satsa_pers %>%
rename(Trait = name, p_value = value, p_year = year) %>%
full_join(satsa_out %>% rename(Outcome = name, o_value = value)) %>%
full_join(satsa_cov) %>%
left_join(
satsa_out %>%
filter(name == "dementia") %>%
pivot_wider(names_from = "name", values_from = "value")
) %>%
full_join(satsa_cog) %>%
filter(!is.na(p_value) & !is.na(o_value)) %>%
mutate(age = p_year - yearBrth
, o_year = 2005)
## # A tibble: 14,109 × 27
## SID Trait p_year p_value Outcome o_value o_year smokes alcohol cancer diabetes heartProb married
## <dbl> <chr> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1292 A 1984 4 dementia 0 2005 0 0 1 0 1 1
## 2 1292 C 1984 4 dementia 0 2005 0 0 1 0 1 1
## 3 1292 E 1984 3.67 dementia 0 2005 0 0 1 0 1 1
## 4 1292 N 1984 4.33 dementia 0 2005 0 0 1 0 1 1
## 5 1292 O 1984 3.5 dementia 0 2005 0 0 1 0 1 1
## 6 1292 SWL 1984 2.54 dementia 0 2005 0 0 1 0 1 1
## 7 1701 C 1984 5 dementia 0 2005 0 0 0 0 1 NA
## 8 1701 E 1984 3.89 dementia 0 2005 0 0 0 0 1 NA
## 9 1701 N 1984 4 dementia 0 2005 0 0 0 0 1 NA
## 10 1701 O 1984 1 dementia 0 2005 0 0 0 0 1 NA
## # ℹ 14,099 more rows
## # ℹ 14 more variables: parkinsons <dbl>, respProb <dbl>, stroke <dbl>, gender <dbl>, education <dbl>,
## # mmse <dbl>, yearBrth <dbl>, SRhealth <dbl>, height <dbl>, weight <dbl>, BMI <dbl>, dementia <dbl>,
## # cognition <dbl>, age <dbl>
2.5 ADRC Memory and Aging Project (ADRC-MAP)
The Alzheimer Disease Research Center Memory and Aging Project (ADRC-MAP) is an ongoing longitudinal study of memory and Alzheimer’s Disease that began in 1979. Data are available on a study-by-study basis through application from https://knightadrc.wustl.edu/Research/ResourceRequest.htm.
Participants were recruited from the Charles and Joanne F. Knight Alzheimer’s Disease Research Center at Washington University in St. Louis as part of an ongoing study of disease progression. The current study uses a subset of approximately 1200 of these participants who completed personality surveys as part of a substudy (see Duchek et al., 2019). More information on the study can be found at https://knightadrc.wustl.edu/Research/PDFs/Clinical%20Core%20list%20of%20measures.pdf.
Sample sizes vary over time, from approximately 400 to 1200. This provides 99% power to detect a zero-order correlation effect size of ~.15, two-tailed at alpha .05.
2.5.1 Load Data
## # A tibble: 106 × 15
## study dataset category name itemname year orig_itemname description scale reverse_code recode mini
## <chr> <lgl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 ADRC NA cognition cogniti… waisBlc… long… PSY021 "WAIS BLOC… Rang… no ifels… NA
## 2 ADRC NA cognition cogniti… waisInfo long… PSY019 "WAIS INFO… Rang… no ifels… NA
## 3 ADRC NA cognition cogniti… waisRDi… long… DIGSYM "WAIS-R DI… Rang… no ifels… NA
## 4 ADRC NA cognition cogniti… digFrwd… long… DIGFORCT "NUMBER SP… Rang… no ifels… NA
## 5 ADRC NA cognition cogniti… digBckw… long… DIGBACCT "NUMBER SP… Rang… no ifels… NA
## 6 ADRC NA cognition cogniti… trailMa… long… TRAILA "The score… Rang… no ifels… NA
## 7 ADRC NA cognition cogniti… trailMa… long… TRAILB "The score… Rang… no ifels… NA
## 8 ADRC NA cognition cogniti… cuedRec… long… SRT1C "Free & Cu… Rang… no ifels… NA
## 9 ADRC NA cognition cogniti… cuedRec… long… SRT2C "Free & Cu… Rang… no ifels… NA
## 10 ADRC NA cognition cogniti… cuedRec… long… SRT3C "Free & Cu… Rang… no ifels… NA
## # ℹ 96 more rows
## # ℹ 3 more variables: maxi <dbl>, comp_rule <chr>, long_rule <chr>
adrc_read_fun <- function(file){
print(file)
d <- sprintf("%s/adrc-map/%s", data_path, file) %>% read_excel(.) %>%
select(SID = id, one_of(c("TESTDATE", old.names)), contains("NEO Date"))
if("TESTDATE" %in% colnames(d)){ if(any(class(d$TESTDATE) != "numeric")){d$TESTDATE <- lubridate::year(d$TESTDATE)}}
d
}
old.names <- unique(adrc_codebook$orig_itemname)
adrc <- tibble(file = list.files(sprintf("%s/adrc-map", data_path), pattern = ".xlsx")) %>%
filter(!grepl("NEO", file)) %>%
mutate(data = map(file, adrc_read_fun)) %>%
filter(map_dbl(data, ncol) > 1)
waves <- adrc %>%
mutate(data = map(data, ~(.) %>% select(SID, one_of("TESTDATE")))) %>%
select(-file) %>%
unnest(data) %>%
filter(complete.cases(.)) %>%
# mutate(year = lubridate::year(TESTDATE)) %>%
distinct() %>%
arrange(SID, TESTDATE) %>%
group_by(SID) %>%
mutate(frstyear = min(TESTDATE)) %>%
ungroup() %>%
mutate(year = TESTDATE,
wave = year - frstyear + 1)
adrc_long <- reduce(adrc$data, full_join) %>%
distinct() %>%
select(SID, year = TESTDATE, everything()) %>%
mutate(BIRTH = lubridate::year(BIRTH)) %>%
pivot_longer(cols = c(-SID, -year)
, names_to = "orig_itemname"
, values_to = "value")
# load personality data separately
old.names <- (adrc_codebook %>% filter(category == "pers"))$orig_itemname
adrc_pers <- sprintf("%s/adrc-map/NEO - Raw Scores_FINAL.xlsx", data_path) %>%
read_xlsx() %>%
select(SID = id, date = `NEO Date 1`, one_of(old.names)) %>%
mutate(date = ifelse(grepl("[//]", date), as.numeric(as.Date(date, format = "%m/%d/%Y")), date),
date = as.Date(as.numeric(date), origin="1899-12-30")) %>%
filter(!is.na(date))
# get waves for participants
adrc_pers_waves <- adrc_pers %>%
select(SID, p_year = date) %>%
distinct() %>%
mutate(p_year = lubridate::year(p_year))
2.5.2 Recode & Reverse-Scoring
adrc_recode <- adrc_codebook %>%
filter(category %in% c("covariates", "outcome", "cognition") & !is.na(orig_itemname)) %>%
select(category, name, itemname, orig_itemname, reverse_code:long_rule) %>%
right_join(adrc_long) %>%
left_join(adrc_pers_waves)
# recode
recode_fun <- function(rule, y, year){
# print(rule)
x <- y$value
if(!is.na(rule)){y$value <- eval(parse(text = rule))}
return(y)
}
adrc_recode <- adrc_recode %>%
group_by(recode, year) %>%
nest() %>%
ungroup() %>%
mutate(data = pmap(list(recode, data, year), recode_fun)) %>%
unnest(data)
# reverse code
adrc_recode <- adrc_recode %>%
mutate(value = ifelse(reverse_code == "no" | is.na(reverse_code), value,
reverse.code(-1, value, mini = mini, maxi = maxi))) %>%
group_by(category, name) %>%
nest() %>%
ungroup()
fun_call <- function(x, rule){
switch(rule,
average = mean(x, na.rm = T),
mode = Mode(x)[1],
sum = sum(x, na.rm = T),
skip = unique(x)[1],
select = unique(x)[1],
max = max(x, na.rm = T),
min = min(x, na.rm = T))
}
2.5.3 Covariates
# compositing within years
year_comp_fun <- function(df, rule){
df %>%
# group by person and item (collapse across age)
group_by(SID, name, year, p_year, long_rule) %>%
summarize(value = fun_call(value, rule)) %>%
ungroup() %>%
mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
}
adrc_cov <- adrc_recode %>%
filter(category == "covariates") %>%
unnest(data) %>%
filter(year <= p_year) %>%
mutate(comp_rule = ifelse(is.na(comp_rule) | comp_rule == "none", "skip", comp_rule)) %>%
group_by(comp_rule) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(data, comp_rule, year_comp_fun)) %>%
unnest(data) %>%
select(-comp_rule)
comp_fun <- function(d, rule){
d %>%
group_by(SID, name) %>%
summarize(value = fun_call(value, rule)) %>%
ungroup()
}
adrc_cov <- adrc_cov %>%
filter(!is.na(value) & !is.na(name)) %>%
group_by(long_rule) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(data, long_rule, comp_fun)) %>%
unnest(data) %>%
select(-long_rule) %>%
mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value)) %>%
pivot_wider(names_from = name, values_from = value, values_fn = list(value = max)) %>%
right_join(adrc_pers_waves) %>%
mutate(age = p_year - yearBrth)
## # A tibble: 1,162 × 15
## SID gender yearBrth alcohol cancer diabetes education heartProb married race smokes stroke weight
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1070 0 1918 1 1 0 17 1 1 0 0 0 187
## 2 1088 0 1916 1 1 0 16 1 1 0 0 0 217
## 3 1380 1 1913 1 1 0 16 1 0 0 0 0 129
## 4 10018 0 1929 0 0 0 14 1 1 0 1 0 156
## 5 10034 0 1930 0 1 0 16 0 1 0 0 0 193
## 6 10037 1 1923 1 1 0 13 1 1 0 1 0 179
## 7 10038 0 1927 1 0 0 18 0 1 0 0 0 205
## 8 10045 1 1931 1 1 0 16 0 1 0 1 0 160
## 9 10054 1 1930 0 1 0 13 1 1 0 1 0 173
## 10 10064 0 1928 1 1 0 20 1 1 0 1 0 199
## # ℹ 1,152 more rows
## # ℹ 2 more variables: p_year <dbl>, age <dbl>
2.5.4 Personality Variables
# bring in codebook info
adrc_pers <- adrc_pers %>%
pivot_longer(`1S1`:`1S60`
, names_to = "orig_itemname"
, values_to = "value"
, values_drop_na = T) %>%
left_join(adrc_codebook %>% select(name:orig_itemname, reverse_code:maxi))
recode_fun <- function(rule, y){
x <- y$value
if(!is.na(rule)){y$value <- eval(parse(text = rule))}
return(y)
}
# recode
adrc_pers <- adrc_pers %>%
group_by(recode) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(recode, data, recode_fun)) %>%
unnest(data)
# reverse code
adrc_pers <- adrc_pers %>%
mutate(value = ifelse(tolower(reverse_code) == "no" | is.na(reverse_code), value,
reverse.code(-1, value, mini = mini, maxi = maxi)))
# alpha's
adrc_alpha <- adrc_pers %>%
select(name, itemname, date, SID, value) %>%
group_by(name) %>%
nest() %>%
mutate(data = map(data, ~(.) %>% pivot_wider(names_from = itemname, values_from = value)),
alpha = map(data, possibly(~psych::alpha((.) %>% select(-SID, -date)), NA_real_)))
# create composites
adrc_pers <- adrc_pers %>%
group_by(SID, name, date) %>%
summarize(value = mean(value, na.rm = T)) %>%
ungroup() %>%
left_join(adrc_pers_waves)
## # A tibble: 5,730 × 5
## SID name date value p_year
## <dbl> <chr> <date> <dbl> <dbl>
## 1 1070 A 2003-10-28 4.08 2003
## 2 1070 C 2003-10-28 4.83 2003
## 3 1070 E 2003-10-28 4.08 2003
## 4 1070 N 2003-10-28 1.08 2003
## 5 1070 O 2003-10-28 3.17 2003
## 6 1088 A 2005-04-18 4.5 2005
## 7 1088 C 2005-04-18 3.42 2005
## 8 1088 E 2005-04-18 3.08 2005
## 9 1088 N 2005-04-18 1.25 2005
## 10 1088 O 2005-04-18 3.08 2005
## # ℹ 5,720 more rows
2.5.5 Outcome Variables
adrc_out <- adrc_recode %>%
filter(category == "outcome") %>%
unnest(data)
adrc_out_waves <- adrc_out %>%
select(year, SID, name) %>%
group_by(SID, name) %>%
summarize(o_year = max(year)) %>%
ungroup()
# compositing within years
year_comp_fun <- function(df, rule){
df %>%
# group by person and item (collapse across age)
group_by(SID, name, year, long_rule, p_year) %>%
summarize(value = fun_call(value, rule)) %>%
ungroup() %>%
mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
}
adrc_out <- adrc_out %>%
filter(!is.na(value)) %>%
mutate(comp_rule = ifelse(is.na(comp_rule) | comp_rule == "none", "skip", comp_rule)) %>%
group_by(comp_rule) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(data, comp_rule, year_comp_fun)) %>%
unnest(data)
comp_fun <- function(d, rule){
d %>%
group_by(SID, name) %>%
summarize(value = fun_call(value, rule)) %>%
ungroup()
}
adrc_out <- adrc_out %>%
filter(name == "dementia") %>%
mutate(group = ifelse(year <= p_year, "past", "future")) %>%
group_by(SID, name, group, p_year) %>%
summarize(value = max(value, na.rm = T)) %>%
ungroup() %>%
mutate(value = ifelse(is.infinite(value), NA, value)) %>%
pivot_wider(names_from = group, values_from = value) %>%
group_by(SID, name) %>%
mutate(value = ifelse(is.na(past) | (past == 0 & !is.na(future)), future,
ifelse(past == 0 & is.na(future), past,
ifelse(past == 1, NA, NA)))) %>%
ungroup() %>%
full_join(
adrc_out %>%
filter(name != "dementia" & !is.na(value) & year >= p_year) %>%
group_by(long_rule, p_year) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(data, long_rule, comp_fun)) %>%
select(-long_rule) %>%
unnest(data) %>%
mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
) %>%
left_join(adrc_out_waves)
## # A tibble: 2,575 × 8
## SID name p_year future past `NA` value o_year
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1070 dementia 2003 1 0 NA 1 2019
## 2 1088 dementia 2005 1 1 NA NA 2015
## 3 1380 dementia 2003 1 0 NA 1 2013
## 4 10018 dementia 2005 0 0 NA 0 2018
## 5 10034 dementia 2004 0 0 NA 0 2020
## 6 10037 dementia 2006 1 0 NA 1 2019
## 7 10038 dementia 2003 1 0 NA 1 2016
## 8 10045 dementia 2003 1 1 NA NA 2018
## 9 10054 dementia 2008 0 0 NA 0 2012
## 10 10064 dementia 2003 1 0 NA 1 2018
## # ℹ 2,565 more rows
2.5.6 Cognition Variables
# composite within years
adrc_cog <- adrc_recode %>%
filter(category == "cognition") %>%
select(-category) %>%
unnest(data) %>%
filter(!is.na(p_year)) %>%
filter(!is.na(value)) %>%
group_by(name, itemname, year) %>%
mutate(value = pomp(value)) %>%
group_by(SID, name, itemname) %>%
filter(year %in% (p_year - 1):(p_year + 1)) %>%
group_by(name, SID) %>%
summarize(value = mean(value, na.rm = T)) %>%
ungroup() %>%
pivot_wider(names_from = "name"
, values_from = "value")
## # A tibble: 1,023 × 2
## SID cognition
## <dbl> <dbl>
## 1 1070 4.62
## 2 1088 3.40
## 3 1380 2.36
## 4 10018 5.59
## 5 10034 5.30
## 6 10037 4.64
## 7 10038 6.60
## 8 10045 8.52
## 9 10054 2.41
## 10 10064 7.36
## # ℹ 1,013 more rows
2.5.7 Combine Data
adrc_combined <- adrc_pers %>%
select(SID, Trait = name, p_value = value, p_year) %>%
full_join(
adrc_out %>%
select(SID, o_year, Outcome = name, o_value = value)
) %>%
filter(!is.na(o_value) & !is.na(p_value)) %>%
distinct() %>%
left_join(adrc_cov) %>%
left_join(
adrc_out %>%
filter(name == "dementia") %>%
mutate(value = ifelse(rowSums(cbind(future, past), na.rm = T) >= 1, 1, 0)) %>%
select(-future, -past) %>%
pivot_wider(names_from = "name", values_from = "value")
) %>%
left_join(adrc_cog) %>%
left_join(adrc_out_waves %>% select(SID, Outcome = name, o_year))
## # A tibble: 11,145 × 23
## SID Trait p_value p_year o_year Outcome o_value gender yearBrth alcohol cancer diabetes education
## <dbl> <chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1070 A 4.08 2003 2019 dementia 1 0 1918 1 1 0 17
## 2 1070 C 4.83 2003 2019 dementia 1 0 1918 1 1 0 17
## 3 1070 E 4.08 2003 2019 dementia 1 0 1918 1 1 0 17
## 4 1070 N 1.08 2003 2019 dementia 1 0 1918 1 1 0 17
## 5 1070 O 3.17 2003 2019 dementia 1 0 1918 1 1 0 17
## 6 1088 A 4.5 2005 2015 angiopathy 0 0 1916 1 1 0 16
## 7 1088 A 4.5 2005 2015 arterioloscl… 1 0 1916 1 1 0 16
## 8 1088 A 4.5 2005 2015 atherosclero… 2 0 1916 1 1 0 16
## 9 1088 A 4.5 2005 2015 braak 3 0 1916 1 1 0 16
## 10 1088 A 4.5 2005 2015 hipSclerosis 0 0 1916 1 1 0 16
## # ℹ 11,135 more rows
## # ℹ 10 more variables: heartProb <dbl>, married <dbl>, race <dbl>, smokes <dbl>, stroke <dbl>,
## # weight <dbl>, age <dbl>, `NA` <dbl>, dementia <dbl>, cognition <dbl>
2.6 Einstein Aging Study
The Einstein Aging Study (EAS) is an ongoing longitudinal study of the aging brain. The EAS began in 1980 and has enrolled more than 2,600 participants since then. Data are available through application at http://www.einstein.yu.edu/departments/neurology/clinical-research-program/eas/data-sharing.aspx.
Since 1993, the EAS has systematically recruited a representative aging sample in the Bronx, New York, As of 2017, 2,600 participants were enrolled in the study. As of 2010, approximately 200 of the enrolled participants had autopsy data. More information on the study can be found at http://www.einstein.yu.edu/departments/neurology/clinical-research-program/EAS/.
Sample sizes vary over time, with ranges across waves not publically available. However, we suspect approximately 2,000 participants to have basic personality and dementia diagnoses, with between 150 and 300 participants having full autopsy data collected after personality was introduced into the study. This yields 99% power to detect a zero-order correlation effect size of .10 and .24, respectively, two-tailed at alpha .05.
2.6.1 Load Data
## # A tibble: 78 × 15
## study dataset category name itemname year orig_itemname description scale reverse_code recode mini
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 EAS behavior cognition cogni… waisBlk… long… Blockraw "WAIS III:… "int… <NA> ifels… NA
## 2 EAS behavior cognition cogni… trailma… long… Tr-A1 "Other Wor… "int… <NA> ifels… NA
## 3 EAS behavior cognition cogni… trailma… long… Tr-B1 "Other Wor… "int… <NA> ifels… NA
## 4 EAS behavior cognition cogni… digitSym long… Symraw "Digit Sym… "int… <NA> ifels… NA
## 5 EAS behavior cognition cogni… recall long… TotRecall "total Rec… "int… <NA> ifels… NA
## 6 EAS behavior cognition cogni… SPN long… Spnraw "Digit Spa… "int… <NA> ifels… NA
## 7 EAS behavior cognition cogni… CAT long… CAT <NA> "int… <NA> ifels… NA
## 8 EAS behavior cognition cogni… FAS long… FAS <NA> "int… <NA> ifels… NA
## 9 EAS behavior covariates alcoh… alcohol… base… SAB262 "What was … "1=N… no ifels… NA
## 10 EAS behavior covariates alcoh… alcohol… base… SAB263 "What was … "1=N… no ifels… NA
## # ℹ 68 more rows
## # ℹ 3 more variables: maxi <dbl>, comp_rule <chr>, long_rule <chr>
old.names1 <- unique((eas_codebook %>% filter(dataset == "behavior"))$orig_itemname)
old.names2 <- unique((eas_codebook %>% filter(dataset == "neuropath"))$orig_itemname)
old.names3 <- unique((eas_codebook %>% filter(dataset == "activity"))$orig_itemname)
eas <- sprintf("%s/eas/Behavior_with_Master_Data_2021_10_25.xlsx", data_path) %>%
read_excel(., sheet = 1) %>%
mutate(year = lubridate::year(BehaviorDate)) %>%
select(SID = Id, wave = Wave, year, one_of(old.names1)) %>%
full_join(
sprintf("%s/eas/Neuropath_and_Behavior_Data_2021_09_26 (3).xlsx", data_path) %>%
read_excel(.) %>%
select(SID = `Clin#`, year = DOD, wave = Wave, one_of(old.names2)) %>%
mutate(comb_dx = ifelse(c("VaD", "AD", "AGD") %in% Dx1 | c("VaD", "AD", "AGD") %in% Dx2 |
c("VaD", "AD", "AGD") %in% Dx3 | grepl("VaD", `OTHER Dx`) |
grepl("AD", `OTHER Dx`) | grepl("AGD", `OTHER Dx`), 1, 0)
, year = lubridate::year(year)) %>%
select(-(Dx1:`OTHER Dx`))
) %>%
full_join(
sprintf("%s/eas/Northwestern_supp_Physical_Activities_2021_10_25-1.xlsx", data_path) %>%
read_excel(.) %>%
select(SID = Id, wave = Wave, one_of(old.names3))
) %>%
mutate(Gender = ifelse(Gender == "F", 1, ifelse(Gender == "M", 0, NA)))
eas_waves <- eas %>% select(SID, wave, year) %>% distinct()
eas_long <- eas %>%
pivot_longer(values_to = "value"
, names_to = "orig_itemname"
, cols = c(-SID, -wave, -year))
2.6.2 Recode & Reverse-Scoring
eas_recode <- eas_codebook %>%
filter(category %in% c("covariates", "outcome", "cognition", "pers") & !is.na(orig_itemname)) %>%
select(category, name, itemname, orig_itemname, reverse_code:long_rule) %>%
right_join(eas_long)
# recode
recode_fun <- function(rule, y, year){
print(rule)
x <- y$value
if(!is.na(rule)){y$value <- eval(parse(text = rule))}
return(y)
}
eas_recode <- eas_recode %>%
group_by(recode, year) %>%
nest() %>%
ungroup() %>%
mutate(data = pmap(list(recode, data, year), recode_fun)) %>%
unnest(data)
# reverse code
eas_recode <- eas_recode %>%
mutate(value = ifelse(reverse_code == "no" | is.na(reverse_code), value,
reverse.code(-1, value, mini = mini, maxi = maxi))) %>%
group_by(category, name) %>%
nest() %>%
ungroup()
fun_call <- function(x, rule){
switch(rule,
average = mean(x, na.rm = T),
mode = Mode(x)[1],
sum = sum(x, na.rm = T),
skip = unique(x)[1],
select = unique(x)[1],
max = max(x, na.rm = T),
min = min(x, na.rm = T))
}
eas_p_waves <- eas_recode %>%
filter(category == "pers") %>%
unnest(data) %>%
filter(!is.na(value)) %>%
group_by(SID) %>%
filter(year == min(year)) %>%
ungroup() %>%
select(SID, p_year = year) %>%
distinct()
2.6.3 Covariates
# compositing within years
year_comp_fun <- function(df, rule){
df %>%
# group by person and item (collapse across age)
group_by(SID, name, year, p_year, long_rule) %>%
summarize(value = fun_call(value, rule)) %>%
ungroup() %>%
mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
}
eas_cov <- eas_recode %>%
filter(category == "covariates") %>%
unnest(data) %>%
left_join(eas_p_waves) %>%
filter(year <= p_year) %>%
mutate(comp_rule = ifelse(is.na(comp_rule) | comp_rule == "none", "skip", comp_rule)) %>%
group_by(comp_rule) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(data, comp_rule, year_comp_fun)) %>%
unnest(data) %>%
select(-comp_rule)
comp_fun <- function(d, rule){
d %>%
group_by(SID, name) %>%
summarize(value = fun_call(value, rule)) %>%
ungroup()
}
eas_cov <- eas_cov %>%
filter(!is.na(value) & !is.na(name)) %>%
group_by(long_rule) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(data, long_rule, comp_fun)) %>%
unnest(data) %>%
select(-long_rule) %>%
mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value)) %>%
pivot_wider(names_from = name, values_from = value, values_fn = list(value = max)) %>%
right_join(eas_p_waves)
## # A tibble: 799 × 17
## SID alcohol cancer diabetes heartProb hypertension married race smokes stroke exercise BMI age
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 6075 0 0 0 0 0 1 0 0 0 NA 21.1 73.4
## 2 8024 1 0 0 0 0 1 0 1 0 NA 28.8 94.8
## 3 8027 1 0 0 0 0 1 0 0 0 NA 20.4 93.0
## 4 8265 1 1 1 0 1 1 0 1 0 NA 28.5 77.3
## 5 8291 0 0 0 0 0 1 0 0 0 NA 24 81.7
## 6 8296 1 0 0 0 0 1 0 0 0 NA 23 88.9
## 7 8310 1 0 1 0 0 1 0 1 0 NA 29.4 86.0
## 8 8313 1 0 0 0 0 1 1 1 0 NA NA 87.3
## 9 8375 1 1 0 0 NA 1 0 0 0 NA 26.3 87.2
## 10 8512 1 0 0 0 1 1 1 1 0 NA NA 78.9
## # ℹ 789 more rows
## # ℹ 4 more variables: education <dbl>, SRhealth <dbl>, gender <dbl>, p_year <dbl>
2.6.4 Personality Variables
# bring in codebook info
eas_pers <- eas_recode %>%
filter(category == "pers") %>%
unnest(data) %>%
left_join(eas_p_waves) %>%
filter(year == p_year)
# create composites
eas_pers <- eas_pers %>%
group_by(SID, name, p_year) %>%
summarize(value = mean(value, na.rm = T)) %>%
ungroup()
## # A tibble: 3,995 × 4
## SID name p_year value
## <dbl> <chr> <dbl> <dbl>
## 1 6075 A 2016 3.6
## 2 6075 C 2016 3.4
## 3 6075 E 2016 2.5
## 4 6075 N 2016 2.8
## 5 6075 O 2016 3.1
## 6 8024 A 2006 3.7
## 7 8024 C 2006 4.1
## 8 8024 E 2006 2.7
## 9 8024 N 2006 2.7
## 10 8024 O 2006 2.9
## # ℹ 3,985 more rows
2.6.5 Outcome Variables
eas_out <- eas_recode %>%
filter(category == "outcome") %>%
unnest(data)
eas_out_waves <- eas_out %>%
filter(!is.na(value)) %>%
select(year, SID, name) %>%
group_by(SID, name) %>%
summarize(o_year = max(year)) %>%
ungroup()
# compositing within years
year_comp_fun <- function(df, rule){
df %>%
# group by person and item (collapse across age)
group_by(SID, name, year, long_rule, p_year) %>%
summarize(value = fun_call(value, rule)) %>%
ungroup() %>%
mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
}
eas_out <- eas_out %>%
left_join(eas_p_waves) %>%
filter(!is.na(value)) %>%
mutate(comp_rule = ifelse(is.na(comp_rule) | comp_rule == "none", "skip", comp_rule)) %>%
group_by(comp_rule) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(data, comp_rule, year_comp_fun)) %>%
unnest(data)
eas_out <- eas_out %>%
filter(name == "dementia") %>%
mutate(group = ifelse(year <= p_year, "past", "future")) %>%
group_by(SID, name, group, p_year) %>%
summarize(value = max(value, na.rm = T)) %>%
ungroup() %>%
mutate(value = ifelse(is.infinite(value), NA, value)) %>%
pivot_wider(names_from = group, values_from = value) %>%
group_by(SID, name) %>%
mutate(value = ifelse(is.na(past) | (past == 0 & !is.na(future)), future,
ifelse(past == 0 & is.na(future), past,
ifelse(past == 1, NA, NA)))) %>%
ungroup() %>%
full_join(eas_out %>%
filter(name != "dementia") %>%
group_by(SID, name, long_rule, p_year) %>%
summarize(value = max(value, na.rm = T)) %>%
ungroup() %>%
select(-long_rule) %>%
mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
) %>%
left_join(eas_out_waves) %>%
filter(!is.na(value))
## # A tibble: 878 × 8
## SID name p_year past future `NA` value o_year
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 8024 dementia 2006 0 NA NA 0 2006
## 2 8027 dementia 2009 0 1 NA 1 2017
## 3 8265 dementia 2006 0 0 NA 0 2011
## 4 8291 dementia 2009 0 0 NA 0 2012
## 5 8296 dementia 2006 0 NA NA 0 2006
## 6 8310 dementia 2007 0 NA NA 0 2007
## 7 8313 dementia 2007 0 NA NA 0 2007
## 8 8375 dementia 2006 0 0 NA 0 2010
## 9 8512 dementia 2010 0 0 NA 0 2013
## 10 8518 dementia 2011 0 NA NA 0 2011
## # ℹ 868 more rows
2.6.6 Cognition Variables
# composite within years
eas_cog <- eas_recode %>%
filter(category == "cognition") %>%
select(-category) %>%
unnest(data) %>%
left_join(eas_p_waves) %>%
filter(!is.na(p_year) & !is.na(value)) %>%
group_by(name, itemname, year) %>%
mutate(value = pomp(value)) %>%
group_by(SID, name, itemname) %>%
filter(year %in% (p_year - 1):(p_year + 1)) %>%
group_by(name, SID) %>%
summarize(value = mean(value, na.rm = T)) %>%
ungroup() %>%
pivot_wider(names_from = "name"
, values_from = "value")
## # A tibble: 799 × 2
## SID cognition
## <dbl> <dbl>
## 1 6075 5.78
## 2 8024 4.83
## 3 8027 4.55
## 4 8265 5.60
## 5 8291 4.64
## 6 8296 5.20
## 7 8310 4.16
## 8 8313 4.37
## 9 8375 6.10
## 10 8512 3.62
## # ℹ 789 more rows
2.6.7 Combine Data
eas_combined <- eas_pers %>%
select(SID, Trait = name, p_value = value, p_year) %>%
full_join(
eas_out %>%
select(SID, o_year, Outcome = name, o_value = value)
) %>%
filter(!is.na(o_value) & !is.na(p_value)) %>%
distinct() %>%
left_join(
eas_out %>%
filter(name == "dementia") %>%
mutate(value = ifelse(rowSums(cbind(future, past), na.rm = T) >= 1, 1, 0)) %>%
select(-future, -past) %>%
pivot_wider(names_from = "name", values_from = "value")
) %>%
left_join(eas_cov) %>%
left_join(eas_cog)
## # A tibble: 4,320 × 25
## SID Trait p_value p_year o_year Outcome o_value `NA` dementia alcohol cancer diabetes heartProb
## <dbl> <chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 8024 A 3.7 2006 2006 dementia 0 NA 0 1 0 0 0
## 2 8024 C 4.1 2006 2006 dementia 0 NA 0 1 0 0 0
## 3 8024 E 2.7 2006 2006 dementia 0 NA 0 1 0 0 0
## 4 8024 N 2.7 2006 2006 dementia 0 NA 0 1 0 0 0
## 5 8024 O 2.9 2006 2006 dementia 0 NA 0 1 0 0 0
## 6 8027 A 3.2 2009 2017 dementia 1 NA 1 1 0 0 0
## 7 8027 A 3.2 2009 2017 braak 5.5 NA 1 1 0 0 0
## 8 8027 A 3.2 2009 2017 hipSclerosis 0 NA 1 1 0 0 0
## 9 8027 A 3.2 2009 2017 lewyBodyDis 1 NA 1 1 0 0 0
## 10 8027 C 3.4 2009 2017 dementia 1 NA 1 1 0 0 0
## # ℹ 4,310 more rows
## # ℹ 12 more variables: hypertension <dbl>, married <dbl>, race <dbl>, smokes <dbl>, stroke <dbl>,
## # exercise <dbl>, BMI <dbl>, age <dbl>, education <dbl>, SRhealth <dbl>, gender <dbl>, cognition <dbl>
2.7 German Socioeconomic Panel Study (GSOEP)
The German Socioeconomic Panel Study (GSOEP; Socio-Economic Panel, 2017) is an ongoing longitudinal study of German collected by the German Institute of Economic Research (DIW Berlin). The data are freely available at https://www.diw.de/soep by application.
Data have been collected annually since 1984 (the latest data release includes data up to 2017). Participants have been recruited from more than 11,000 households, which are nationally representative of private German households. 20,000 individuals are sampled each year, on average. It is critical to note that the GSOEP samples households, not individuals, and the households consist of individuals living in both the “old” and “new” federal states (the former West and East Germany), foreigners, and recent immigrants to Germany.
Sample size varies by year, ranging from approximately 10,000 (1989) to 31,000 (2013). This provides 99% power to detect a zero-order correlation effect size of ~.06, two-tailed at alpha < .05.
2.7.1 Load Data
gsoep_read_fun <- function(Year, WL){
old.names <- (gsoep_codebook %>% filter(year == Year | category == "proc"))$orig_itemname
p <- sprintf("%s/gsoep/%sp.sav", data_path, WL) %>% haven::read_sav(.) %>%
full_join(sprintf("%s/gsoep/%skind.sav", data_path, WL) %>% haven::read_sav(.)) %>%
full_join(sprintf("%s/gsoep/%spequiv.sav", data_path, WL) %>% haven::read_sav(.)) %>%
full_join(sprintf("%s/gsoep/%spgen.sav", data_path, WL) %>% haven::read_sav(.)) %>%
full_join(sprintf("%s/gsoep/%spkal.sav", data_path, WL) %>% haven::read_sav(.)) %>%
select(one_of(old.names)) %>%
gather(key = orig_itemname, value = value, -persnr, -hhnr, na.rm = T)
sprintf("%s/gsoep/%shbrutto.sav", data_path, WL) %>% haven::read_sav(.) %>%
full_join(sprintf("%s/gsoep/%sh.sav", data_path, WL) %>% haven::read_sav(.)) %>%
select(one_of(old.names)) %>%
gather(key = orig_itemname, value = value, -hhnr, na.rm = T) %>%
full_join(p %>% select(persnr, hhnr) %>% distinct()) %>%
full_join(p)
}
gsoep_codebook <- (codebook %>% filter(study == "GSOEP"))$codebook[[1]] %>%
mutate(orig_itemname = str_to_lower(orig_itemname))
gsoep_codebook
## # A tibble: 594 × 17
## study dataset category name itemname wave waveletter year orig_itemname description scale reverse_code
## <chr> <chr> <chr> <chr> <chr> <dbl> <chr> <dbl> <chr> <chr> <chr> <chr>
## 1 gosep cognit cogniti… cogn… animals1 23 w 2006 f99z30r Symbol Dig… inte… no
## 2 gosep cognit cogniti… cogn… animals1 29 bc 2012 f99z30r Symbol Dig… inte… no
## 3 gosep cognit cogniti… cogn… animals2 23 w 2006 f99z60r Symbol Dig… inte… no
## 4 gosep cognit cogniti… cogn… animals2 29 bc 2012 f99z60r Symbol Dig… inte… no
## 5 gosep cognit cogniti… cogn… animals3 23 w 2006 f99z90r Symbol Dig… inte… no
## 6 gosep cognit cogniti… cogn… animals3 29 bc 2012 f99z90r Symbol Dig… inte… no
## 7 gosep cognit cogniti… cogn… symDig1 23 w 2006 f96t30g Symbol Dig… inte… no
## 8 gosep cognit cogniti… cogn… symDig1 29 bc 2012 f96t30g Symbol Dig… inte… no
## 9 gosep cognit cogniti… cogn… symDig2 23 w 2006 f96t60g Symbol Dig… inte… no
## 10 gosep cognit cogniti… cogn… symDig2 29 bc 2012 f96t60g Symbol Dig… inte… no
## # ℹ 584 more rows
## # ℹ 5 more variables: recode <chr>, mini <dbl>, maxi <dbl>, comp_rule <chr>, long_rule <chr>
gsoep <- gsoep_codebook %>%
select(wave, waveletter, year) %>%
filter(complete.cases(.)) %>%
distinct() %>%
arrange(year) %>%
filter(year != "2018") %>%
mutate(data = map2(year, waveletter, gsoep_read_fun))
old.names <- unique(gsoep_codebook$orig_itemname)
gsoep_cog <- sprintf("%s/gsoep/cognit.sav", data_path) %>% haven::read_sav(.) %>%
select(persnr, hhnr, one_of(old.names)) %>%
haven::zap_labels(.) %>%
select(-hhnr) %>%
gather(key = orig_itemname, value = value, -persnr, na.rm = T)
gsoep_long <- gsoep %>% unnest(data) %>%
select(-hhnr, -wave, -waveletter, -year) %>%
# filter(persnr %in% gsoep_cog_subs) %>%
full_join(gsoep_cog) %>%
rename(SID = persnr)
save(gsoep, file = sprintf("%s/data/clean/gsoep_raw.RData", load_path))
rm(gsoep)
2.7.2 Recoding & Reverse Scoring
gsoep_waves <- p_waves %>% filter(Study == "GSOEP") %>% select(Used) %>% distinct()
# join data with recoding info
gsoep_recode <- gsoep_codebook %>%
filter(category %in% c("pers", "outcome", "covariates", "cognition")) %>%
select(category, name, itemname, wave, year, orig_itemname, reverse_code:long_rule) %>%
group_by(category, name) %>%
nest() %>%
ungroup() %>%
mutate(data = map(data, ~(.) %>% left_join(gsoep_long)))
# recode
recode_fun <- function(rule, y, year){
x <- y$value
if(!is.na(rule)){y$value <- eval(parse(text = rule))}
return(y)
}
gsoep_recode <- gsoep_recode %>%
mutate(data = map(data, ~(.) %>%
group_by(recode, year) %>%
nest() %>%
ungroup() %>%
mutate(data = pmap(list(recode, data, year), recode_fun)) %>%
unnest(data) %>%
mutate(value = ifelse(value < 0 | is.nan(value) | is.infinite(value), NA, value))))
# reverse code
gsoep_recode <- gsoep_recode %>%
mutate(data = map(data, ~(.) %>%
mutate(value = ifelse(reverse_code == "no" | is.na(reverse_code), value,
reverse.code(-1, value, mini = mini, maxi = maxi)))))
fun_call <- function(x, rule){
switch(rule,
average = mean(x, na.rm = T),
mode = Mode(x)[1],
sum = sum(x, na.rm = T),
skip = unique(x)[1],
select = unique(x)[1],
max = max(x, na.rm = T),
min = min(x, na.rm = T))
}
2.7.3 Covariates
yrBrth <- gsoep_recode %>%
filter(name == "yearBrth") %>%
unnest(data) %>%
group_by(SID) %>%
summarize(yearBrth = max(value, na.rm = T)) %>%
ungroup() %>%
mutate(yearBrth = ifelse(is.infinite(yearBrth), NA, yearBrth),
yearBrth = ifelse(yearBrth < 1000, yearBrth + 1000, yearBrth))
# compositing within years
year_comp_fun <- function(df, rule, name){
print(paste(rule, name))
df %>%
group_by(SID, yearBrth, year, long_rule) %>% # group by person and item (collapse across age)
summarize(value = fun_call(value, rule)) %>%
ungroup()
}
gsoep_waves <- p_waves %>% filter(Study == "GSOEP") %>% select(Used) %>% distinct()
gsoep_cov <- gsoep_recode %>%
filter(category == "covariates") %>%
mutate(data = ifelse(name == "alcohol", map(data, ~(.) %>% mutate(year = ifelse(year == 2006, 2005, year))), data)) %>%
mutate(data = map(data, ~(.) %>%
left_join(yrBrth) %>%
filter(year <= max(gsoep_waves$Used) & !is.na(value)) %>%
group_by(comp_rule) %>%
nest() %>%
ungroup() %>%
mutate(comp_rule = ifelse(is.na(comp_rule), "skip", comp_rule)))) %>%
filter(map(data, nrow) > 0) %>%
unnest(data) %>%
mutate(data = pmap(list(data, comp_rule, name), year_comp_fun)) %>%
unnest(data)
comp_fun <- function(rule, p_year){
gsoep_cov %>%
filter(year <= p_year & long_rule == rule) %>%
group_by(SID, yearBrth, name) %>%
summarize(value = fun_call(value, rule)) %>%
ungroup()
}
gsoep_cov <- crossing(
p_year = gsoep_waves$Used,
long_rule = unique(gsoep_cov$long_rule)
) %>%
mutate(data = map2(long_rule, p_year, comp_fun)) %>%
unnest(data) %>%
mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value)) %>%
select(-long_rule, -yearBrth) %>%
pivot_wider(names_from = name, values_from = value) %>%
mutate(yearBrth = ifelse(yearBrth < 1000, yearBrth + 1000, yearBrth))
## # A tibble: 114,106 × 16
## p_year SID exercise height weight cancer diabetes married stroke alcohol education smokes gender
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2005 101 1.67 NA NA 0 0 1 0 NA NA NA 0
## 2 2005 102 1 NA NA 0 0 1 0 NA NA NA 1
## 3 2005 103 2.67 NA NA 0 0 0 0 NA NA NA 0
## 4 2005 201 2 157 61 0 0 0 0 1 11 0 1
## 5 2005 202 3 NA NA 0 0 0 0 NA NA NA 1
## 6 2005 203 4 177 77 0 0 0 0 1 NA 0 0
## 7 2005 301 1 NA NA 0 0 1 0 NA NA NA 0
## 8 2005 302 1 NA NA 0 0 1 0 NA NA NA 1
## 9 2005 401 1 NA NA 0 0 0 0 NA NA NA 0
## 10 2005 501 1 NA NA 0 0 0 0 NA NA NA 1
## # ℹ 114,096 more rows
## # ℹ 3 more variables: yearBrth <dbl>, age <dbl>, SRhealth <dbl>
2.7.4 Personality Variables
gsoep_pers <- gsoep_recode %>%
filter(category == "pers") %>%
unnest(data) %>%
filter(year %in% gsoep_waves$Used) %>%
distinct()
# alpha's
gsoep_alpha <- gsoep_pers %>%
select(name, itemname, year, SID, value) %>%
group_by(name, year) %>%
nest() %>%
mutate(data = map(data, ~(.) %>% pivot_wider(names_from = itemname, values_from = value)),
alpha = map(data, possibly(~psych::alpha((.) %>% select(-persnr)), NA_real_)))
comp_fun <- function(df, rule){
df %>%
group_by(SID) %>%
summarize(value = fun_call(value, rule)) %>%
ungroup()
}
# create composites
gsoep_pers <- gsoep_pers %>%
group_by(name, comp_rule, year) %>%
nest() %>%
ungroup() %>%
mutate(data = map2(data, comp_rule, comp_fun)) %>%
unnest(data) %>%
select(-comp_rule) %>%
mutate(value = ifelse(is.infinite(value) | is.nan(value), NA, value))
## # A tibble: 191,511 × 4
## name year SID value
## <chr> <dbl> <dbl> <dbl>
## 1 A 2005 201 7
## 2 A 2005 203 4.67
## 3 A 2005 602 4.33
## 4 A 2005 901 4.67
## 5 A 2005 1202 7
## 6 A 2005 1501 4.67
## 7 A 2005 1601 4
## 8 A 2005 1602 6.67
## 9 A 2005 1603 4.33
## 10 A 2005 1701 7
## # ℹ 191,501 more rows
2.7.5 Outcome Variables
gsoep_pers_subs <- unique(gsoep_pers$SID)
gsoep_waves <- p_waves %>% filter(Study == "GSOEP") %>% select(Used) %>% distinct()
# composite within years
# compositing within years
year_comp_fun <- function(df, rule){
df %>%
group_by(SID, name, year) %>% # group by person and item (collapse across age)
summarize(value = fun_call(value, rule)) %>%
ungroup()
}
gsoep_out <- gsoep_recode %>%
filter(category == "outcome") %>%
unnest(data) %>%
# filter(year <= max(gsoep_waves$Used)) %>%
group_by(comp_rule) %>%
nest() %>%
ungroup() %>%
mutate(comp_rule = ifelse(comp_rule == "select", "skip", comp_rule),
data = map2(data, comp_rule, year_comp_fun)) %>%
unnest(data)
# composite across years
comp_fun <- function(p_year){
gsoep_out %>%
group_by(SID, name) %>%
summarize(value = max(value, na.rm = T)) %>%
ungroup() %>%
mutate(value = ifelse(is.infinite(value), NA, value))
}
gsoep_out <- tibble(p_year = gsoep_waves$Used) %>%
mutate(data = map(p_year, comp_fun)) %>%
unnest(data) %>%
mutate(o_year = 2017)
## # A tibble: 109,214 × 5
## p_year SID name value o_year
## <dbl> <dbl> <chr> <dbl> <dbl>
## 1 2005 602 dementia 0 2017
## 2 2005 604 dementia 0 2017
## 3 2005 901 dementia 0 2017
## 4 2005 1501 dementia 0 2017
## 5 2005 1601 dementia 0 2017
## 6 2005 1602 dementia 0 2017
## 7 2005 2301 dementia 0 2017
## 8 2005 2302 dementia 0 2017
## 9 2005 4701 dementia 0 2017
## 10 2005 4901 dementia 0 2017
## # ℹ 109,204 more rows
2.7.6 Cognition Variables
gsoep_cog <- gsoep_recode %>%
filter(category == "cognition") %>%
unnest(data) %>%
filter(!is.na(value))
gsoep_cog_waves <- gsoep_cog %>%
select(itemname, SID, year) %>%
group_by(SID, itemname) %>%
summarize(o_year = max(year)) %>%
ungroup()
gsoep_cog <- gsoep_cog %>%
right_join(gsoep_cog_waves) %>%
filter(year == o_year) %>%
filter(!is.na(value)) %>%
group_by(name, itemname, year) %>%
mutate(value = pomp(value)) %>%
group_by(name, o_year, SID) %>%
summarize(value = mean(value)) %>%
ungroup() %>%
pivot_wider(names_from = "name", values_from = "value")
## # A tibble: 22,445 × 3
## o_year SID cognition
## <dbl> <dbl> <dbl>
## 1 2012 201 0
## 2 2012 1501 3.81
## 3 2012 5601 2.06
## 4 2012 6002 2.46
## 5 2012 7302 1.44
## 6 2012 8603 3.75
## 7 2012 9801 3.88
## 8 2012 11301 3.45
## 9 2012 12303 2.66
## 10 2012 13401 1.98
## # ℹ 22,435 more rows
2.7.7 Combine Data
gsoep_combined <- gsoep_pers %>%
rename(Trait = name, p_value = value, p_year = year) %>%
full_join(gsoep_out %>% select(p_year, SID, Outcome = name, o_year, o_value = value)) %>%
full_join(gsoep_cov) %>%
left_join(
gsoep_out %>%
filter(name == "dementia") %>%
pivot_wider(names_from = "name", values_from = "value")
) %>%
full_join(gsoep_cog) %>%
filter(!is.na(p_value) & !is.na(o_value)) %>%
mutate(age = p_year - yearBrth
, BMI = weight/((height/100)^2))
## # A tibble: 136,465 × 24
## Trait p_year SID p_value Outcome o_year o_value exercise height weight cancer diabetes married stroke
## <chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 A 2005 602 4.33 dementia 2017 0 2.8 177 87 0 0 1 0
## 2 A 2005 901 4.67 dementia 2017 0 2.8 158 47 0 0 0 0
## 3 A 2005 1501 4.67 dementia 2017 0 1 168. 84 0 0 0 0
## 4 A 2005 1601 4 dementia 2017 0 2.67 174. 75 0 0 1 0
## 5 A 2005 1602 6.67 dementia 2017 0 2 160. 48 0 0 1 0
## 6 A 2005 2301 5 dementia 2017 0 1.8 180 75 0 0 1 0
## 7 A 2005 2302 3.33 dementia 2017 0 3 157 46.5 0 0 1 0
## 8 A 2005 4701 6 dementia 2017 0 2.2 168 70 0 0 0 0
## 9 A 2005 4901 5.33 dementia 2017 0 1 162. 66.5 0 0 0 0
## 10 A 2005 5201 6.33 dementia 2017 0 1.07 183 83 0 0 1 0
## # ℹ 136,455 more rows
## # ℹ 10 more variables: alcohol <dbl>, education <dbl>, smokes <dbl>, gender <dbl>, yearBrth <dbl>,
## # age <dbl>, SRhealth <dbl>, dementia <dbl>, cognition <dbl>, BMI <dbl>