Chapter 5 Specification Curve Analysis

5.1 Load Data

5.2 SCA Data Setup

5.3 Functions

5.3.4 Full Models and Workspace Setup

sca_setup_fun <- function(trait, outcome, fixed = "p_value"){
  terms <- sca_term_fun(trait, outcome)
  terms <- if(trait == "OP" & outcome == "mvInPrtnr"){terms[terms != "alcohol"]}
  f <- sca_formula_fun(terms)
  # wd <- "~/selection"
  load(sprintf("%s/data/sca/sca_%s_%s.RData", wd, trait, outcome))
  df1 <- df1 
  std <- df1 %>%
    group_by(study, o_value) %>%
    tally() %>%
    full_join(crossing(study = unique(.$study), o_value = factor(c(0,1))))
  std <- unique(std$study[std$n < 50 | is.na(std$n)])
  df1 <- df1 %>% 
    filter(!study %in% std) %>% 
    mutate(o_value = as.numeric(as.character(o_value)),
           n = row_number()) 
  df2 <- df1 %>%
    select(study, SID, p_value, o_value, n, one_of(terms)) %>% 
    filter(complete.cases(.)) #%>%
    # mutate_if(is.factor, factor)
  
  
  start <- Sys.time()
  # optCtrl=list(maxfun=2e5))
  full_mod <- femlm(formula(f), data = df2, family = "logit")
  # full_mod <- glmer(formula(f), data = df1, family = "binomial", na.action = "na.omit", control = cntrl)
  print(Sys.time() - start)
  gmCall <- get_call(full_mod)
  gmEnv <- parent.frame()
  gmFormulaEnv <- environment(as.formula(formula(full_mod), env = gmEnv))
  
  # set up terms
  
  allTerms <- allTerms0 <- getAllTerms.fixest(full_mod, terms)
  deps <- attr(allTerms0, "deps")
  fixed <- union(fixed, rownames(deps)[rowSums(deps, na.rm = TRUE) == 
                                         ncol(deps)])
  fixed <- c(fixed, allTerms[allTerms %in% "(Intercept)"])
  nFixed <- length(fixed)
  gmCoefNames <- gmCoefNames0 <- names(coef(full_mod))
  
  # this chunk makes sure that interactions include main effects
  varsOrder <- order(allTerms %in% fixed)
  termsOrder <- order(gmCoefNames %in% fixed)
  allTerms <- allTerms[varsOrder]
  gmCoefNames <- gmCoefNames[termsOrder]
  di <- match(allTerms, rownames(deps))
  deps <- deps[di, di, drop = FALSE]
  
  # setup output
  nVars <- length(allTerms)
  nTerms <- length(gmCoefNames)
  # rvNcol <- nVars + 3L + 2
  # rtNcol <- nTerms + 3L + 2
  # resultChunkSize <- 1000000L
  # rval <- matrix(NA_real_, ncol = rtNcol, nrow = resultChunkSize)
  
  nFixed <- 2  # p_value & intercept
  nVariants <- 1L
  nov <- as.integer(nVars - nFixed)
  ncomb <- (2L^nov) 
  comb.sfx <- rep(TRUE, nFixed)
  comb.seq <- if (nov != 0L) {seq_len(nov)}
  # ord <- integer(resultChunkSize)
  
  pval_out <- sapply(1:500, function(x){sample_fun(x, df1)}) # change back to 500
  # perm_res <- array(NA_real_, c(10, 3, resultChunkSize)) # change back to 500
  # no_cores <- detectCores() - 1
  
  k <- 0L
  save(list = ls(all.names = TRUE), 
       file = sprintf("%s/results/sca_workspace/sca_workspace_%s_%s.RData", wd, outcome, trait))
  return(ncomb)
}

5.3.5 Specifications and Permutations for Cluster

sca_run_fun <- function(trait, outcome, min, max){
  load(sprintf("/scratch/edbeck/sca_workspace/sca_workspace_%s_%s.RData", wd, outcome, trait))
  k <- 0L
  seq_comb <- seq(min, max, 1)
  # seq_comb <- seq_comb[1:2] # remove after testing
  
  res <- lapply(seq_comb, function(iComb){
    varComb <- iComb%%nVariants
    jComb <- (iComb - varComb)%/%nVariants
    if (varComb == 0L) {
      isok <- TRUE
      comb <- c(as.logical(intToBits(jComb)[comb.seq]), 
                comb.sfx)
      nvar <- sum(comb) - 1
      if (!formula_margin_check(comb, deps)) {
        isok <- FALSE
        res <- NA
      }
      new_terms <- allTerms[comb]
      if(sum(grepl(":", new_terms)) > 1){
        isok <- FALSE
        res <- NA
      }
    }
    if (!isok) {
      res <- NA
    } else{
      f1 <- paste("o_value ~ ", # outcome
                  paste(new_terms[new_terms != "(Intercept)"], collapse = " + "), # fixed effects
                  " | study", collapse = "")
      std <- df1 %>%
        group_by(study, o_value) %>%
        tally() %>%
        full_join(crossing(study = unique(.$study), o_value = c(0,1)))
      std <- unique(std$study[std$n < 50 | is.na(std$n)])
      df3 <- df1 %>%
        filter(!study %in% std) %>% 
        select(study:o_value, n, one_of(new_terms)) %>%
        mutate(o_value = as.numeric(as.character(o_value))) %>%
        filter(complete.cases(.)) 
      #   filter(study %in% std) 
      fit <- femlm(formula(f1), data = df3, family = "logit")
      print("made it to creating fit")
      mci1 <- confint(fit, parm = "p_value")
      mcoef1 <- matchCoef(fit, all.terms = gmCoefNames)
      ll1    <- logLik(fit)
      bic1 <- BIC(fit)
      nobs1  <- nobs(fit)
      psr1 <- fit$pseudo_r2
      row1   <- c(mcoef1[gmCoefNames], ci.lower = mci1[1,1], ci.upper = mci1[1,2],
                  bic = bic1, psr2 = psr1, ll = ll1)
      perm1 <- t(sapply(1:500, function(ii) perm_fun(ii, df3, f1, as.numeric(pval_out[,ii]))))
      print(Sys.time() - start)
      
      k <- k + 1L
      print(sprintf("%s %s %s", iComb, f1, k))
      res <- list(rval = row1, perm_res = perm1, ord = iComb)
      rm(list = c("row1", "perm1", "fit", "mci1", "mcoef1", "ll1", "nobs1"))
      gc()
    }
    return(res)
  })
  
  res <- res[!is.na(res)]
  
  rval <- ldply(res, `[[`, 1)
  perm_res <- llply(res, `[[`, 2)
  ord <- laply(res, `[[`, 3)
  perm_res <- abind::abind(perm_res, along = 3)
  
  # colnames(rval) <- c(gmCoefNames, "ci.lower", "ci.upper", "df", "ll")
  row.names(rval) <- ord
  # rval[, seq_along(gmCoefNames)] <- rval[, v <- order(termsOrder)]
  save(rval, file = sprintf("/scratch/edbeck/raw/raw_%s_%s_%s_%s.RData", trait, outcome, min, max))
  save(perm_res, file = sprintf("/scratch/edbeck/perm/perm_%s_%s_%s_%s.RData", trait, outcome, min, max))
}

5.5 Re-Compiling

5.5.3 Inferential-Based Tests

Simonsohn et al. define 3 inferential-based tests: 1. the median overall point estimate within each specification curve.
2. the percentage of specifications that are of the dominant sign.
3. the percentage of specifications with the dominant sign that are also significant.

# median permuted personality-outcome association for each model
perm_med_fun <- function(m){
  apply(m[, 1, ], 1, function(x){ # m[, 1, ] = p_value column for each specification with length = 500 perm
    median(x, na.rm = T)
  })
}

# number of (out of 500) dominant permuted sign of personality-outcome associations for each  model  
perm_dom_sign_fun <- function(m){
  sapply(1:500, function(i){
    x <- m[i, 1, ]
    signs <- sign(x)
    dom <- if(sum(signs == 1, na.rm = T) > sum(signs == -1, na.rm = T)){1} else {-1}
    num_dom <- sum(signs == dom, na.rm = T)
    return(c("dom_sign" = dom, "num_dom" = num_dom))
    # return(dom)
  }, simplify = T) %>% t()
}

# number of (out of 500) dominant and significant permuted sign of personality-outcome associations for each  model  
perm_sig_fun <- function(m, dom){
  apply(m, 1, function(x){
    sum(sign(x[2,]) == sign(x[3,]) & sign(x[1,]) == dom, na.rm = T)
  })
}

# combine permuted results across different models 
# previously split up to run smaller batches of models
combine_fun <- function(d1){
  m<-abind::abind(d1, along = 3)
  perm_med <- perm_med_fun(m)
  perm_sign <- perm_dom_sign_fun(m)
  perm_num_sign <- perm_sign[,"num_dom"]; perm_sign <- perm_sign[,"dom_sign"]
  perm_sig <- perm_sig_fun(m, perm_sign["dom_sign"])
  tibble(perm_med, perm_sign, perm_num_sign, perm_sig)
}

# compute inferential permutation tests by comparing raw results to each of the permutations  
perm_inf_test_fun <- function(perm,med, sign, sig){
  tibble(med_test = sum(abs(perm$perm_med) > abs(med))/500,
         sign_test = sum(perm$perm_num_sign > sign)/500,
         sig = sum(perm$perm_sig > sig)/500)
}

5.5.4 Plotting Function

# plot the results
sca_plot_fun <- function(d1, type, trait, outcome){
  d <- round(max(abs(1-min(exp(d1$p_value))), abs(1-exp(max(d1$p_value)))), 2)
  lim <- c(1-d-(d/2.5), 1+d+(d/2.5))
  brk <- round(c(1-d-(d/5), 1, 1+d+(d/5)),2)
  lab <- str_remove(round(c(1-d-(d/5), 1, 1+d+(d/5)),2), "^0")
  ns <- diff(lim) / 100;  sg <- diff(lim) / 50
  top <- d1 %>%
    arrange(p_value) %>%
    mutate(sig = ifelse(sign(ci.lower) == sign(ci.upper), "sig", "ns"),
           specification = 1:n()) %>%
    mutate_at(vars(-(Min:Max), -n, -(bic:specification)), exp) %>%
    mutate(p_min = ifelse(sig == "sig", p_value - sg, p_value - ns),
           p_max = ifelse(sig == "sig", p_value + sg, p_value + ns)) %>%
    ggplot(aes(x = specification, y = p_value)) +
      scale_y_continuous(limits = lim, breaks = brk, labels = lab) +
      scale_color_manual(values = c("black", "red")) +
      geom_hline(aes(yintercept = 1), linetype = "dashed") +
      geom_linerange(aes(ymin = p_min, ymax = p_max, color = sig), size = .05) +
      # geom_errorbar(aes(color = sig, ymin = ci.lower, ymax = ci.upper), 
              # width = 0, size = .01, alpha = .4) +
      # geom_point(shape = "|", aes(color = sig), size = .25) +
      labs(x = "\nSpecification Number", 
             y = str_wrap(sprintf("Effect of %s on %s (OR)\n", trait, outcome), 30), 
             title = bquote(.(trait) %->% .(outcome))) + 
      theme_minimal(base_size = 11) +
      theme(legend.title = element_text(size = 10),
            legend.text = element_text(size = 9),
            axis.text = element_text(color = "black"),
            axis.line = element_line(colour = "black"),
            legend.position = "none",
            panel.grid.major = element_blank(),
            panel.grid.minor = element_blank(),
            panel.border = element_blank(),
            panel.background = element_blank(),
            plot.title = element_text(hjust = .5, face = "bold"))
  
  if(type == "perm"){
    top <- top + 
      geom_smooth(aes(y = perm_lower), color = "gray", linetype = "dotted", se = F) +
      geom_smooth(aes(y = perm_upper), color = "gray", linetype = "dotted", se = F) +
      geom_smooth(aes(y = perm_median), color = "lightblue", linetype = "dashed", se = F) 
  }
  
  bottom <- d1 %>% 
    arrange(p_value) %>%
    mutate(sig = ifelse(sign(ci.lower) == sign(ci.upper), "sig", "ns"),
           specification = 1:n(),
           # model = fct_reorder(specification, .x = p_value, .fun = min)
           ) %>%
    select(-(perm_median:perm_upper), -(bic:ll)) %>%
    gather(variable, value, -(Min:n), -(ci.lower:specification)) %>% 
    mutate(value = ifelse(!is.na(value), "|", "")) %>%
    ggplot(aes(specification, variable, color = sig)) +
      # geom_point(aes(size = sig), shape = "|") +
      geom_text(aes(label = value), size = rel(2)) +
      scale_color_manual(values = c("black", "red")) +
      # scale_size_manual(values = c(2,3))+
      labs(x = "\nSpecification Number", y = "Variables\n") + 
      theme_minimal(base_size = 11) +
      theme(legend.title = element_text(size = 10),
            legend.text = element_text(size = 9),
            axis.text = element_text(color = "black"),
            axis.line = element_line(colour = "black"),
            legend.position = "none",
            panel.grid.major = element_blank(),
            panel.grid.minor = element_blank(),
            panel.border = element_blank(),
            panel.background = element_blank())
  
  p_val = cowplot::plot_grid(top, bottom, ncol = 1, align = "v", labels = c('A', 'B'))
  # file <- sprintf("%s/results/sca/plots/%s/sca_%s_%s.pdf", res_path, type, outcome, trait)
  # ggsave(p_val, width = 12, height = 8, file = file)
  outcome <- mapvalues(outcome, outcomes$long_name, outcomes$short_name, warn_missing = F)
  trait <- mapvalues(trait, traits$long_name, traits$short_name, warn_missing = F)
  file <- sprintf("%s/results/sca/plots/png/%s_%s_%s.png", res_path, type, outcome, trait)
  ggsave(p_val, width = 12, height = 8, file = file)
}

5.5.5 Compiling Function

# Compile the results 
# written as below because of RAM issues. Everything done at once.
sca_compile_fun <- function(trait, outcome, d){
  # bring in raw data 
  raw <- d %>% 
    mutate(raw = map(files, raw_read_fun),
           Min = as.numeric(Min)) %>%
    select(-files, -type, -scrap) %>%
    arrange(Min) %>% 
    unnest(raw) %>% 
    mutate(n = 1:n())
  # get values for inferential tests
  med_size <- median(raw$p_value)
  dom_sign <- dom_sign_fun(raw$p_value)
  sig_sign <- with(raw, num_sig_fun(ci.lower, ci.upper, p_value, dom_sign["dom_sign"]))
  raw_test <- c(med_size, dom_sign, sig_sign)
  names(raw_test) <- c("med_size", "dom_sign", "num_dom", "sig_sign")
  
  # bring in permutations
  perm <- d  %>% 
    mutate(data = map(files, perm_read_fun),
           quant = map(data, sum_fun),
           Min = as.numeric(Min)) %>%
  select(-files, -type, -scrap) %>%
  arrange(Min) 
  # get values for inferential test
  perm_test <- combine_fun(perm$data)
  
  # perform inferential permutation test
  test <- perm_inf_test_fun(perm_test, med_size, dom_sign["num_dom"], sig_sign)
  
  # plotting 
  d <- perm %>% select(-data) %>% unnest(quant) %>% mutate(n = 1:n()) %>%
    full_join(raw)
  
  sca_plot_fun(d, "raw", trait, outcome)
  sca_plot_fun(d, "perm", trait, outcome)
  
  results <- list(inf_test = test, raw_test = raw_test, perm_test = perm_test)
  save(results,  file = sprintf("%s/results/sca/inf_test/%s_%s.RData", res_path, outcome, trait))
  rm(list = c("d", "test", "raw_test", "perm_test", "perm", "raw"))
  gc()
  return(results)
}

done <- tibble(
  files = sprintf("%s/results/sca/plots/png", res_path) %>% list.files()
) %>%
  separate(files, c("scrap", "Outcome", "Trait"), sep = "_") %>%
  mutate(Trait = str_remove(Trait, ".png"),
         done = "done") %>%
  group_by(Outcome,Trait) %>%
  filter(n()==2) %>%
  ungroup() %>%
  select(-scrap) %>%
  distinct()

plan(multiprocess(workers = 3L))
sca_res_nested <- tibble(
  files = sprintf("%s/results/sca/raw", wd) %>% list.files()
) %>% 
  separate(files, c("type", "Trait", "Outcome", "Min", "Max", "scrap"), remove = F) %>%
  arrange(Min) %>%
  group_by(Trait, Outcome) %>% 
  nest() %>% 
  filter(map(data, nrow) == 150) %>%
  ungroup() %>%
  filter(Trait %in% c("E", "A", "C", "N", "O")) %>%
  # full_join(done) %>%
  # filter(is.na(done)) %>%
  mutate(Trait = mapvalues(Trait, traits$short_name, traits$long_name),
         Outcome = mapvalues(Outcome, outcomes$short_name, outcomes$long_name)) %>%
  mutate(#results = pmap(list(Trait, Outcome, data), sca_compile_fun))
         results = future_pmap(list(Trait, Outcome, data)
         , sca_compile_fun
         , .progress = T
         , .options = future_options(
           globals = c("combine_fun", "dom_sign_fun", "num_sig_fun",
                       "perm_dom_sign_fun", "perm_inf_test_fun",
                       "perm_med_fun", "perm_read_fun", "wd", "res_path",
                       "perm_sig_fun", "raw_read_fun", "read_fun",
                       "sca_plot_fun", "sum_fun", "traits", "outcomes")
           , packages = c("plyr", "tidyverse", "cowplot", "abind"))))
closeAllConnections()

5.6 Results

5.6.1 Tables

## # A tibble: 14 x 4
## # Groups:   Outcome [14]
##    Outcome                  `1`   `2`   `3`
##    <chr>                  <int> <int> <int>
##  1 Child Birth                3    NA     2
##  2 Child Moves Out            5    NA    NA
##  3 Criminality                1    NA     4
##  4 Divorce                   NA    NA     5
##  5 First Job                  2     2     1
##  6 Higher Ed                  3     1     1
##  7 Major Health Event         4     1    NA
##  8 Marriage                   1     1     3
##  9 Mental Health Event       NA     1     4
## 10 Mortality                  1     3     1
## 11 Move in with a partner     2    NA     3
## 12 Retirement                 1     1     3
## 13 Unemployment              NA    NA     5
## 14 Volunteering              NA    NA     5
t_names <- c(1, rep(3, 5)); names(t_names) <- c(" ", traits$long_name)
cols_names <- paste(rep(c("med_test", "sign_test", "sig"), times = 5), 
      rep(traits$short_name, each = 3), sep = "_")

sca_res %>% 
  select(Outcome, Trait, inf_test) %>%
  unnest(inf_test) %>%
  # pivot_longer(cols = c("med_test", "sign_test", "sig")
  #              , names_to = "Test"
  #              , values_to = "val") %>%
  mutate(Trait = mapvalues(Trait, traits$long_name, traits$short_name)) %>%
  pivot_wider(names_from = "Trait"
              , values_from = c("med_test", "sign_test", "sig")) %>%
  select(Outcome, all_of(cols_names)) %>%
  # mutate(Test = mapvalues(Test, c("med_test", "sign_test", "sig"), c("Median", "Sign", "Significance"))) %>%
  mutate_at(vars(-Outcome), ~ifelse(. < .05, sprintf("<strong>%.2f</strong",.), sprintf("%.2f",.))) %>%
  kable(.
        , "html"
        , escape = F
        , col.names = c("Outcome", rep(c("Median", "Sign", "Signif"), times = 5))
        , align = c("r", rep("c", 15))
        , caption = "Results of the Permuation Based Inference Tests from Specification Curve Analyses") %>%
  kable_styling(full_width = F) %>%
  collapse_rows(1
                , valign = "top"
                , row_group_label_position = "stack") %>%
  add_header_above(t_names) %>%
  footnote("Median = Percentage of permutations in which the raw median effect size of the dominant sign was larger than the permuted median effect size; Sign = Percentage of permutations in which the total number of raw specifications of the dominant sign was greater than permuted. Signifiance = Percentage of permutations in which the total number of raw, significant specifications of the dominant sign was greater than the permuted.") %>%
  save_kable(., file = sprintf("%s/results/sca/inferential_tests.html", res_path))