Week 6: Visualizing Uncertainty

Emorie D Beck

Visualizing Uncertainty

Packages

library(RColorBrewer)
library(knitr)
library(kableExtra)
library(plyr)
library(broom)
library(modelr)
library(lme4)
library(broom.mixed)
library(tidyverse)
library(ungeviz)
library(ggdist)
library(tidybayes)
library(distributional)
library(gganimate)

Visualizing Uncertainty

  • Why is visualizing uncertainty important?
    • Point estimates are over-emphasized and interval estimates are unemphasized (or ignored)
    • Most people misperceive both (1) common uncertainty visualizations and (2) most common uncertainty metrics
    • In other words, people make errors about error
    • Probability is hard, and most aren’t taught about probability distributions (and more)

Theories of Visualizing Uncertainty

Why do people misperceive uncertainty, and how can we mitigate it?

Quick Side Note: Custom Themes

  • We have a lot to cover today, so I’m going to skip over some of the usual “how to start with the basics and make it aesthetically pleasing”
  • Instead, we’ll create a custom these that captures some of our usual additions
  • This will save us both time and text!
  • I highly recommend doing this in your own work
my_theme <- function(){
  theme_classic() + 
  theme(
    legend.position = "bottom"
    , legend.title = element_text(face = "bold", size = rel(1))
    , legend.text = element_text(face = "italic", size = rel(1))
    , axis.text = element_text(face = "bold", size = rel(1.1), color = "black")
    , axis.title = element_text(face = "bold", size = rel(1.2))
    , plot.title = element_text(face = "bold", size = rel(1.2), hjust = .5)
    , plot.subtitle = element_text(face = "italic", size = rel(1.2), hjust = .5)
    , strip.text = element_text(face = "bold", size = rel(1.1), color = "white")
    , strip.background = element_rect(fill = "black")
    )
}

Error Bars

  • First, let’s examine the usual ways that we show uncertainty around point estimates (e.g., means, model parameter estimates, etc.) using interval estimates (+/- 1 SE/D, confidence interval)
# A tibble: 480,798 × 11
    year   SID SRhealth satHealth marital chldbrth gender  yearB…¹ morta…²   job
   <dbl> <dbl>    <dbl>     <dbl>   <dbl>    <dbl> <dbl+l>   <dbl>   <dbl> <dbl>
 1  1984   901       NA        NA      NA       NA 2 [[2]…    1951       0    NA
 2  1984  1001       NA        NA      NA       NA 2 [[2]…    1913       0    NA
 3  1984  1101       NA        NA      NA       NA 2 [[2]…    1906       0    NA
 4  1984  1201       NA        NA      NA       NA 1 [[1]…    1911       0    NA
 5  1984  1202       NA        NA      NA       NA 2 [[2]…    1913       0    NA
 6  1984  1301       NA        NA      NA       NA 2 [[2]…    1943       0    NA
 7  1984  1302       NA        NA      NA       NA 1 [[1]…    1965       0    NA
 8  1984  1901       NA        NA      NA       NA 2 [[2]…    1948       0    NA
 9  1984  2001       NA        NA      NA       NA 2 [[2]…    1949       0    NA
10  1984  2002       NA        NA      NA       NA 1 [[1]…    1952       0    NA
# … with 480,788 more rows, 1 more variable: age <dbl>, and abbreviated
#   variable names ¹​yearBrth, ²​mortality

Error Bars

  • First, let’s examine the usual ways that we show uncertainty around point estimates (e.g., means, model parameter estimates, etc.) using interval estimates (+/- 1 SE/D, confidence interval)
pomp <- function(x) (x - min(x, na.rm = T))/(max(x, na.rm = T) - min(x, na.rm = T))*10
gsoep_desc <- gsoep %>%
  filter(year == 2005 & age < 30) %>%
  filter(SID %in% sample(unique(.$SID), 500)) %>%
  mutate(SRhealth = pomp(SRhealth)) %>%
  group_by(SID, age) %>%
  mutate(health = rowMeans(cbind(SRhealth, satHealth), na.rm = T)) %>%
  ungroup() %>%
  select(SID, age, health) %>%
  drop_na() %>%
  mutate(
    mean = mean(health)
    , sd = sd(health)
    , se = sd/sqrt(n())
    , ci99 = se*2.576
    , ci95 = se*1.96
    , ci80 = se*1.282
    )
gsoep_desc
# A tibble: 202 × 9
       SID   age health  mean    sd    se  ci99  ci95  ci80
     <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1   18804    28   7.75  7.59  1.99 0.140 0.362 0.275 0.180
 2   22305    19  10     7.59  1.99 0.140 0.362 0.275 0.180
 3 1145002    28   8.75  7.59  1.99 0.140 0.362 0.275 0.180
 4   29703    18   7.75  7.59  1.99 0.140 0.362 0.275 0.180
 5   35803    25   7.75  7.59  1.99 0.140 0.362 0.275 0.180
 6   67704    18   6     7.59  1.99 0.140 0.362 0.275 0.180
 7   75503    17   7.5   7.59  1.99 0.140 0.362 0.275 0.180
 8  112506    22  10     7.59  1.99 0.140 0.362 0.275 0.180
 9  125905    23   5.5   7.59  1.99 0.140 0.362 0.275 0.180
10  133004    23   5.5   7.59  1.99 0.140 0.362 0.275 0.180
# … with 192 more rows

Error Bars

  • First, let’s examine the usual ways that we show uncertainty around point estimates (e.g., means, model parameter estimates, etc.) using interval estimates (+/- 1 SE/D, confidence interval)
  • But we need to reshape the data to allow us to actually plot them:
gsoep_desc <- gsoep_desc %>% 
  select(mean:ci80, health, SID) %>%
  pivot_longer(
    cols = c(-mean, -SID)
    , names_to = "measure"
    , values_to = "value"
    ) %>%
  mutate(SID = ifelse(measure == "health" | row_number() %in% 1:5, SID, NA)) %>%
  drop_na() %>%
  mutate(measure2 = factor(measure, rev(c("health", "sd", "se", "ci99", "ci95", "ci80")))
         , measure = as.numeric(mapvalues(measure, c("health", "sd", "se", "ci99", "ci95", "ci80"), 6:1)))
gsoep_desc
# A tibble: 207 × 5
    mean     SID measure  value measure2
   <dbl>   <dbl>   <dbl>  <dbl> <fct>   
 1  7.59   18804       5  1.99  sd      
 2  7.59   18804       4  0.140 se      
 3  7.59   18804       3  0.362 ci99    
 4  7.59   18804       2  0.275 ci95    
 5  7.59   18804       1  0.180 ci80    
 6  7.59   18804       6  7.75  health  
 7  7.59   22305       6 10     health  
 8  7.59 1145002       6  8.75  health  
 9  7.59   29703       6  7.75  health  
10  7.59   35803       6  7.75  health  
# … with 197 more rows

Error Bars

So What Do We Do?

Outline for Today:

  • Proportions / Probability
    • icon array
  • Point Estimates
    • half-eye
    • gradient interval
    • quantile dotplot
    • raincloud
  • Animated (sometimes)
    • hypothetical outcome plots
    • ensemble display

Proportions / Probability

Proportions / Probability

  • We already covered proportions and probability, but this one deserves being highlighted itself
  • How much of our sample was unmarried?
gsoep %>%
  filter(year == 2012 & !is.na(marital)) %>%
  mutate(marital = ifelse(marital == 4, "Never Married", "Married")) %>%
  group_by(marital) %>%
  tally() %>%
  ungroup() %>%
  mutate(prop = round(n/sum(n)*100))
# A tibble: 2 × 3
  marital           n  prop
  <chr>         <int> <dbl>
1 Married       10806    76
2 Never Married  3381    24

Proportions / Probability

  • We already covered proportions and probability, but this one deserves being highlighted itself
  • How much of our sample was unmarried?
  • We have to trick ggplot2 into making this figure with a grid
tibble(
  value = c(rep(1, 76), rep(2,24))
  , x = rep(1:10, each = 10)
  , y = rep(1:10, times = 10)
  ) 
# A tibble: 100 × 3
   value     x     y
   <dbl> <int> <int>
 1     1     1     1
 2     1     1     2
 3     1     1     3
 4     1     1     4
 5     1     1     5
 6     1     1     6
 7     1     1     7
 8     1     1     8
 9     1     1     9
10     1     1    10
# … with 90 more rows

Proportions / Probability

  • We already covered proportions and probability, but this one deserves being highlighted itself
  • How much of our sample was unmarried?
tibble(
  value = c(rep(1, 76), rep(2,24))
  , y = rep(1:10, each = 10)
  , x = rep(1:10, times = 10)
  ) %>%
  ggplot(aes(x = x, y = y, color = factor(value))) +
    geom_point(shape = "square", size = 5) + 
    theme_void() + 
    theme(legend.position = "none")

Proportions / Probability

  • We already covered proportions and probability, but this one deserves being highlighted itself
  • How much of our sample was unmarried?
tibble(
  value = c(rep(1, 76), rep(2,24))
  , y = rep(1:10, each = 10)
  , x = rep(1:10, times = 10)
  ) %>%
  ggplot(aes(x = x, y = y, color = factor(value))) +
    geom_point(shape = "square", size = 8) + 
    scale_color_manual(values = c("lightgrey", "darkblue")) + 
    labs(title = "24% Remained Unmarried in 2012") + 
    theme_void() + 
    theme(legend.position = "none"
          , plot.title = element_text(hjust = .5, face = "bold"))

Point Estimates

Point Estimates

  • Most often, we want to visualize either point esimates or other visualizations of models
  • We touched on this a couple of weeks ago when we talked about broom and broom.mixed
  • I mentioned then that one of the challenges comes from where the interval estimate comes from, which includes:
    • (Frequentist) Standard Errors
    • (Frequentist) Confidence Intervals
    • Bootstrapped / Profile Confidence Intervals
    • Prediction Intervals
    • (Bayesian) Credible Intervals
    • (Bayesian) Posterior Distributions
  • I’ll stay out of Bayes for now :(

Point Estimates

gsoep_ex <- gsoep %>%
  filter(year == "2000") %>%
  select(SID, age, marital, gender) %>%
  inner_join(
    gsoep %>%
      filter(year == "2015") %>%
      select(SID, SRhealth)
  ) %>%
  mutate(marital = factor(
    marital
    , 1:4
    , c("Married", "Separated", "Widowed", "Never Married")
    ), age = age/10
    , gender = factor(gender, c(1,2), c("Male", "Female"))) %>%
  drop_na()
gsoep_ex
# A tibble: 3,431 × 5
       SID   age marital       gender SRhealth
     <dbl> <dbl> <fct>         <fct>     <dbl>
 1     901   4.9 Never Married Female        3
 2    2301   5.4 Separated     Male          4
 3    2302   5.4 Separated     Female        3
 4    5201   4.5 Married       Male          2
 5    5202   4.4 Married       Female        2
 6    5203   1.8 Never Married Female        4
 7    5303   3.1 Never Married Male          5
 8 1077202   2.6 Never Married Male          3
 9    7301   7   Married       Male          3
10    7302   6.2 Married       Female        2
# … with 3,421 more rows

Point Estimates From Model Predictions

Marginal Means from Model Predictions

Point Estimates and Marginal Means

  • stat_halfeye(): Visual Boundaries
  • stat_eye(): Visual Boundaries
  • stat_gradientinterval(): Visual Semiotics
  • stat_dots(): Frequency Framing
  • stat_dotsinterval(): Frequency Framing
  • stat_halfeye()+ stat_dots(): Visual Boundaries + Frequency Framing

Point Estimates and Marginal Means

Core syntax

  • The benefit of ggdist is that it allows you to use essentially identical syntax to produce lots of different kinds of plots
  • All we have to do is swap out the geom
m1 <- lm(SRhealth ~ marital + age, data = gsoep_ex)
tidy1 <- tidy(m1)

tidy1 %>%
  filter(term != "(Intercept)") %>%
  ggplot(aes(y = term)) + 
    stat_halfeye(
        aes(xdist = dist_student_t(df = df.residual(m1), mu = estimate, sigma = std.error))
    ) +
  my_theme()

stat_halfeye()

We can pull the predictions from model terms or marginal means

Model Terms:

tidy1 %>%
  filter(term != "(Intercept)") %>%
  ggplot(aes(y = term)) + 
    stat_halfeye(
        aes(xdist = dist_student_t(df = df.residual(m1), mu = estimate, sigma = std.error))
    ) +
  my_theme()

Marginal Means:

gsoep_ex %>%
  data_grid(marital) %>%
  mutate(age = mean(gsoep_ex$age)) %>%
  augment(m1, newdata = ., se_fit = T) %>%
  ggplot(aes(y = marital)) + 
    stat_halfeye(
        aes(xdist = dist_student_t(df = df.residual(m1), mu = .fitted, sigma = .se.fit)), 
    ) +
  my_theme()

Point Estimates and Marginal Means

Core syntax

Let’s do a little hack and create our whole plots except the geom, so that we can build them with less syntax:

p1 <- tidy1 %>%
  filter(term != "(Intercept)") %>%
  ggplot(aes(y = term)) + 
  geom_vline(aes(xintercept = 0), linetype = "dashed") + 
  labs(
    x = "Parameter Estimate"
    , y = NULL
    , title = "Model Estimates"
    , caption = "Outcome: Self-Rated Health"
    ) +
  my_theme()
p2 <- gsoep_ex %>%
  data_grid(marital) %>%
  mutate(age = mean(gsoep_ex$age)) %>%
  augment(m1, newdata = ., se_fit = T) %>%
  ggplot(aes(y = marital)) + 
  labs(
    x = "Model Predicted Self-Rated Health"
    , title = "Marginal Means"
    , y = NULL
    ) +
  my_theme()

stat_halfeye()

We can pull the predictions from model terms or marginal means

Model Terms:

p1 +
  stat_halfeye(
      aes(xdist = dist_student_t(df = df.residual(m1), mu = estimate, sigma = std.error))
  ) + 
  labs(subtitle = "stat_halfeye()")

Marginal Means:

p2 + 
  stat_halfeye(
      aes(xdist = dist_student_t(df = df.residual(m1), mu = .fitted, sigma = .se.fit)), 
  ) + 
  labs(subtitle = "stat_halfeye()")

stat_eye()

Model Terms:

p1 +
  stat_eye(
      aes(xdist = dist_student_t(df = df.residual(m1), mu = estimate, sigma = std.error))
  ) + 
  labs(subtitle = "stat_eye()")

Marginal Means:

p2 + 
  stat_eye(
      aes(xdist = dist_student_t(df = df.residual(m1), mu = .fitted, sigma = .se.fit)), 
  ) + 
  labs(subtitle = "stat_eye()")

stat_gradientinterval()

Model Terms:

p1 +
  stat_gradientinterval(
      aes(xdist = dist_student_t(df = df.residual(m1), mu = estimate, sigma = std.error))
  ) + 
  labs(subtitle = "stat_gradientinterval()")

Marginal Means:

p2 + 
  stat_gradientinterval(
      aes(xdist = dist_student_t(df = df.residual(m1), mu = .fitted, sigma = .se.fit)), 
  ) + 
  labs(subtitle = "stat_gradientinterval()")

stat_dots()

Model Terms:

p1 +
  stat_dots(
      aes(xdist = dist_student_t(df = df.residual(m1), mu = estimate, sigma = std.error))
  ) + 
  labs(subtitle = "stat_dots()")

Marginal Means:

p2 + 
  stat_dots(
      aes(xdist = dist_student_t(df = df.residual(m1), mu = .fitted, sigma = .se.fit)), 
  ) + 
  labs(subtitle = "stat_dots()")

stat_dots()

You can also change the number of dots:

Model Terms:

p1 +
  stat_dots(
      aes(xdist = dist_student_t(df = df.residual(m1), mu = estimate, sigma = std.error))
      , quantiles = 50
  ) + 
  labs(subtitle = "stat_dots()")

Marginal Means:

p2 + 
  stat_dots(
      aes(xdist = dist_student_t(df = df.residual(m1), mu = .fitted, sigma = .se.fit)) 
      , quantiles = 50
  ) + 
  labs(subtitle = "stat_dots()")

stat_dots()

There are also three different layouts

layout = "bin":

p2 + 
  stat_dots(
      aes(xdist = dist_student_t(df = df.residual(m1), mu = .fitted, sigma = .se.fit)) 
      , quantiles = 50
      , layout = "bin"
  ) + 
  labs(subtitle = "stat_dots()")

layout = "weave":

p2 + 
  stat_dots(
      aes(xdist = dist_student_t(df = df.residual(m1), mu = .fitted, sigma = .se.fit)) 
      , quantiles = 50
      , layout = "weave"
  ) + 
  labs(subtitle = "stat_dots()")

layout = "swarm":

p2 + 
  stat_dots(
      aes(xdist = dist_student_t(df = df.residual(m1), mu = .fitted, sigma = .se.fit)) 
      , quantiles = 50
      , layout = "swarm"
  ) + 
  labs(subtitle = "stat_dots()")

stat_dotsinterval()

Model Terms:

p1 +
  stat_dotsinterval(
      aes(xdist = dist_student_t(df = df.residual(m1), mu = estimate, sigma = std.error))
      , quantiles = 50
  ) + 
  labs(subtitle = "stat_dotsinterval()")

Marginal Means:

p2 + 
  stat_dotsinterval(
      aes(xdist = dist_student_t(df = df.residual(m1), mu = .fitted, sigma = .se.fit)) 
      , quantiles = 50
  ) + 
  labs(subtitle = "stat_dotsinterval()")

stat_dotsinterval()

You can apply many of the same arguments as “regular” stat_dots()

Model Terms:

p1 +
  stat_dotsinterval(
      aes(xdist = dist_student_t(df = df.residual(m1), mu = estimate, sigma = std.error))
      , quantiles = 50
      , layout = "swarm"
  ) + 
  labs(subtitle = "stat_dotsinterval()")

Marginal Means:

p2 + 
  stat_dotsinterval(
      aes(xdist = dist_student_t(df = df.residual(m1), mu = .fitted, sigma = .se.fit)) 
      , quantiles = 50
      , layout = "swarm"
  ) + 
  labs(subtitle = "stat_dotsinterval()")

stat_halfeye()+ stat_dots()

Model Terms:

p1 +
  stat_dots(
      aes(xdist = dist_student_t(df = df.residual(m1), mu = estimate, sigma = std.error))
      , quantiles = 50
      , side = "bottomleft"
      , layout = "swarm"
  ) + 
  stat_halfeye(
    aes(xdist = dist_student_t(df = df.residual(m1), mu = estimate, sigma = std.error))
  ) + 
  labs(subtitle = "`stat_halfeye()`+ `stat_dots()")

Marginal Means:

p2 + 
  stat_dots(
      aes(xdist = dist_student_t(df = df.residual(m1), mu = .fitted, sigma = .se.fit)) 
      , quantiles = 50
      , side = "bottomleft"
      , layout = "swarm"
  ) + 
  stat_halfeye(
    aes(xdist = dist_student_t(df = df.residual(m1), mu = .fitted, sigma = .se.fit)) 
  ) + 
  labs(subtitle = "`stat_halfeye()`+ `stat_dots()")

Simple Slopes

Let’s say we want to know if married and unmarried people differ in their self-rated health as a function of their age:

gsoep_ex2 <- gsoep_ex %>%
  filter(marital %in% c("Married", "Never Married"))
gsoep_ex2
# A tibble: 2,979 × 5
       SID   age marital       gender SRhealth
     <dbl> <dbl> <fct>         <fct>     <dbl>
 1     901   4.9 Never Married Female        3
 2    5201   4.5 Married       Male          2
 3    5202   4.4 Married       Female        2
 4    5203   1.8 Never Married Female        4
 5    5303   3.1 Never Married Male          5
 6 1077202   2.6 Never Married Male          3
 7    7301   7   Married       Male          3
 8    7302   6.2 Married       Female        2
 9   10101   6.2 Married       Male          2
10   12201   5.8 Married       Male          3
# … with 2,969 more rows
m2 <- lm(SRhealth ~ age + marital + age:marital, data = gsoep_ex2)
tidy(m2)
# A tibble: 4 × 5
  term                     estimate std.error statistic  p.value
  <chr>                       <dbl>     <dbl>     <dbl>    <dbl>
1 (Intercept)                4.14      0.0751    55.1   0       
2 age                       -0.215     0.0153   -14.1   1.64e-43
3 maritalNever Married       0.0834    0.122      0.683 4.95e- 1
4 age:maritalNever Married  -0.0296    0.0334    -0.887 3.75e- 1

Simple Slopes

We can plot this using an extension of geom_ribbon(), stat_lineribbon()

gsoep_ex2 %>%
  group_by(marital) %>%
  data_grid(age= seq_range(age, n = 101)) %>%
  ungroup() %>%
  augment(m2, newdata = ., se_fit = T) %>%
  ggplot(aes(x = age*10, fill = ordered(marital), color = ordered(marital))) +
    stat_lineribbon(
      aes(ydist = dist_student_t(df = df.residual(m2), mu = .fitted, sigma = .se.fit)),
      alpha = 1/4) + 
    scale_fill_brewer(palette = "Set2") +
    scale_color_brewer(palette = "Dark2") +
    labs(x = "Age (Years)", y = "Predicted Self-Rated Health"
         , fill = "Marital Status", color = "Marital Status") + 
    my_theme()

Simple Slopes

  • I promised I wouldn’t go Bayes, but I never promised I wouldn’t bootstrap!
  • Knowing how to get bootstrapped confidence and/or prediction intervals is important, especially if you work with any sort of multilevel / hierarchical models
  • Getting bootstrapped interval estimates is easy to get point estimates, but it’s a little harder if we want to get it around prediction lines
    • (i.e. there’s no great built in funcitons in R that get it for you, in my opinion)

Simple Slopes

But first, let’s get some longitudinal data that let’s us look at the interaction between marital status and changes in self-rated health within a person as they age:

set.seed(5)
gsoep_ex3 <- gsoep %>%
  select(SID, age, marital, gender, SRhealth) %>%
  filter(marital %in% c(1,4)) %>%
  group_by(SID) %>%
  mutate(marital = min(marital, na.rm = T)) %>%
  ungroup() %>%
  mutate(marital = factor(
    marital
    , c(1,4)
    , c("Married", "Never Married")
    ), age = age/10
    , gender = factor(gender, c(1,2), c("Male", "Female"))) %>%
  drop_na()

gsoep_ex3 <- gsoep_ex3 %>%
  filter(SID %in% sample(unique(gsoep_ex3$SID), 2000))
gsoep_ex3
# A tibble: 11,915 × 5
      SID   age marital gender SRhealth
    <dbl> <dbl> <fct>   <fct>     <dbl>
 1   7401   4.6 Married Male          3
 2 714802   3.5 Married Male          2
 3  17602   6.5 Married Female        2
 4  20501   4.4 Married Male          4
 5  22302   3.9 Married Female        4
 6  24204   2.9 Married Female        5
 7  24302   5.2 Married Male          4
 8  29702   3.7 Married Female        4
 9  32001   5.5 Married Male          4
10  32103   2.4 Married Male          5
# … with 11,905 more rows

Simple Slopes

The critical term is the interaction between the two:

m3 <- lmer(SRhealth ~ age + marital + age:marital + (age | SID), data = gsoep_ex3)
tidy(m3)
# A tibble: 8 × 6
  effect   group    term                     estimate std.error statistic
  <chr>    <chr>    <chr>                       <dbl>     <dbl>     <dbl>
1 fixed    <NA>     (Intercept)                4.58      0.0585     78.4 
2 fixed    <NA>     age                       -0.241     0.0126    -19.1 
3 fixed    <NA>     maritalNever Married       0.119     0.0910      1.30
4 fixed    <NA>     age:maritalNever Married  -0.0629    0.0271     -2.32
5 ran_pars SID      sd__(Intercept)            0.831    NA          NA   
6 ran_pars SID      cor__(Intercept).age      -0.728    NA          NA   
7 ran_pars SID      sd__age                    0.181    NA          NA   
8 ran_pars Residual sd__Observation            0.617    NA          NA   

Changes in health differ across marital groups

Simple Slopes

  • But how?
    • Interactions can often be tricky to unpack by point estimates alone
    • So we may want to plot separate trajectories for married and unmarried people

Simple Slopes

predIntlme4 <- function(m, mod_frame, ref){
  ## get bootstrapped estimates
  b <- bootMer(
    m
    , FUN = function(x) lme4:::predict.merMod(
      x
      , newdata = mod_frame
      , re.form = ref
      )
    , nsim = 100 # do not use 100 in practice, please
    , parallel = "multicore"
    , ncpus = 16
    )
  
  ## get long form bootstrapped draws
  b_df <- bind_cols(
    mod_frame
    , t(b$t) %>%
    data.frame()
  ) %>%
    pivot_longer(
       cols = c(-age, -marital)
      , names_to = "boot"
      , values_to = "pred"
    )
  return(list(boot = b, b_df = b_df))
}
pred_fx_fun <- function(m){
  mod_frame <- crossing(
    age = seq(min(m@frame$age), max(m@frame$age), .1)
    , marital = levels(m@frame$marital)
  )
  boot <- predIntlme4(m = m, mod_frame = mod_frame, ref = NA)
}

boot3 <- pred_fx_fun(m3)
b_df3 <- boot3$b_df
b3 <- boot3$boot

Simple Slopes

b_df3
# A tibble: 15,000 × 4
     age marital boot   pred
   <dbl> <chr>   <chr> <dbl>
 1   1.7 Married X1     4.19
 2   1.7 Married X2     4.17
 3   1.7 Married X3     4.19
 4   1.7 Married X4     4.20
 5   1.7 Married X5     4.19
 6   1.7 Married X6     4.17
 7   1.7 Married X7     4.14
 8   1.7 Married X8     4.19
 9   1.7 Married X9     4.16
10   1.7 Married X10    4.16
# … with 14,990 more rows

geom_line()

b_df3 %>%
  ggplot(aes(x = age, y = pred)) + 
    geom_line(
      aes(color = marital, group = interaction(marital, boot))
      , alpha = .2, size = .25
      ) + 
    my_theme()

geom_lineribbon(): summarized

b_df3 %>% 
  group_by(marital, age) %>%
  median_qi(pred)
# A tibble: 150 × 8
   marital   age  pred .lower .upper .width .point .interval
   <chr>   <dbl> <dbl>  <dbl>  <dbl>  <dbl> <chr>  <chr>    
 1 Married   1.7  4.17   4.10   4.23   0.95 median qi       
 2 Married   1.8  4.14   4.07   4.21   0.95 median qi       
 3 Married   1.9  4.12   4.05   4.18   0.95 median qi       
 4 Married   2    4.10   4.03   4.16   0.95 median qi       
 5 Married   2.1  4.07   4.01   4.13   0.95 median qi       
 6 Married   2.2  4.05   3.99   4.11   0.95 median qi       
 7 Married   2.3  4.02   3.96   4.08   0.95 median qi       
 8 Married   2.4  4.00   3.94   4.06   0.95 median qi       
 9 Married   2.5  3.98   3.92   4.03   0.95 median qi       
10 Married   2.6  3.95   3.90   4.01   0.95 median qi       
# … with 140 more rows

geom_lineribbon(): summarized

b_df3 %>%
  group_by(marital, age) %>%
  median_qi(pred) %>%
  ggplot(aes(
    x = age
    , y = pred
    , ymin = .lower
    , ymax = .upper
    )) +
  geom_lineribbon(
    aes(fill = marital)
    , size = .9
    , alpha = .8
    ) + 
  scale_fill_brewer(palette = "Set2") +
  my_theme()

geom_lineribbon() bands: summarized

b_df3 %>%
  group_by(marital, age) %>%
  median_qi(pred, .width = c(.50, .80, .95)) %>%
  ggplot(aes(
    x = age
    , y = pred
    , ymin = .lower
    , ymax = .upper
    )) +
  geom_lineribbon(size = .9) + 
  scale_fill_brewer() +
  facet_grid(~marital) + 
  my_theme()

stat_lineribbon() bands: samples

b_df3 %>%
  ggplot(aes(
    x = age
    , y = pred
    , fill = marital
    )) + 
  stat_lineribbon(alpha = .25) + 
  my_theme()

stat_lineribbon() bands: samples

b_df3 %>%
  ggplot(aes(
    x = age
    , y = pred
    , fill = marital
    )) + 
  stat_lineribbon(
    aes(fill_ramp = stat(level))
    ) +
  my_theme()

geom_lineribbon() gradient: samples

b_df3 %>%
  ggplot(aes(
    x = age
    , y = pred
    , fill = marital
    )) + 
  stat_lineribbon(
    alpha = .25
    , .width = ppoints(25)
    ) + 
  scale_fill_brewer(palette = "Set2") +
  my_theme()

geom_lineribbon() gradient: samples

ms <- b_df3 %>% filter(age == max(age)) %>% group_by(marital) %>% summarize(m = mean(pred))

b_df3 %>%
  ggplot(aes(x = age*10, y = pred, fill = marital, fill_ramp = stat(.width))) + 
  stat_lineribbon(
    alpha = .25
    , .width = ppoints(25)
    ) +
  scale_x_continuous(limits = c(15,100), breaks = seq(15,90,15)) + 
  scale_fill_manual(values = c("grey", "darkorange")) + 
  annotate(
    "text", label = "Married", x = max(b_df3$age)*10+1
    , y = ms$m[1], hjust = 0
    ) + 
  annotate(
    "text", label = "Never\nMarried", x = max(b_df3$age)*10+1
    , y = ms$m[2], hjust = 0
    ) + 
  labs(
    x = "Age (Years)"
    , y = "Predicted Self Rated Health\n(Bootstrapped Interval Estimates)"
    , fill = NULL
    , title = "Self-Rated Health Declines More Rapidly\nfor Unmarried People"
    ) + 
  guides(fill = "none") + 
  my_theme()

Animated Uncertainty

Ensemble Displays

  • Ensemble displays are an alternative to putting hard boundaries around an interval estimate
  • Remember that hard boundaries make people interpret categorical differences even when the underlying distribution is continuous
  • We’ve already seen this:

Ensemble Displays

  • But the challenge with visualizing uncertainty is between inference and understanding
  • We need to leverage a knowledge of perception and cognitive processes to help us leverage strengths and overcome weaknesses
  • Animating visualizations can help us nudge people to process was they see and update their uncertainty estimates over time
  • We’ll use the gganimate package to do this

Ensemble Displays

Here’s a quick example:

Ensemble Displays

Let’s break this down:

b_df3 %>%
  group_by(marital, age) %>%
  median_qi(pred) 
# A tibble: 150 × 8
   marital   age  pred .lower .upper .width .point .interval
   <chr>   <dbl> <dbl>  <dbl>  <dbl>  <dbl> <chr>  <chr>    
 1 Married   1.7  4.17   4.10   4.23   0.95 median qi       
 2 Married   1.8  4.14   4.07   4.21   0.95 median qi       
 3 Married   1.9  4.12   4.05   4.18   0.95 median qi       
 4 Married   2    4.10   4.03   4.16   0.95 median qi       
 5 Married   2.1  4.07   4.01   4.13   0.95 median qi       
 6 Married   2.2  4.05   3.99   4.11   0.95 median qi       
 7 Married   2.3  4.02   3.96   4.08   0.95 median qi       
 8 Married   2.4  4.00   3.94   4.06   0.95 median qi       
 9 Married   2.5  3.98   3.92   4.03   0.95 median qi       
10 Married   2.6  3.95   3.90   4.01   0.95 median qi       
# … with 140 more rows

Ensemble Displays

Let’s break this down: First, we’ll plot just the ribbons across the full sample based on the bootstrapped predictions

b_df3 %>%
  group_by(marital, age) %>%
  median_qi(pred) %>%
  ggplot(aes(x = age, y = pred)) +
    geom_ribbon(
      aes(fill = marital, ymin = .lower, ymax = .upper)
      , size = .9, alpha = .5
      ) + 
    scale_fill_brewer(palette = "Set2") +
    facet_grid(~marital) + 
    my_theme() + 
    theme(legend.position = "none")

Ensemble Displays

Let’s break this down: Next we’ll add a line for each bootstrapped sample. This doesn’t look great here.

b_df3 %>%
  group_by(marital, age) %>%
  median_qi(pred) %>%
  ggplot(aes(x = age, y = pred)) +
    geom_ribbon(
      aes(fill = marital, ymin = .lower, ymax = .upper)
      , size = .9, alpha = .5
      ) + 
    scale_fill_brewer(palette = "Set2") +
    geom_line(
      data = b_df3
      , aes(group = interaction(marital, boot))
      , size = 1
      ) +
    facet_grid(~marital) + 
    my_theme() + 
    theme(legend.position = "none")

Ensemble Displays

Let’s break this down: But when we use the transition_states() function across the bootstraped samples, we can get a better sense about where possible regression lines fall:

b_df3 %>%
  group_by(marital, age) %>%
  median_qi(pred) %>%
  ggplot(aes(x = age, y = pred)) +
    geom_ribbon(
      aes(fill = marital, ymin = .lower, ymax = .upper)
      , size = .9, alpha = .5
      ) + 
    scale_fill_brewer(palette = "Set2") +
    geom_line(
      data = b_df3
      , aes(group = interaction(marital, boot))
      , size = 1
      ) +
    facet_grid(~marital) + 
    my_theme() + 
    theme(legend.position = "none") + 
    transition_states(boot, 1, 1)

Hypothetical Outcome Plots (HOPs)

  • Similarly, hypothetical outcome plots let us see plausible mean estimates among raw data
  • Here’s self-rated health (1-5) across married and unmarried individuals:
gsoep_ex3 %>%
  ggplot(aes(y = marital, x = SRhealth)) + 
    geom_jitter(aes(color = marital), alpha = .5) + 
    my_theme()

Hypothetical Outcome Plots (HOPs)

  • Using the ungeviz package, we can then use the geom_vpline() function to sample from the data across groups and plot the mean from different samples:
gsoep_ex3 %>%
  ggplot(aes(y = marital, x = SRhealth)) + 
    geom_jitter(aes(color = marital), alpha = .25) + 
    geom_vpline(
      data = sampler(25, group = marital)
      , height = 0.6
      , color = "#D55E00"
      ) +
    scale_color_manual(values = c("seagreen2", "darkorange")) + 
    my_theme()

Hypothetical Outcome Plots (HOPs)

And finally, we can animate those samples the transition_states() function from the gganimate package again:

gsoep_ex3 %>%
  ggplot(aes(y = marital, x = SRhealth)) + 
    geom_jitter(aes(color = marital), alpha = .5) + 
    geom_vpline(
      data = sampler(25, group = marital)
      , height = 0.6
      , color = "#D55E00"
      ) +
    scale_color_manual(values = c("seagreen2", "darkorange")) + 
    my_theme() + 
    transition_states(.draw, 1, 3)