Emorie D Beck
Why do people misperceive uncertainty, and how can we mitigate it?
my_theme <- function(){
theme_classic() +
theme(
legend.position = "bottom"
, legend.title = element_text(face = "bold", size = rel(1))
, legend.text = element_text(face = "italic", size = rel(1))
, axis.text = element_text(face = "bold", size = rel(1.1), color = "black")
, axis.title = element_text(face = "bold", size = rel(1.2))
, plot.title = element_text(face = "bold", size = rel(1.2), hjust = .5)
, plot.subtitle = element_text(face = "italic", size = rel(1.2), hjust = .5)
, strip.text = element_text(face = "bold", size = rel(1.1), color = "white")
, strip.background = element_rect(fill = "black")
)
}
# A tibble: 480,798 × 11
year SID SRhealth satHealth marital chldbrth gender yearB…¹ morta…² job
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl+l> <dbl> <dbl> <dbl>
1 1984 901 NA NA NA NA 2 [[2]… 1951 0 NA
2 1984 1001 NA NA NA NA 2 [[2]… 1913 0 NA
3 1984 1101 NA NA NA NA 2 [[2]… 1906 0 NA
4 1984 1201 NA NA NA NA 1 [[1]… 1911 0 NA
5 1984 1202 NA NA NA NA 2 [[2]… 1913 0 NA
6 1984 1301 NA NA NA NA 2 [[2]… 1943 0 NA
7 1984 1302 NA NA NA NA 1 [[1]… 1965 0 NA
8 1984 1901 NA NA NA NA 2 [[2]… 1948 0 NA
9 1984 2001 NA NA NA NA 2 [[2]… 1949 0 NA
10 1984 2002 NA NA NA NA 1 [[1]… 1952 0 NA
# … with 480,788 more rows, 1 more variable: age <dbl>, and abbreviated
# variable names ¹yearBrth, ²mortality
pomp <- function(x) (x - min(x, na.rm = T))/(max(x, na.rm = T) - min(x, na.rm = T))*10
gsoep_desc <- gsoep %>%
filter(year == 2005 & age < 30) %>%
filter(SID %in% sample(unique(.$SID), 500)) %>%
mutate(SRhealth = pomp(SRhealth)) %>%
group_by(SID, age) %>%
mutate(health = rowMeans(cbind(SRhealth, satHealth), na.rm = T)) %>%
ungroup() %>%
select(SID, age, health) %>%
drop_na() %>%
mutate(
mean = mean(health)
, sd = sd(health)
, se = sd/sqrt(n())
, ci99 = se*2.576
, ci95 = se*1.96
, ci80 = se*1.282
)
gsoep_desc
# A tibble: 202 × 9
SID age health mean sd se ci99 ci95 ci80
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 18804 28 7.75 7.59 1.99 0.140 0.362 0.275 0.180
2 22305 19 10 7.59 1.99 0.140 0.362 0.275 0.180
3 1145002 28 8.75 7.59 1.99 0.140 0.362 0.275 0.180
4 29703 18 7.75 7.59 1.99 0.140 0.362 0.275 0.180
5 35803 25 7.75 7.59 1.99 0.140 0.362 0.275 0.180
6 67704 18 6 7.59 1.99 0.140 0.362 0.275 0.180
7 75503 17 7.5 7.59 1.99 0.140 0.362 0.275 0.180
8 112506 22 10 7.59 1.99 0.140 0.362 0.275 0.180
9 125905 23 5.5 7.59 1.99 0.140 0.362 0.275 0.180
10 133004 23 5.5 7.59 1.99 0.140 0.362 0.275 0.180
# … with 192 more rows
gsoep_desc <- gsoep_desc %>%
select(mean:ci80, health, SID) %>%
pivot_longer(
cols = c(-mean, -SID)
, names_to = "measure"
, values_to = "value"
) %>%
mutate(SID = ifelse(measure == "health" | row_number() %in% 1:5, SID, NA)) %>%
drop_na() %>%
mutate(measure2 = factor(measure, rev(c("health", "sd", "se", "ci99", "ci95", "ci80")))
, measure = as.numeric(mapvalues(measure, c("health", "sd", "se", "ci99", "ci95", "ci80"), 6:1)))
gsoep_desc
# A tibble: 207 × 5
mean SID measure value measure2
<dbl> <dbl> <dbl> <dbl> <fct>
1 7.59 18804 5 1.99 sd
2 7.59 18804 4 0.140 se
3 7.59 18804 3 0.362 ci99
4 7.59 18804 2 0.275 ci95
5 7.59 18804 1 0.180 ci80
6 7.59 18804 6 7.75 health
7 7.59 22305 6 10 health
8 7.59 1145002 6 8.75 health
9 7.59 29703 6 7.75 health
10 7.59 35803 6 7.75 health
# … with 197 more rows
gsoep %>%
filter(year == 2012 & !is.na(marital)) %>%
mutate(marital = ifelse(marital == 4, "Never Married", "Married")) %>%
group_by(marital) %>%
tally() %>%
ungroup() %>%
mutate(prop = round(n/sum(n)*100))
# A tibble: 2 × 3
marital n prop
<chr> <int> <dbl>
1 Married 10806 76
2 Never Married 3381 24
ggplot2
into making this figure with a gridtibble(
value = c(rep(1, 76), rep(2,24))
, y = rep(1:10, each = 10)
, x = rep(1:10, times = 10)
) %>%
ggplot(aes(x = x, y = y, color = factor(value))) +
geom_point(shape = "square", size = 8) +
scale_color_manual(values = c("lightgrey", "darkblue")) +
labs(title = "24% Remained Unmarried in 2012") +
theme_void() +
theme(legend.position = "none"
, plot.title = element_text(hjust = .5, face = "bold"))
broom
and broom.mixed
gsoep_ex <- gsoep %>%
filter(year == "2000") %>%
select(SID, age, marital, gender) %>%
inner_join(
gsoep %>%
filter(year == "2015") %>%
select(SID, SRhealth)
) %>%
mutate(marital = factor(
marital
, 1:4
, c("Married", "Separated", "Widowed", "Never Married")
), age = age/10
, gender = factor(gender, c(1,2), c("Male", "Female"))) %>%
drop_na()
gsoep_ex
# A tibble: 3,431 × 5
SID age marital gender SRhealth
<dbl> <dbl> <fct> <fct> <dbl>
1 901 4.9 Never Married Female 3
2 2301 5.4 Separated Male 4
3 2302 5.4 Separated Female 3
4 5201 4.5 Married Male 2
5 5202 4.4 Married Female 2
6 5203 1.8 Never Married Female 4
7 5303 3.1 Never Married Male 5
8 1077202 2.6 Never Married Male 3
9 7301 7 Married Male 3
10 7302 6.2 Married Female 2
# … with 3,421 more rows
stat_halfeye()
: Visual Boundariesstat_eye()
: Visual Boundariesstat_gradientinterval()
: Visual Semioticsstat_dots()
: Frequency Framingstat_dotsinterval()
: Frequency Framingstat_halfeye()
+ stat_dots()
: Visual Boundaries + Frequency Framinggeom
stat_halfeye()
We can pull the predictions from model terms or marginal means
Let’s do a little hack and create our whole plots except the geom
, so that we can build them with less syntax:
stat_halfeye()
We can pull the predictions from model terms or marginal means
stat_eye()
stat_gradientinterval()
Model Terms:
stat_dots()
stat_dots()
You can also change the number of dots:
Model Terms:
stat_dots()
There are also three different layouts
layout = "bin"
:
layout = "weave"
:
stat_dotsinterval()
Model Terms:
stat_dotsinterval()
You can apply many of the same arguments as “regular” stat_dots()
Model Terms:
stat_halfeye()
+ stat_dots()
Model Terms:
p1 +
stat_dots(
aes(xdist = dist_student_t(df = df.residual(m1), mu = estimate, sigma = std.error))
, quantiles = 50
, side = "bottomleft"
, layout = "swarm"
) +
stat_halfeye(
aes(xdist = dist_student_t(df = df.residual(m1), mu = estimate, sigma = std.error))
) +
labs(subtitle = "`stat_halfeye()`+ `stat_dots()")
Marginal Means:
p2 +
stat_dots(
aes(xdist = dist_student_t(df = df.residual(m1), mu = .fitted, sigma = .se.fit))
, quantiles = 50
, side = "bottomleft"
, layout = "swarm"
) +
stat_halfeye(
aes(xdist = dist_student_t(df = df.residual(m1), mu = .fitted, sigma = .se.fit))
) +
labs(subtitle = "`stat_halfeye()`+ `stat_dots()")
Let’s say we want to know if married and unmarried people differ in their self-rated health as a function of their age:
# A tibble: 2,979 × 5
SID age marital gender SRhealth
<dbl> <dbl> <fct> <fct> <dbl>
1 901 4.9 Never Married Female 3
2 5201 4.5 Married Male 2
3 5202 4.4 Married Female 2
4 5203 1.8 Never Married Female 4
5 5303 3.1 Never Married Male 5
6 1077202 2.6 Never Married Male 3
7 7301 7 Married Male 3
8 7302 6.2 Married Female 2
9 10101 6.2 Married Male 2
10 12201 5.8 Married Male 3
# … with 2,969 more rows
# A tibble: 4 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 4.14 0.0751 55.1 0
2 age -0.215 0.0153 -14.1 1.64e-43
3 maritalNever Married 0.0834 0.122 0.683 4.95e- 1
4 age:maritalNever Married -0.0296 0.0334 -0.887 3.75e- 1
We can plot this using an extension of geom_ribbon()
, stat_lineribbon()
gsoep_ex2 %>%
group_by(marital) %>%
data_grid(age= seq_range(age, n = 101)) %>%
ungroup() %>%
augment(m2, newdata = ., se_fit = T) %>%
ggplot(aes(x = age*10, fill = ordered(marital), color = ordered(marital))) +
stat_lineribbon(
aes(ydist = dist_student_t(df = df.residual(m2), mu = .fitted, sigma = .se.fit)),
alpha = 1/4) +
scale_fill_brewer(palette = "Set2") +
scale_color_brewer(palette = "Dark2") +
labs(x = "Age (Years)", y = "Predicted Self-Rated Health"
, fill = "Marital Status", color = "Marital Status") +
my_theme()
R
that get it for you, in my opinion)But first, let’s get some longitudinal data that let’s us look at the interaction between marital status and changes in self-rated health within a person as they age:
set.seed(5)
gsoep_ex3 <- gsoep %>%
select(SID, age, marital, gender, SRhealth) %>%
filter(marital %in% c(1,4)) %>%
group_by(SID) %>%
mutate(marital = min(marital, na.rm = T)) %>%
ungroup() %>%
mutate(marital = factor(
marital
, c(1,4)
, c("Married", "Never Married")
), age = age/10
, gender = factor(gender, c(1,2), c("Male", "Female"))) %>%
drop_na()
gsoep_ex3 <- gsoep_ex3 %>%
filter(SID %in% sample(unique(gsoep_ex3$SID), 2000))
gsoep_ex3
# A tibble: 11,915 × 5
SID age marital gender SRhealth
<dbl> <dbl> <fct> <fct> <dbl>
1 7401 4.6 Married Male 3
2 714802 3.5 Married Male 2
3 17602 6.5 Married Female 2
4 20501 4.4 Married Male 4
5 22302 3.9 Married Female 4
6 24204 2.9 Married Female 5
7 24302 5.2 Married Male 4
8 29702 3.7 Married Female 4
9 32001 5.5 Married Male 4
10 32103 2.4 Married Male 5
# … with 11,905 more rows
The critical term is the interaction between the two:
# A tibble: 8 × 6
effect group term estimate std.error statistic
<chr> <chr> <chr> <dbl> <dbl> <dbl>
1 fixed <NA> (Intercept) 4.58 0.0585 78.4
2 fixed <NA> age -0.241 0.0126 -19.1
3 fixed <NA> maritalNever Married 0.119 0.0910 1.30
4 fixed <NA> age:maritalNever Married -0.0629 0.0271 -2.32
5 ran_pars SID sd__(Intercept) 0.831 NA NA
6 ran_pars SID cor__(Intercept).age -0.728 NA NA
7 ran_pars SID sd__age 0.181 NA NA
8 ran_pars Residual sd__Observation 0.617 NA NA
Changes in health differ across marital groups
predIntlme4 <- function(m, mod_frame, ref){
## get bootstrapped estimates
b <- bootMer(
m
, FUN = function(x) lme4:::predict.merMod(
x
, newdata = mod_frame
, re.form = ref
)
, nsim = 100 # do not use 100 in practice, please
, parallel = "multicore"
, ncpus = 16
)
## get long form bootstrapped draws
b_df <- bind_cols(
mod_frame
, t(b$t) %>%
data.frame()
) %>%
pivot_longer(
cols = c(-age, -marital)
, names_to = "boot"
, values_to = "pred"
)
return(list(boot = b, b_df = b_df))
}
# A tibble: 15,000 × 4
age marital boot pred
<dbl> <chr> <chr> <dbl>
1 1.7 Married X1 4.19
2 1.7 Married X2 4.17
3 1.7 Married X3 4.19
4 1.7 Married X4 4.20
5 1.7 Married X5 4.19
6 1.7 Married X6 4.17
7 1.7 Married X7 4.14
8 1.7 Married X8 4.19
9 1.7 Married X9 4.16
10 1.7 Married X10 4.16
# … with 14,990 more rows
geom_line()
geom_lineribbon()
: summarized# A tibble: 150 × 8
marital age pred .lower .upper .width .point .interval
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
1 Married 1.7 4.17 4.10 4.23 0.95 median qi
2 Married 1.8 4.14 4.07 4.21 0.95 median qi
3 Married 1.9 4.12 4.05 4.18 0.95 median qi
4 Married 2 4.10 4.03 4.16 0.95 median qi
5 Married 2.1 4.07 4.01 4.13 0.95 median qi
6 Married 2.2 4.05 3.99 4.11 0.95 median qi
7 Married 2.3 4.02 3.96 4.08 0.95 median qi
8 Married 2.4 4.00 3.94 4.06 0.95 median qi
9 Married 2.5 3.98 3.92 4.03 0.95 median qi
10 Married 2.6 3.95 3.90 4.01 0.95 median qi
# … with 140 more rows
geom_lineribbon()
: summarizedgeom_lineribbon()
bands: summarizedstat_lineribbon()
bands: samplesstat_lineribbon()
bands: samplesgeom_lineribbon()
gradient: samplesgeom_lineribbon()
gradient: samplesms <- b_df3 %>% filter(age == max(age)) %>% group_by(marital) %>% summarize(m = mean(pred))
b_df3 %>%
ggplot(aes(x = age*10, y = pred, fill = marital, fill_ramp = stat(.width))) +
stat_lineribbon(
alpha = .25
, .width = ppoints(25)
) +
scale_x_continuous(limits = c(15,100), breaks = seq(15,90,15)) +
scale_fill_manual(values = c("grey", "darkorange")) +
annotate(
"text", label = "Married", x = max(b_df3$age)*10+1
, y = ms$m[1], hjust = 0
) +
annotate(
"text", label = "Never\nMarried", x = max(b_df3$age)*10+1
, y = ms$m[2], hjust = 0
) +
labs(
x = "Age (Years)"
, y = "Predicted Self Rated Health\n(Bootstrapped Interval Estimates)"
, fill = NULL
, title = "Self-Rated Health Declines More Rapidly\nfor Unmarried People"
) +
guides(fill = "none") +
my_theme()
gganimate
package to do thisHere’s a quick example:
Let’s break this down:
# A tibble: 150 × 8
marital age pred .lower .upper .width .point .interval
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
1 Married 1.7 4.17 4.10 4.23 0.95 median qi
2 Married 1.8 4.14 4.07 4.21 0.95 median qi
3 Married 1.9 4.12 4.05 4.18 0.95 median qi
4 Married 2 4.10 4.03 4.16 0.95 median qi
5 Married 2.1 4.07 4.01 4.13 0.95 median qi
6 Married 2.2 4.05 3.99 4.11 0.95 median qi
7 Married 2.3 4.02 3.96 4.08 0.95 median qi
8 Married 2.4 4.00 3.94 4.06 0.95 median qi
9 Married 2.5 3.98 3.92 4.03 0.95 median qi
10 Married 2.6 3.95 3.90 4.01 0.95 median qi
# … with 140 more rows
Let’s break this down: First, we’ll plot just the ribbons across the full sample based on the bootstrapped predictions
Let’s break this down: Next we’ll add a line for each bootstrapped sample. This doesn’t look great here.
b_df3 %>%
group_by(marital, age) %>%
median_qi(pred) %>%
ggplot(aes(x = age, y = pred)) +
geom_ribbon(
aes(fill = marital, ymin = .lower, ymax = .upper)
, size = .9, alpha = .5
) +
scale_fill_brewer(palette = "Set2") +
geom_line(
data = b_df3
, aes(group = interaction(marital, boot))
, size = 1
) +
facet_grid(~marital) +
my_theme() +
theme(legend.position = "none")
Let’s break this down: But when we use the transition_states()
function across the bootstraped samples, we can get a better sense about where possible regression lines fall:
b_df3 %>%
group_by(marital, age) %>%
median_qi(pred) %>%
ggplot(aes(x = age, y = pred)) +
geom_ribbon(
aes(fill = marital, ymin = .lower, ymax = .upper)
, size = .9, alpha = .5
) +
scale_fill_brewer(palette = "Set2") +
geom_line(
data = b_df3
, aes(group = interaction(marital, boot))
, size = 1
) +
facet_grid(~marital) +
my_theme() +
theme(legend.position = "none") +
transition_states(boot, 1, 1)
ungeviz
package, we can then use the geom_vpline()
function to sample from the data across groups and plot the mean from different samples:And finally, we can animate those samples the transition_states()
function from the gganimate
package again: