Chapter 2 Background on the SAPA Project
2.1 Packages
library(qgraph)
library(bootnet)
library(ggplot2)
library(psych)
library(RColorBrewer)
library(knitr)
library(kableExtra)
library(parallel)
library(broom)
library(igraph)
library(gridExtra)
library(data.table)
library(plyr)
library(tidyverse)

2.2 Methods
All of the data pulled in below have previously been cleaned. They come in two large files: one covering 2006 to 2010 and a second covering 2010 onward.
2.2.1 2006 to 2010
The older data set comes from the predecessor platform to SAPA (a 50-item survey, created by William Revelle and hosted at www.personalityproject.org). This sample has several issues. First, much less demographic information was collected at that time (age, gender, ethnicity, education, and a few other variables). Second, it sampled only 50 items from the IPIP100 (each participant received 2 of 4 25-item blocks) plus a preliminary set of ICAR items, most of which have since been deprecated. Third, two technical problems caused a complete lack of data collection for one of the IPIP items (q_55) and a partial lack of data collection for two others (q_316 and q_995). Still, these data remain quite useful in that they include responses on 99 of the 100 IPIP items from roughly 114,000 participants.
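To make the block design concrete, here is a minimal sketch of the 2-of-4 sampling scheme, using hypothetical item labels rather than the actual IPIP block assignments:

# Illustration of the planned-missingness design: 100 items are split into
# four 25-item blocks, and each participant is administered two of them.
blocks <- split(paste0("item_", 1:100), rep(1:4, each = 25))
administered <- unlist(sample(blocks, 2), use.names = FALSE)
length(administered)  # each participant answers 50 of the 100 items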
The following code chunk pulls in the first sample and describes it. Note that only 99 variables are listed because item q_55 is empty.
data_path <- "https://github.com/emoriebeck/age_nets/blob/master"
load(url(sprintf("%s/data/SAPAdata4apr2006to18aug2010.rdata?raw=true", data_path)))
dim(taFinal)  # Dimensions before removing non-responders.
## [1] 114016 141
# Identify Ps who didn't answer any IPIP100 items. Responses are coded 1-6,
# so the row sum (with NAs removed) is positive iff at least one item was answered.
empty <- rowSums(subset(taFinal, select = c(q_76:q_1989)), na.rm = TRUE)
taFinal <- taFinal[empty > 0, ]
dim(taFinal)  # The 55 participants who did not answer any IPIP100 items are removed.
## [1] 113961 141
describe(subset(taFinal, select = c(q_76:q_1989)), fast = TRUE)
## vars n mean sd min max range se
## q_76 1 56860 4.20 1.27 1 6 5 0.01
## q_108 2 56739 3.29 1.45 1 6 5 0.01
## q_124 3 56014 4.48 1.22 1 6 5 0.01
## q_128 4 55964 4.81 1.13 1 6 5 0.00
## q_132 5 57010 4.77 1.09 1 6 5 0.00
## q_140 6 56719 3.41 1.65 1 6 5 0.01
## q_146 7 56131 2.30 1.34 1 6 5 0.01
## q_150 8 56781 4.89 1.16 1 6 5 0.00
## q_177 9 56719 3.41 1.49 1 6 5 0.01
## q_194 10 56665 2.47 1.33 1 6 5 0.01
## q_195 11 56813 2.52 1.36 1 6 5 0.01
## q_200 12 56783 2.29 1.29 1 6 5 0.01
## q_217 13 57159 4.73 1.22 1 6 5 0.01
## q_240 14 56685 4.86 1.04 1 6 5 0.00
## q_241 15 56739 3.95 1.57 1 6 5 0.01
## q_248 16 56715 4.05 1.34 1 6 5 0.01
## q_254 17 57135 4.33 1.37 1 6 5 0.01
## q_262 18 56926 3.28 1.51 1 6 5 0.01
## q_316 19 5723 2.71 1.54 1 6 5 0.02
## q_403 20 57115 3.83 1.54 1 6 5 0.01
## q_422 21 57012 4.66 1.18 1 6 5 0.00
## q_492 22 55996 4.42 1.19 1 6 5 0.01
## q_493 23 57076 4.97 1.05 1 6 5 0.00
## q_497 24 56707 3.52 1.53 1 6 5 0.01
## q_530 25 56043 4.27 1.30 1 6 5 0.01
## q_609 26 56727 2.14 1.33 1 6 5 0.01
## q_619 27 56025 4.30 1.27 1 6 5 0.01
## q_626 28 56009 2.65 1.38 1 6 5 0.01
## q_690 29 56718 3.84 1.48 1 6 5 0.01
## q_698 30 56739 3.74 1.62 1 6 5 0.01
## q_712 31 56031 2.94 1.60 1 6 5 0.01
## q_815 32 57031 4.33 1.32 1 6 5 0.01
## q_819 33 56746 4.35 1.35 1 6 5 0.01
## q_838 34 56689 2.18 1.35 1 6 5 0.01
## q_844 35 56669 4.68 1.22 1 6 5 0.01
## q_890 36 57094 2.89 1.49 1 6 5 0.01
## q_901 37 56065 3.24 1.56 1 6 5 0.01
## q_904 38 57159 3.23 1.55 1 6 5 0.01
## q_931 39 56644 3.92 1.48 1 6 5 0.01
## q_952 40 56020 2.92 1.52 1 6 5 0.01
## q_960 41 57004 3.71 1.42 1 6 5 0.01
## q_962 42 56794 3.34 1.54 1 6 5 0.01
## q_974 43 56025 3.50 1.50 1 6 5 0.01
## q_979 44 57034 3.64 1.59 1 6 5 0.01
## q_986 45 56862 3.73 1.56 1 6 5 0.01
## q_995 46 5795 3.27 1.53 1 6 5 0.02
## q_1020 47 56943 3.51 1.39 1 6 5 0.01
## q_1041 48 57064 4.23 1.28 1 6 5 0.01
## q_1050 49 56791 4.36 1.30 1 6 5 0.01
## q_1053 50 56750 4.71 1.27 1 6 5 0.01
## q_1058 51 56700 4.84 1.20 1 6 5 0.01
## q_1083 52 56637 1.97 1.15 1 6 5 0.00
## q_1088 53 56773 2.49 1.32 1 6 5 0.01
## q_1090 54 56684 4.74 1.00 1 6 5 0.00
## q_1099 55 56135 3.23 1.58 1 6 5 0.01
## q_1114 56 56753 2.61 1.46 1 6 5 0.01
## q_1162 57 55998 4.83 1.13 1 6 5 0.00
## q_1163 58 56879 2.45 1.44 1 6 5 0.01
## q_1180 59 56738 3.35 1.45 1 6 5 0.01
## q_1205 60 56018 3.99 1.32 1 6 5 0.01
## q_1206 61 55916 4.59 1.28 1 6 5 0.01
## q_1254 62 57066 3.76 1.76 1 6 5 0.01
## q_1255 63 56782 3.37 1.67 1 6 5 0.01
## q_1290 64 56684 4.43 1.33 1 6 5 0.01
## q_1333 65 57010 4.08 1.50 1 6 5 0.01
## q_1364 66 56038 4.67 1.45 1 6 5 0.01
## q_1374 67 57062 4.16 1.38 1 6 5 0.01
## q_1385 68 57084 4.95 1.09 1 6 5 0.00
## q_1388 69 57056 4.01 1.60 1 6 5 0.01
## q_1392 70 57003 4.58 1.20 1 6 5 0.01
## q_1397 71 56715 2.73 1.41 1 6 5 0.01
## q_1410 72 56209 4.39 1.44 1 6 5 0.01
## q_1419 73 56077 4.61 1.18 1 6 5 0.00
## q_1422 74 57075 4.21 1.31 1 6 5 0.01
## q_1452 75 56685 2.28 1.31 1 6 5 0.01
## q_1479 76 55972 3.15 1.53 1 6 5 0.01
## q_1480 77 56834 3.06 1.50 1 6 5 0.01
## q_1483 78 56770 3.27 1.67 1 6 5 0.01
## q_1505 79 56005 3.00 1.59 1 6 5 0.01
## q_1507 80 56784 4.74 1.16 1 6 5 0.00
## q_1585 81 56738 3.24 1.45 1 6 5 0.01
## q_1677 82 56695 3.36 1.49 1 6 5 0.01
## q_1683 83 56629 3.61 1.52 1 6 5 0.01
## q_1696 84 56533 2.61 1.34 1 6 5 0.01
## q_1705 85 57010 4.99 0.98 1 6 5 0.00
## q_1738 86 56067 4.90 1.14 1 6 5 0.00
## q_1742 87 56672 4.27 1.39 1 6 5 0.01
## q_1763 88 56749 4.85 1.15 1 6 5 0.00
## q_1768 89 56005 4.33 1.31 1 6 5 0.01
## q_1775 90 57002 3.32 1.48 1 6 5 0.01
## q_1792 91 56656 4.65 1.12 1 6 5 0.00
## q_1803 92 56724 3.65 1.68 1 6 5 0.01
## q_1832 93 57047 4.30 1.25 1 6 5 0.01
## q_1861 94 56664 2.66 1.42 1 6 5 0.01
## q_1893 95 56675 3.64 1.49 1 6 5 0.01
## q_1913 96 57047 3.09 1.40 1 6 5 0.01
## q_1949 97 56037 3.44 1.56 1 6 5 0.01
## q_1964 98 55945 2.51 1.29 1 6 5 0.01
## q_1989 99 56748 4.36 1.40 1 6 5 0.01
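As a quick sanity check on the technical problems noted earlier, the response counts for the two partially collected items can be computed directly (a minimal sketch, assuming taFinal is still loaded):

# q_316 and q_995 have only ~5,700-5,800 responses apiece, versus ~56,000 for
# the other items (cf. the describe() output above).
sapply(c("q_316", "q_995"), function(q) sum(!is.na(taFinal[[q]])))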
# table by gender
table(taFinal$gender)
##
## Male Female
## 37864 76097
# describe the age distribution
describe(taFinal$age, fast = TRUE)
## vars n mean sd min max range se
## X1 1 113961 27.3 11.17 14 90 76 0.03
# how many countries are represented?
length(table(taFinal$country))
## [1] 204
# show the top 25 countries
sort(table(taFinal$country), decreasing = TRUE)[1:25]
##
## USA CAN GBR AUS IND MYS PHL CHN DEU SGP SWE MEX
## 83110 5661 4025 3647 1795 844 756 645 628 617 574 556
## POL NLD IRL NZL ZAF KOR HKG BRA NOR FRA ROU PAK
## 473 453 417 398 398 352 343 317 314 304 301 235
## ITA
## 215
# show the US states, in order
sort(table(taFinal$state), decreasing = TRUE)[1:51]
##
## California Illinois New York
## 9709 5520 4942
## Pennsylvania Texas Ohio
## 4758 4662 3600
## Florida Virginia Michigan
## 2936 2787 2549
## New Jersey Georgia Wisconsin
## 2495 2414 2377
## Minnesota Louisiana Massachusetts
## 2104 2030 1935
## Maryland Washington Indiana
## 1772 1742 1707
## Missouri North Carolina Oregon
## 1611 1454 1203
## New Mexico Tennessee Colorado
## 1199 1133 1097
## South Carolina Connecticut Iowa
## 1010 986 982
## Arizona Kentucky Kansas
## 866 820 808
## Oklahoma Alabama Mississippi
## 771 643 604
## Delaware Nebraska Arkansas
## 592 580 577
## Alaska Utah Rhode Island
## 555 487 422
## New Hampshire West Virginia Maine
## 389 384 356
## Idaho Hawaii Nevada
## 340 292 274
## Montana North Dakota District of Columbia
## 243 190 174
## South Dakota Vermont Wyoming
## 172 161 126
# table by ethnicity (the variable is empty in this sample)
table(taFinal$ethnic)
## < table of extent 0 >
# table by education
table(taFinal$education)
##
## Less than 12 years High school graduate
## 15052 9037
## Some college did not graduate Currently attending college
## 10736 42080
## College graduate Graduate or professional degree
## 18296 18760
2.2.2 2010 to 2017
From August 18, 2010 to February 7, 2017, data were collected via SAPA-Project.org. While many more variables (including many more demographic variables) were collected in this sample, the data are pared down here to match the older set. Note that large subsets of these data have previously been placed in the public domain (see https://dataverse.harvard.edu/dataverse/SAPA-Project).
Here, we pull in the second sample, subset it, and describe the result.
load(url(sprintf("%s/data/SAPAdata18aug2010thru7feb2017.rdata?raw=true", data_path)))
dim(SAPAdata18aug2010thru7feb2017)  # Dimensions before removing non-responders.
## [1] 255348 953
# Identify Ps who didn't answer any IPIP100 items (same logic as above)
empty <- rowSums(subset(SAPAdata18aug2010thru7feb2017, select = c(q_55:q_1989)),
na.rm = TRUE)
SAPAdata18aug2010thru7feb2017 <- SAPAdata18aug2010thru7feb2017[empty > 0, ]
dim(SAPAdata18aug2010thru7feb2017)  # The 158 participants who did not answer any IPIP100 items are removed.
## [1] 255190 953
SAPAdata18aug2010thru7feb2017 <- SAPAdata18aug2010thru7feb2017 %>% select(one_of(c("q_55",
colnames(taFinal)))) %>% select(RID:education, q_55, everything())
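As a check that the paring down worked, the only item that should be present in the newer sample but absent from the older one is q_55 (a minimal sketch, assuming both objects are still in memory):

# After subsetting, the two samples should differ only by q_55.
setdiff(colnames(SAPAdata18aug2010thru7feb2017), colnames(taFinal))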
describe(subset(SAPAdata18aug2010thru7feb2017, select = c(q_55:q_1989)), fast = TRUE)
## vars n mean sd min max range se
## q_55 1 59248 4.05 1.49 1 6 5 0.01
## q_76 2 59282 4.18 1.26 1 6 5 0.01
## q_108 3 57880 3.21 1.45 1 6 5 0.01
## q_124 4 70237 4.45 1.18 1 6 5 0.00
## q_128 5 69812 4.77 1.14 1 6 5 0.00
## q_132 6 69751 4.64 1.15 1 6 5 0.00
## q_140 7 59834 3.41 1.63 1 6 5 0.01
## q_146 8 68259 2.38 1.36 1 6 5 0.01
## q_150 9 68308 4.81 1.19 1 6 5 0.00
## q_177 10 58340 3.44 1.49 1 6 5 0.01
## q_194 11 59219 2.43 1.31 1 6 5 0.01
## q_195 12 68069 2.61 1.38 1 6 5 0.01
## q_200 13 68525 2.41 1.33 1 6 5 0.01
## q_217 14 59074 4.70 1.21 1 6 5 0.00
## q_240 15 69644 4.82 1.05 1 6 5 0.00
## q_241 16 59522 4.01 1.58 1 6 5 0.01
## q_248 17 58209 4.00 1.37 1 6 5 0.01
## q_254 18 60182 4.21 1.41 1 6 5 0.01
## q_262 19 59494 3.18 1.54 1 6 5 0.01
## q_316 20 69979 2.70 1.54 1 6 5 0.01
## q_403 21 59382 3.98 1.53 1 6 5 0.01
## q_422 22 69499 4.61 1.21 1 6 5 0.00
## q_492 23 69740 4.41 1.20 1 6 5 0.00
## q_493 24 69509 4.91 1.06 1 6 5 0.00
## q_497 25 58027 3.44 1.52 1 6 5 0.01
## q_530 26 70916 4.31 1.27 1 6 5 0.00
## q_609 27 59701 2.19 1.36 1 6 5 0.01
## q_619 28 70528 4.31 1.25 1 6 5 0.00
## q_626 29 59498 2.59 1.35 1 6 5 0.01
## q_690 30 59601 3.96 1.48 1 6 5 0.01
## q_698 31 70353 3.54 1.61 1 6 5 0.01
## q_712 32 59897 3.05 1.61 1 6 5 0.01
## q_815 33 59519 4.20 1.36 1 6 5 0.01
## q_819 34 59304 4.27 1.39 1 6 5 0.01
## q_838 35 68245 2.35 1.46 1 6 5 0.01
## q_844 36 68357 4.68 1.23 1 6 5 0.00
## q_890 37 58185 2.89 1.47 1 6 5 0.01
## q_901 38 59862 3.27 1.58 1 6 5 0.01
## q_904 39 59520 3.21 1.54 1 6 5 0.01
## q_931 40 70518 4.02 1.45 1 6 5 0.01
## q_952 41 58427 2.92 1.52 1 6 5 0.01
## q_960 42 58032 3.70 1.44 1 6 5 0.01
## q_962 43 59196 3.37 1.54 1 6 5 0.01
## q_974 44 58193 3.49 1.49 1 6 5 0.01
## q_979 45 58197 3.62 1.59 1 6 5 0.01
## q_986 46 58344 3.79 1.57 1 6 5 0.01
## q_995 47 58004 3.25 1.52 1 6 5 0.01
## q_1020 48 58039 3.42 1.39 1 6 5 0.01
## q_1041 49 68142 4.30 1.26 1 6 5 0.00
## q_1050 50 69752 4.34 1.29 1 6 5 0.00
## q_1053 51 68453 4.70 1.26 1 6 5 0.00
## q_1058 52 59526 4.81 1.21 1 6 5 0.00
## q_1083 53 59048 2.00 1.17 1 6 5 0.00
## q_1088 54 69366 2.48 1.30 1 6 5 0.00
## q_1090 55 69251 4.67 1.03 1 6 5 0.00
## q_1099 56 58276 3.19 1.56 1 6 5 0.01
## q_1114 57 59641 2.72 1.47 1 6 5 0.01
## q_1162 58 68437 4.79 1.16 1 6 5 0.00
## q_1163 59 58733 2.40 1.41 1 6 5 0.01
## q_1180 60 59365 3.47 1.47 1 6 5 0.01
## q_1205 61 69960 3.92 1.34 1 6 5 0.01
## q_1206 62 68167 4.57 1.28 1 6 5 0.00
## q_1254 63 59382 3.71 1.73 1 6 5 0.01
## q_1255 64 59350 3.33 1.64 1 6 5 0.01
## q_1290 65 70597 4.49 1.27 1 6 5 0.00
## q_1333 66 69656 4.15 1.46 1 6 5 0.01
## q_1364 67 68534 4.63 1.47 1 6 5 0.01
## q_1374 68 70631 4.25 1.33 1 6 5 0.01
## q_1385 69 68461 5.01 1.09 1 6 5 0.00
## q_1388 70 69687 3.98 1.58 1 6 5 0.01
## q_1392 71 69585 4.58 1.20 1 6 5 0.00
## q_1397 72 59299 2.82 1.42 1 6 5 0.01
## q_1410 73 59784 4.30 1.47 1 6 5 0.01
## q_1419 74 68562 4.61 1.16 1 6 5 0.00
## q_1422 75 59488 4.27 1.29 1 6 5 0.01
## q_1452 76 59530 2.31 1.32 1 6 5 0.01
## q_1479 77 58257 3.17 1.53 1 6 5 0.01
## q_1480 78 59220 3.11 1.51 1 6 5 0.01
## q_1483 79 59564 3.29 1.64 1 6 5 0.01
## q_1505 80 58229 3.05 1.60 1 6 5 0.01
## q_1507 81 70717 4.80 1.13 1 6 5 0.00
## q_1585 82 58116 3.23 1.44 1 6 5 0.01
## q_1677 83 58010 3.27 1.49 1 6 5 0.01
## q_1683 84 58091 3.62 1.50 1 6 5 0.01
## q_1696 85 58914 2.60 1.34 1 6 5 0.01
## q_1705 86 68148 4.99 1.01 1 6 5 0.00
## q_1738 87 59777 4.93 1.13 1 6 5 0.00
## q_1742 88 59128 4.15 1.43 1 6 5 0.01
## q_1763 89 68336 4.83 1.18 1 6 5 0.00
## q_1768 90 70279 4.33 1.31 1 6 5 0.00
## q_1775 91 58307 3.19 1.48 1 6 5 0.01
## q_1792 92 68363 4.68 1.14 1 6 5 0.00
## q_1803 93 59615 3.55 1.70 1 6 5 0.01
## q_1832 94 68502 4.43 1.25 1 6 5 0.00
## q_1861 95 59797 2.72 1.43 1 6 5 0.01
## q_1893 96 59820 3.73 1.46 1 6 5 0.01
## q_1913 97 70046 3.08 1.41 1 6 5 0.01
## q_1949 98 59504 3.39 1.57 1 6 5 0.01
## q_1964 99 59480 2.46 1.25 1 6 5 0.01
## q_1989 100 58160 4.43 1.40 1 6 5 0.01
# table by gender
table(SAPAdata18aug2010thru7feb2017$gender)
##
## male female
## 95169 160020
# describe the age distribution
describe(SAPAdata18aug2010thru7feb2017$age, fast = TRUE)
## vars n mean sd min max range se
## X1 1 255190 25.94 10.73 14 90 76 0.02
# how many countries are represented?
length(table(SAPAdata18aug2010thru7feb2017$country))
## [1] 220
# show the top 25 countries
sort(table(SAPAdata18aug2010thru7feb2017$country), decreasing = TRUE)[1:25]
##
## USA CAN GBR AUS IND SWE DEU PHL MYS CHN
## 168816 10581 7214 4790 2782 2425 2346 2239 2151 1441
## NLD SGP NOR NZL MEX FRA BRA ZAF FIN IRL
## 1301 1269 1170 1033 984 844 830 793 772 673
## IDN POL ROU HKG ITA
## 646 641 623 576 568
# show the US states, in order
state <- subset(SAPAdata18aug2010thru7feb2017, country == "USA")
sort(table(state$state), decreasing = TRUE)[1:51]
##
## California Illinois Texas
## 18062 11611 10367
## New York Florida Pennsylvania
## 8882 8054 7919
## Michigan Ohio Virginia
## 7660 6213 5763
## Georgia Washington Minnesota
## 5308 4753 4090
## North Carolina Wisconsin New Jersey
## 3607 3394 3290
## Massachusetts Indiana Maryland
## 3147 2938 2689
## Missouri Arizona Louisiana
## 2567 2495 2487
## South Carolina New Mexico Oregon
## 2479 2223 2213
## Tennessee Delaware Alabama
## 2135 1929 1865
## Colorado Kentucky Oklahoma
## 1816 1757 1643
## Iowa Connecticut Nebraska
## 1441 1355 1348
## Utah Kansas Mississippi
## 1309 1165 978
## Arkansas Rhode Island Nevada
## 925 825 763
## Idaho New Hampshire Alaska
## 683 611 578
## Maine Montana West Virginia
## 576 553 512
## Hawaii South Dakota Vermont
## 487 401 312
## District of Columbia North Dakota Wyoming
## 309 268 216
# table by ethnicity (the variable is empty in this sample)
table(SAPAdata18aug2010thru7feb2017$ethnic)
## < table of extent 0 >
# table by education
table(SAPAdata18aug2010thru7feb2017$education)
##
## less12yrs HSgrad CurrentInUniv SomeCollege
## 33011 17517 104783 13006
## CollegeDegree InGradOrProSchool GradOrProDegree
## 32020 11431 19199
2.2.3 Merge the Datasets and Describe
Now we combine the two samples and describe the merged data with tables and figures before converting the demographics into numeric variables. Note that the two files label some demographic levels differently (e.g., "Female" vs. "female" for gender, "Less than 12 years" vs. "less12yrs" for education); the education levels are harmonized below.
fullSample <- taFinal %>% select(-contains("iq_")) %>% full_join(SAPAdata18aug2010thru7feb2017) %>%
select(RID:education, q_55, everything())
rm(SAPAdata18aug2010thru7feb2017, taFinal)
gc()  # free memory held by the two large raw objects just removed
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 4507029 240.8 10045189 536.5 NA 10045189 536.5
## Vcells 107160950 817.6 392190552 2992.2 51200 490231665 3740.2
# n is total Ps in the full sample
n <- dim(fullSample)[1]
ntext <- prettyNum(n, big.mark = ",", scientific = F)
ntext
## [1] "369,151"
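As an arithmetic check, the merged total equals the sum of the two cleaned samples (113,961 + 255,190 = 369,151), confirming that full_join() stacked the samples without collapsing any rows:

# Sanity check: the two samples simply stack, with no cross-sample matches.
n == 113961 + 255190  # TRUE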
# percent female
femalePerc <- 100 * round((table(fullSample$gender)/sum(table(fullSample$gender)))["female"],
3)
femalePerc
## female
## 43.3
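Note that the two samples code gender differently ("Female" in the older file, "female" in the newer one), so indexing only the "female" level above captures just the newer sample's share of the merged total; the harmonized percentage is roughly 64%, given the two gender tables above. A minimal sketch of the harmonized calculation:

# Harmonize the two gender codings before computing the percentage (sketch).
gender_all <- tolower(as.character(fullSample$gender))
100 * round(mean(gender_all == "female", na.rm = TRUE), 3)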
# describe the age distribution
(fullSampleAge <- psych::describe(fullSample[, "age"]))
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 369151 26.36 10.89 22 24.53 7.41 14 90 76 1.48 1.86
## se
## X1 0.02
(MeanAge <- fullSampleAge$mean)
## [1] 26.36206
(SDAge <- fullSampleAge$sd)
## [1] 10.88535
(MedianAge <- fullSampleAge$median)
## [1] 22
(MinAge <- fullSampleAge$min)
## [1] 14
(MaxAge <- fullSampleAge$max)
## [1] 90
# percent U.S.
(USperc <- 100 * round((table(fullSample$country)/sum(table(fullSample$country)))["USA"],
3))
## USA
## 72.5
# total U.S.
(UScounttext <- prettyNum((table(fullSample$country))["USA"], big.mark = ",",
scientific = F))
## USA
## "251,926"
# number of countries with P>0
countryTable <- sort(table(fullSample$country), decreasing = TRUE)
countryCount <- length(countryTable)
# number of countries with P>199
countryCount200 <- subset(countryTable, countryTable > 199)
length(countryCount200)
## [1] 67
# show the top 25 countries
sort(table(fullSample$country), decreasing = TRUE)[1:25]
##
## USA CAN GBR AUS IND SWE MYS PHL DEU CHN
## 251926 16242 11239 8437 4577 2999 2995 2995 2974 2086
## SGP NLD MEX NOR NZL ZAF FRA BRA POL IRL
## 1886 1754 1540 1484 1431 1191 1148 1147 1114 1090
## FIN ROU HKG IDN ITA
## 963 924 919 847 783
# total Ps with country information (country is missing for 369,151 - 347,672 = 21,479 Ps)
(countrytext <- prettyNum(sum(table(fullSample$country)), big.mark = ",", scientific = F))
## [1] "347,672"
# show the US states, in order
state <- subset(fullSample, country == "USA")
sort(table(state$state), decreasing = TRUE)[1:51]
##
## California Illinois Texas
## 27771 17131 15029
## New York Pennsylvania Florida
## 13824 12677 10990
## Michigan Ohio Virginia
## 10209 9813 8550
## Georgia Washington Minnesota
## 7722 6495 6194
## New Jersey Wisconsin Massachusetts
## 5785 5771 5082
## North Carolina Indiana Louisiana
## 5061 4645 4517
## Maryland Missouri South Carolina
## 4461 4178 3489
## New Mexico Oregon Arizona
## 3422 3416 3361
## Tennessee Colorado Kentucky
## 3268 2913 2577
## Delaware Alabama Iowa
## 2521 2508 2423
## Oklahoma Connecticut Kansas
## 2414 2341 1973
## Nebraska Utah Mississippi
## 1928 1796 1582
## Arkansas Rhode Island Alaska
## 1502 1247 1133
## Nevada Idaho New Hampshire
## 1037 1023 1000
## Maine West Virginia Montana
## 932 896 796
## Hawaii South Dakota District of Columbia
## 779 573 483
## Vermont North Dakota Wyoming
## 473 458 342
education <- data.frame(table(fullSample$education))
colnames(education) <- c("Educational Level", "Participants")
(college <- prettyNum(round(education[4, 2]/sum(education[, 2]) * 100, 0), big.mark = ",",
scientific = F))
## [1] "12"
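Positional indexing like education[4, 2] depends silently on the alphabetical ordering of the factor levels, which here mixes the two samples' coding schemes; a name-based lookup is more robust. A sketch with a hypothetical target level, since the intended row is not obvious from the code:

# Select the education level by name rather than by row position (sketch).
lvl <- "CurrentInUniv"  # hypothetical target level; substitute as intended
round(100 * education$Participants[education$`Educational Level` == lvl] /
        sum(education$Participants), 0)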
ethnic <- data.frame(table(fullSample$race))
colnames(ethnic) <- c("Ethnicity/Race", "Participants")

# Restore the merged sample from the saved object (created and saved at the end of this section).
fullSample <- IPIP100items04apr2006thru7feb2017
fullSample %>%
filter(!is.na(country)) %>%
group_by(country) %>%
summarize(n = n()) %>%
arrange(desc(n)) %>%
filter(row_number() %in% 1:25) %>%
kable(., "html", booktabs = T, escape = F,
caption = "Sample Sizes in Top 25 Countries",
col.names = c("Country", "Frequency")) %>%
kable_styling(full_width = F)

| Country | Frequency |
|---|---|
| USA | 251913 |
| CAN | 16242 |
| GBR | 11237 |
| AUS | 8437 |
| IND | 4576 |
| SWE | 2999 |
| MYS | 2995 |
| PHL | 2994 |
| DEU | 2974 |
| CHN | 2086 |
| SGP | 1885 |
| NLD | 1754 |
| MEX | 1540 |
| NOR | 1484 |
| NZL | 1430 |
| ZAF | 1191 |
| FRA | 1148 |
| BRA | 1147 |
| POL | 1114 |
| IRL | 1090 |
| FIN | 963 |
| ROU | 924 |
| HKG | 919 |
| IDN | 847 |
| ITA | 782 |
ethnic %>%
arrange(desc(Participants)) %>%
kable(., "html", booktabs = T, escape = F,
caption = "Race/Ethnicity Among U.S. Participants in the full sample") %>%
kable_styling(full_width = F)

| Ethnicity/Race | Participants |
|---|---|
| White/Caucasian | 63763 |
| African American | 6354 |
| Other | 3201 |
| Mexican | 2249 |
| Latino | 2184 |
| Chinese | 1189 |
| Native American | 741 |
| Philipino | 649 |
| Other Asian | 597 |
| Puerto Rican | 555 |
| Korean | 528 |
| Indian/Pakistani | 498 |
| Pacific Islander | 329 |
| Japanese | 266 |
levs <- tibble(old = c("less12yrs", "Less than 12 years", "HSgrad", "High school graduate", "SomeCollege",
"Some college did not graduate", "CurrentInUniv", "Currently attending college",
"CollegeDegree", "College graduate", "InGradOrProSchool", "Graduate or professional degree",
"GradOrProDegree"),
new = c(rep("Less than 12 years", 2), rep("High school graduate", 2),
rep("Some college/university, but did not graduate", 2),
rep("Currently in college/university", 2), rep("College/university degree", 2),
"Currently in graduate or professional school",
rep("Graduate or professional school degree", 2)))
education %>%
mutate(`Educational Level` = mapvalues(`Educational Level`, levs$old, levs$new)) %>%
group_by(`Educational Level`) %>%
summarize(Participants = sum(Participants)) %>%
ungroup() %>%
mutate(`Educational Level` = factor(`Educational Level`, levels = unique(levs$new))) %>%
arrange(`Educational Level`) %>%
kable(., "html", booktabs = T, escape = F,
caption = "Educational Attainment Level in the full sample") %>%
kable_styling(full_width = F)

| Educational Level | Participants |
|---|---|
| Less than 12 years | 48063 |
| High school graduate | 26554 |
| Some college/university, but did not graduate | 23742 |
| Currently in college/university | 146863 |
| College/university degree | 50316 |
| Currently in graduate or professional school | 11431 |
| Graduate or professional school degree | 37959 |
It’s also useful to describe the extent of the missingness in the data (for the IPIP100 variables) and the number of pairwise administrations across the items. While we’re at it, we’ll also bootstrap estimates of the standard errors of the item-level correlations.
items <- colnames(subset(fullSample, select = c(q_55:q_1989)))
not_na_count <- rowSums(!is.na(fullSample[, items]))  # non-missing IPIP items per participant
partAdminsMean <- round(mean(not_na_count), 0)
partAdminsMean <- prettyNum(partAdminsMean, big.mark=",", scientific=F)
partAdminsMedian <- round(median(not_na_count), 0)
partAdminsMedian <- prettyNum(partAdminsMedian, big.mark=",", scientific=F)
partAdminsSD <- round(sd(not_na_count), 0)
partAdminsSD <- prettyNum(partAdminsSD, big.mark=",", scientific=F)
described <- describe(fullSample[,items])
AdminsMean <- round(mean(described$n), 0)
AdminsMean <- prettyNum(AdminsMean, big.mark=",", scientific=F)
AdminsMedian <- round(median(described$n), 0)
AdminsMedian <- prettyNum(AdminsMedian, big.mark=",", scientific=F)
AdminsSD <- round(sd(described$n), 0)
AdminsSD <- prettyNum(AdminsSD, big.mark=",", scientific=F)
# The mean, median and sd of administrations of items
AdminsMean
AdminsMedian
AdminsSD
rm(described)
pwiseAdmins1 <- pairwiseDescribe(fullSample[,items])
pwiseAdmins <- prettyNum(pwiseAdmins1, big.mark=",", scientific=F)
pwiseAdminsMean <- round(pwiseAdmins1$mean, 0)
pwiseAdminsMean <- prettyNum(pwiseAdminsMean, big.mark=",", scientific=F)
pwiseAdminsMedian <- round(pwiseAdmins1$median, 0)
pwiseAdminsMedian <- prettyNum(pwiseAdminsMedian, big.mark=",", scientific=F)
pwiseAdminsSD <- round(pwiseAdmins1$sd, 0)
pwiseAdminsSD <- prettyNum(pwiseAdminsSD, big.mark=",", scientific=F)
pwiseAdminsMin <- round(pwiseAdmins1$min, 0)
pwiseAdminsMin <- prettyNum(pwiseAdminsMin, big.mark=",", scientific=F)
# The mean, median and sd of pairwise administrations of items
pwiseAdminsMean
pwiseAdminsMedian
pwiseAdminsSD
#rm(pwiseAdmins1)
# the CIs around the rs
IPIP100CorCI <- cor.ci(fullSample[,items], p=.05, n.iter = 100, overlap=FALSE, plot = FALSE, poly = FALSE)
# Bootstrapping the CIs is expensive, so the result is cached and then reloaded from the repo.
save(IPIP100CorCI, file = "~/Downloads/cor_stab.RData")
load(url(sprintf("%s/results/cor_stab.RData?raw=true", data_path)))
plot(density(IPIP100CorCI$sds), main="", ylab="Density", xlab="Standard error of item-level correlations", xlim= c(0,.02), ylim = c(0,1000))
Figure 2.1: Standard error of item-level correlations (Supplementary Figure 2).
Clean up some of the helper objects and save the data for further analysis later.
IPIP100items04apr2006thru7feb2017 <- fullSample
ItemInfo100 <- ItemInfo[items,1:2]
keys.list.trimmed <- keys.list[c("IPIP100agreeableness20","IPIP100conscientiousness20", "IPIP100extraversion20", "IPIP100intellect20", "IPIP100EmotionalStability20")]
ItemLists.trimmed <- ItemLists[c("IPIP100agreeableness20","IPIP100conscientiousness20", "IPIP100extraversion20", "IPIP100intellect20", "IPIP100EmotionalStability20")]
# filepathdata is assumed to be set elsewhere in the project
save(IPIP100items04apr2006thru7feb2017, ItemInfo100, ItemLists.trimmed, keys.list.trimmed, file=paste(filepathdata, "IPIP100items04apr2006thru7feb2017.rdata", sep=""))
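A downstream chapter can then restore the trimmed data and scoring keys in one step (a minimal sketch, assuming the same filepathdata):

# Reload the saved data and keys for later analyses (sketch).
load(paste(filepathdata, "IPIP100items04apr2006thru7feb2017.rdata", sep = ""))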