Author

Emorie D Beck


Outline

  1. Questions on Homework
  2. dplyr
  3. tidyr
  4. Functions
  5. purrr

dplyr: select() and filter()

1. select()

Add or remove columns using select() and its helper functions:

  • starts_with()
  • ends_with()
  • contains()
  • matches()
  • num_range()
  • one_of()
  • all_of()
Code
bfi |>
  select(starts_with("C"))
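  • A few of the other helpers, applied to the same psych::bfi data (a sketch; it assumes the standard bfi columns such as C1-C5, the other *5 items, and age):
Code
bfi |>
  select(
    num_range("C", 1:3)   # C1, C2, C3
    , ends_with("5")      # A5, C5, E5, N5, O5
    , contains("age")     # age
  )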

2. filter()

  • Oftentimes, when conducting research (experiments or otherwise), there are observations (people, specific trials, etc.) that you don’t want to include.

  • We can use filter() with logical statements to keep only rows that match certain conditions (see the sketch after this list)

  • We can refer to both bare (unquoted) column names and objects in the global environment

    • == or !=
    • < or <=
    • > or >=
    • %in%
    • !
    • | and &
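  • A sketch of a few of these operators on psych::bfi (assumes the bfi columns age and education, with education coded 1-5):
Code
min_age <- 18 # an object in the global environment

bfi |>
  filter(age >= min_age & !is.na(education)) # bare column names plus an object

bfi |>
  filter(education %in% c(4, 5)) # keep only the higher education codes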

tidyr: pivot_longer() and pivot_wider()

1. pivot_longer()

  • (Formerly gather()) Makes wide data long, based on a key
  • Core arguments:
    • data: the data, blank if piped
    • cols: columns to be made long, selected via select() calls
    • names_to: name(s) of key column(s) in new long data frame (string or string vector)
    • values_to: name of values in new long data frame (string)
    • names_sep: separator in column headers, if multiple keys
    • values_drop_na: drop missing cells (similar to na.rm = T)

Why would I make my data longer?

  • Main reason: Column names sometimes contain data.
  • Example: Billboard data has time information in column names
Code
data(billboard)
str(billboard)
tibble [317 × 79] (S3: tbl_df/tbl/data.frame)
 $ artist      : chr [1:317] "2 Pac" "2Ge+her" "3 Doors Down" "3 Doors Down" ...
 $ track       : chr [1:317] "Baby Don't Cry (Keep..." "The Hardest Part Of ..." "Kryptonite" "Loser" ...
 $ date.entered: Date[1:317], format: "2000-02-26" "2000-09-02" ...
 $ wk1         : num [1:317] 87 91 81 76 57 51 97 84 59 76 ...
 $ wk2         : num [1:317] 82 87 70 76 34 39 97 62 53 76 ...
 $ wk3         : num [1:317] 72 92 68 72 25 34 96 51 38 74 ...
 $ wk4         : num [1:317] 77 NA 67 69 17 26 95 41 28 69 ...
 $ wk5         : num [1:317] 87 NA 66 67 17 26 100 38 21 68 ...
 $ wk6         : num [1:317] 94 NA 57 65 31 19 NA 35 18 67 ...
 $ wk7         : num [1:317] 99 NA 54 55 36 2 NA 35 16 61 ...
 $ wk8         : num [1:317] NA NA 53 59 49 2 NA 38 14 58 ...
 $ wk9         : num [1:317] NA NA 51 62 53 3 NA 38 12 57 ...
 $ wk10        : num [1:317] NA NA 51 61 57 6 NA 36 10 59 ...
 $ wk11        : num [1:317] NA NA 51 61 64 7 NA 37 9 66 ...
 $ wk12        : num [1:317] NA NA 51 59 70 22 NA 37 8 68 ...
 $ wk13        : num [1:317] NA NA 47 61 75 29 NA 38 6 61 ...
 $ wk14        : num [1:317] NA NA 44 66 76 36 NA 49 1 67 ...
 $ wk15        : num [1:317] NA NA 38 72 78 47 NA 61 2 59 ...
 $ wk16        : num [1:317] NA NA 28 76 85 67 NA 63 2 63 ...
 $ wk17        : num [1:317] NA NA 22 75 92 66 NA 62 2 67 ...
 $ wk18        : num [1:317] NA NA 18 67 96 84 NA 67 2 71 ...
 $ wk19        : num [1:317] NA NA 18 73 NA 93 NA 83 3 79 ...
 $ wk20        : num [1:317] NA NA 14 70 NA 94 NA 86 4 89 ...
 $ wk21        : num [1:317] NA NA 12 NA NA NA NA NA 5 NA ...
 $ wk22        : num [1:317] NA NA 7 NA NA NA NA NA 5 NA ...
 $ wk23        : num [1:317] NA NA 6 NA NA NA NA NA 6 NA ...
 $ wk24        : num [1:317] NA NA 6 NA NA NA NA NA 9 NA ...
 $ wk25        : num [1:317] NA NA 6 NA NA NA NA NA 13 NA ...
 $ wk26        : num [1:317] NA NA 5 NA NA NA NA NA 14 NA ...
 $ wk27        : num [1:317] NA NA 5 NA NA NA NA NA 16 NA ...
 $ wk28        : num [1:317] NA NA 4 NA NA NA NA NA 23 NA ...
 $ wk29        : num [1:317] NA NA 4 NA NA NA NA NA 22 NA ...
 $ wk30        : num [1:317] NA NA 4 NA NA NA NA NA 33 NA ...
 $ wk31        : num [1:317] NA NA 4 NA NA NA NA NA 36 NA ...
 $ wk32        : num [1:317] NA NA 3 NA NA NA NA NA 43 NA ...
 $ wk33        : num [1:317] NA NA 3 NA NA NA NA NA NA NA ...
 $ wk34        : num [1:317] NA NA 3 NA NA NA NA NA NA NA ...
 $ wk35        : num [1:317] NA NA 4 NA NA NA NA NA NA NA ...
 $ wk36        : num [1:317] NA NA 5 NA NA NA NA NA NA NA ...
 $ wk37        : num [1:317] NA NA 5 NA NA NA NA NA NA NA ...
 $ wk38        : num [1:317] NA NA 9 NA NA NA NA NA NA NA ...
 $ wk39        : num [1:317] NA NA 9 NA NA NA NA NA NA NA ...
 $ wk40        : num [1:317] NA NA 15 NA NA NA NA NA NA NA ...
 $ wk41        : num [1:317] NA NA 14 NA NA NA NA NA NA NA ...
 $ wk42        : num [1:317] NA NA 13 NA NA NA NA NA NA NA ...
 $ wk43        : num [1:317] NA NA 14 NA NA NA NA NA NA NA ...
 $ wk44        : num [1:317] NA NA 16 NA NA NA NA NA NA NA ...
 $ wk45        : num [1:317] NA NA 17 NA NA NA NA NA NA NA ...
 $ wk46        : num [1:317] NA NA 21 NA NA NA NA NA NA NA ...
 $ wk47        : num [1:317] NA NA 22 NA NA NA NA NA NA NA ...
 $ wk48        : num [1:317] NA NA 24 NA NA NA NA NA NA NA ...
 $ wk49        : num [1:317] NA NA 28 NA NA NA NA NA NA NA ...
 $ wk50        : num [1:317] NA NA 33 NA NA NA NA NA NA NA ...
 $ wk51        : num [1:317] NA NA 42 NA NA NA NA NA NA NA ...
 $ wk52        : num [1:317] NA NA 42 NA NA NA NA NA NA NA ...
 $ wk53        : num [1:317] NA NA 49 NA NA NA NA NA NA NA ...
 $ wk54        : num [1:317] NA NA NA NA NA NA NA NA NA NA ...
 $ wk55        : num [1:317] NA NA NA NA NA NA NA NA NA NA ...
 $ wk56        : num [1:317] NA NA NA NA NA NA NA NA NA NA ...
 $ wk57        : num [1:317] NA NA NA NA NA NA NA NA NA NA ...
 $ wk58        : num [1:317] NA NA NA NA NA NA NA NA NA NA ...
 $ wk59        : num [1:317] NA NA NA NA NA NA NA NA NA NA ...
 $ wk60        : num [1:317] NA NA NA NA NA NA NA NA NA NA ...
 $ wk61        : num [1:317] NA NA NA NA NA NA NA NA NA NA ...
 $ wk62        : num [1:317] NA NA NA NA NA NA NA NA NA NA ...
 $ wk63        : num [1:317] NA NA NA NA NA NA NA NA NA NA ...
 $ wk64        : num [1:317] NA NA NA NA NA NA NA NA NA NA ...
 $ wk65        : num [1:317] NA NA NA NA NA NA NA NA NA NA ...
 $ wk66        : logi [1:317] NA NA NA NA NA NA ...
 $ wk67        : logi [1:317] NA NA NA NA NA NA ...
 $ wk68        : logi [1:317] NA NA NA NA NA NA ...
 $ wk69        : logi [1:317] NA NA NA NA NA NA ...
 $ wk70        : logi [1:317] NA NA NA NA NA NA ...
 $ wk71        : logi [1:317] NA NA NA NA NA NA ...
 $ wk72        : logi [1:317] NA NA NA NA NA NA ...
 $ wk73        : logi [1:317] NA NA NA NA NA NA ...
 $ wk74        : logi [1:317] NA NA NA NA NA NA ...
 $ wk75        : logi [1:317] NA NA NA NA NA NA ...
 $ wk76        : logi [1:317] NA NA NA NA NA NA ...
Code
billboard |>
  pivot_longer(
    cols = starts_with("wk"), 
    names_to = "week", 
    names_prefix = "wk",
    names_transform = as.numeric,
    values_to = "rank"
  )
  • This doesn’t just apply to longitudinal data. It’s also important when thinking about iteration.
  • For example, if your variables can be grouped into different categories (covariates, IVs/predictors, DVs/outcomes, moderators, etc.), then your column names contain implicit data
  • The data below contain time, variable, and category information
Code
# load the codebook
(codebook <- read_csv("week6-codebook.csv") |>
    mutate(old_name = str_to_lower(old_name)))
Code
old.names <- codebook$old_name # get old column names
new.names <- codebook$new_name # get new column names
soep <- read_csv("week6-data.csv") |>
    select(all_of(old.names))

Exercise

  • By pivoting our data longer, we can more easily extract information from the column names
  • Pivot the data below longer. Some hints:
    • don’t make the procedural or demographic variables long
    • split the column names you make long into four chunks:
      • “category”, “label”, “item_name”, “year”
    • drop NA values

Solution:

Code
soep_long <- soep |>
  setNames(new.names) |>
  pivot_longer(
    cols = c(-starts_with("Proc"), -starts_with("Dem"))
    , names_to = c("category",  "label",    "item_name",    "year")
    , names_pattern = "(.*)_(.*)_(.*)_(.*)"
    , values_to = "value"
    , values_drop_na = T
  ) |> mutate(year = as.numeric(year))
soep_long
  • Long format data are easier to clean (we’ll come back to this, but for now we’ll create the cleaned data frame to use for merging practice)
Code
soep_big5 <- soep_long |>
  filter(category == "big5") |>
  mutate(value = mapvalues(value, seq(-8,0), rep(NA, 9))) |>
  drop_na(value) |>
  group_by(Proc_SID, label, year) |>
  summarize(value = mean(value)) |>
  ungroup()

soep_le <- soep_long |>
  filter(category == "le") |>
  mutate(value = mapvalues(value, seq(-8,1), c(rep(NA, 6), 0, NA, NA, 1))) |>
  drop_na(value) |>
  group_by(Proc_SID, label) |>
  summarize(value = sum(value)) |>
  ungroup()

soep_clean <- soep_big5 |>
  rename(trait = label, p_value = value) |>
  inner_join(
    soep_le |>
      rename(le = label, le_value = value)
  )

2. pivot_wider()

  • (Formerly spread()) Makes long data wide, based on one or more key columns (a toy sketch follows this list)
  • Core arguments:
    • data: the data, blank if piped
    • names_from: column(s) in the long data whose values become the new column names (string or string vector)
    • names_sep: separator placed between keys in the new column names, if multiple keys
    • names_glue: a glue specification for building custom column names from multiple keys
    • values_from: column(s) whose values fill the new wide columns (string or string vector)
    • values_fn: function applied to cells with duplicate key combinations
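  • A minimal sketch on a toy data frame (not the SOEP data) showing names_from and values_from:
Code
toy_long <- tibble(
  id    = c(1, 1, 2, 2)
  , trait = c("E", "A", "E", "A")
  , value = c(3.2, 4.1, 2.8, 3.9)
)

toy_long |>
  pivot_wider(
    names_from = "trait"
    , values_from = "value"
  ) # one row per id, with new columns E and A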

Why would I pivot wider?

  • Some analyses require wide format data
  • For example, SEM in lavaan in R requires that both indicators and time be in wide format.
  • The code below uses the codebook to rename items according to a numbered format that isn’t specific to any trait
Code
big5 <- codebook |> filter(category == "big5")
soep_lavaan <- soep_long |>
  filter(category == "big5") |>
  mutate(item_name = mapvalues(item_name, big5$item_name, big5$lavaan_name, warn_missing = F))
soep_lavaan

Exercise

  • Change the soep_lavaan data frame to be in wide format using pivot_wider():
    • pull the names from two sources: item_name and year
Code
soep_lavaan |>
  pivot_wider(
    names_from = c("item_name", "year")
    , values_from = "value"
  )

dplyr: _join()

The _join() Functions

  • Often we may need to pull different data from different sources
  • There are lots of reasons to need to do this
  • We don’t have time to get into all the use cases here, so we’ll talk about them in high-level terms
  • We’ll focus on the following (a toy sketch follows this list):
    • full_join()
    • inner_join()
    • left_join()
    • right_join()
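  • Before the SOEP examples, here’s a minimal sketch with two toy data frames to see how the four joins differ:
Code
x <- tibble(id = c(1, 2, 3), age   = c(25, 30, 41))
y <- tibble(id = c(2, 3, 4), score = c(10, 12,  9))

full_join(x, y, by = "id")   # ids 1-4; NAs where a row is missing from either side
inner_join(x, y, by = "id")  # ids 2-3 only (present in both)
left_join(x, y, by = "id")   # ids 1-3 (all rows of x)
right_join(x, y, by = "id")  # ids 2-4 (all rows of y)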

3. full_join()

  • Most simply, we can put two data frames back together, keeping all observations.

  • Pro: sometimes we want to maintain missing data (i.e. some people are randomly missing variables and we don’t want to drop them completely)

  • Con: can leave you with lots of NAs

  • Join the codebook to the data below using full_join()

  • Look at the data. What’s going on here?

Code
soep_long |>
  filter(!category == "big5") |> 
  full_join(
    # your code here
  ) |> 
    View()

Here’s the solution:

Code
soep_long |>
  filter(!category == "big5") |>
  full_join(codebook |> select(category, label, item_name, year, item_text)) 
  • note we have lots of missing data because the Big Five portions of the codebook were joined even though we removed that data

4. inner_join()

  • We can also keep all rows present in both data frames

  • Pro: Won’t add rows with missing values in key variables

  • Con: will drop observations that you may want for counts, correlations, etc.

  • Join the codebook to the data below using inner_join()

  • Look at the data. What’s going on here?

Code
soep_long |>
  filter(!category == "big5") |>
  inner_join(
    # your code here
  ) |> 
    View()
  • Note that filtering, renaming/selecting, and joining is a common workflow
Code
soep_long |>
  filter(category == "big5") |>
  select(Proc_SID, trait = label, item_name, year, p_value = value) |>
  inner_join(
    soep_long |>
    filter(category == "le") |>
    select(Proc_SID, le = label, year, le_value = value)
  )

5. left_join()

  • Or all rows present in the left (first) data frame, perhaps if it’s a subset of people with complete data
Code
soep_long |>
  filter(category == "big5") |>
  select(Proc_SID, trait = label, item_name, year, p_value = value) |>
  left_join(
    soep_long |>
    filter(category == "le") |>
    select(Proc_SID, le = label, year, le_value = value)
  )

6. right_join()

  • Or all rows present in the right (second) data frame, such as I do when I join a codebook with raw data
Code
soep_long |>
  filter(category == "big5") |>
  select(Proc_SID, trait = label, item_name, year, p_value = value) |>
  right_join(
    soep_long |>
    filter(category == "le") |>
    select(Proc_SID, le = label, year, le_value = value)
  )

Your Turn

In small groups, discuss what’s happening when you use full_join(), left_join(), right_join(), inner_join(), and anti_join() with the code below. Which is correct in this use case?

Code
soep_long |>
  filter(category == "big5") |>
  select(Proc_SID, trait = label, item_name, year, p_value = value) |>
  [x]_join(
    soep_long |>
    filter(category == "le") |>
    select(Proc_SID, le = label, year, le_value = value)
  )

dplyr: split-apply-combine

Bringing it all together: Split-Apply-Combine

  • Much of the power of dplyr functions lies in the split-apply-combine method

  • A given set of data is:

    • split into smaller chunks
    • then a function or series of functions are applied to each chunk
    • and then the chunks are combined back together

3. group_by()

  • The group_by() function is the “split” of the method
  • It implicitly breaks the data set into chunks by whatever bare (unquoted) column(s)/variable(s) are supplied as arguments.

4. mutate()

  • mutate() is one of your “apply” functions
  • When you use mutate(), the resulting data frame will have the same number of rows you started with
  • You are directly mutating the existing data frame, either modifying existing columns or creating new ones

5. summarize() / summarise()

  • summarize() is one of your “apply” functions (see the sketch below)
  • The resulting data frame will have one row per group (i.e., per unique combination of the grouping variables)
  • The number of groups is 1 for ungrouped data frames, so summarize() on ungrouped data returns a single row
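  • A minimal split-apply-combine sketch on a toy data frame (mutate() keeps every row; summarize() returns one row per group):
Code
toy <- tibble(
  id    = c(1, 1, 2, 2)
  , value = c(2, 4, 6, 8)
)

toy |>
  group_by(id) |>                         # split
  mutate(centered = value - mean(value))  # apply: still 4 rows

toy |>
  group_by(id) |>                         # split
  summarize(mean_value = mean(value)) |>  # apply: one row per id (2 rows)
  ungroup()                               # combine back into an ungrouped tibble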

Exercise 1

  • Remember when I said that long format data are easier to clean. Let’s do that now.

Question 1:

  • Let’s start with the Big Five data:
  1. filter() out only Big Five rows
  2. mutate() each observation so that values less than one are changed to NA
  3. Remove any missing values using filter() or drop_na()
  4. Group (split) the data so that you have a “group” for each person x trait x year combination
  5. summarize() the values to get a composite score for each Big Five trait for each person in each year:

Solution


Code
soep_big5 <- soep_long |>
  filter(category == "big5") |>
  mutate(value = mapvalues(value, seq(-8,0), rep(NA, 9))) |>
  drop_na(value) |>
  group_by(Proc_SID, label, year) |>
  summarize(value = mean(value)) |>
  ungroup()
soep_big5

Question 2:

Now let’s take care of the life event data:
  1. filter() out only life event rows
  2. mutate() each observation so that
    • -2 = 0
    • 1 = 1
    • everything else is NA
  3. Remove any missing values using filter() or drop_na()
  4. Group (split) the data so that you have a “group” for each person x event combination
  5. summarize() the values to get a sum score for each event for each person across all years:

Solution

Code
soep_le <- soep_long |>
  filter(category == "le") |>
  mutate(value = mapvalues(value, seq(-8,1), c(rep(NA, 6), 0, NA, NA, 1))) |>
  drop_na(value) |>
  group_by(Proc_SID, label) |>
  summarize(value = sum(value)) |>
  ungroup()
soep_le

Question 3:

Just for practice, now make your Big Five data frame wide, leaving the time variable (year) long

Solution

Code
soep_big5 |>
  pivot_wider(
    names_from = "label"
    , values_from = "value"
  )

Question 4:

  • Now, let’s join the data frames back together.
  • Which join function do you think is most appropriate?
  • Hint: You will need to rename the label and value columns to reflect the category of the data

Solution

Code
soep_clean <- soep_big5 |>
  rename(trait = label, p_value = value) |>
  inner_join(
    soep_le |>
      rename(le = label, le_value = value)
  )
soep_clean

Functions

Functions

How to approach writing functions? (broad recipe)

  1. Experiment with performing the task outside of a function
    • Experiment with performing task with different sets of inputs
    • Often you must revise this code when an approach that worked outside a function does not work within one
  2. Write the function
  3. Test the function
    • Try to “break” it
  4. Continual improvement: as you use the function, keep refining it, going back and forth between steps 1-3

Basics of writing functions

Three components of a function:

  1. Function name
    • Define a function using function() and give it a name using the assignment operator <-
  2. Function arguments (sometimes called “inputs”)
    • Inputs that the function takes; they go inside the parentheses of function()
      • Can be vectors, data frames, logical statements, strings, etc.
    • In the hypothetical skeleton sketched after this list, the function takes three inputs arg1, arg2, and arg3, but we could just as well have written:
      • function(x, y, z) or function(Larry, Curly, Moe)
    • In the “function call,” you specify values to assign to these function arguments
  3. Function body
    • What the function does to the inputs
    • Function body goes inside the pair of curly brackets ({}) that follows function()
    • The hypothetical function below doesn’t do anything, but your function can return a value (covered in a later section)
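  • Putting the three components together, a minimal hypothetical skeleton looks like this (my_function and arg1, arg2, arg3 are placeholder names, not part of the exercises below):
Code
my_function <- function(arg1, arg2, arg3){ # 1. name and 2. arguments
  # 3. function body: what the function does to the inputs goes here
}

# the "function call": assign values to the arguments
my_function(arg1 = 1, arg2 = "a", arg3 = TRUE)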

Exercise 2

Some common tasks when working with survey data:

  • Identify number of observations with NA values for a specific variable
  • Identify number of observations with negative values for a specific variable
  • Replace negative values with NA for a specific variable

num_negative() function

Task: Write a function called num_negative()

  • Write a function that counts the number of observations with negative values for a specific variable
  • Apply this function to variables from dataframe df (created below)
  • Adapted from Ben Skinner’s Programming 1 R Workshop
Code
# Sample dataframe `df` that contains some negative values
df

Steps:

Recommended steps:

  • Perform task outside of function
    • HINT: sum(data_frame_name$var_name<0)
  • Write function
  • Apply/test function on variables

Step 1: Perform task outside of function

Code
names(df) # identify variable names
[1] "id"     "age"    "sibage" "parage"
Code
df$age # print observations for a variable
  [1]  17  15 -97  13 -97  12 -99 -97  16  16 -98  20 -99  20  11  20  12  17
 [19]  19  17 -97 -99  12  13  11  15  20  14 -99  11  20 -98  11 -98  12  16
 [37]  12  18  12  19  12 -97  20  17  11  19  19  12 -98  11  15  18  15 -98
 [55]  15  19 -97  13 -98  16  13  12  16  19 -99  19 -98  13 -97  20  15  19
 [73]  15  12  18 -99  18 -98 -98 -98 -97  12  14  19 -97  11  20  18  14 -99
 [91]  15  20 -97  14  14  19  18  17  20  15
Code
#BaseR
sum(df$age<0) # count number of obs w/ negative values for variable "age"
[1] 27

Step 2: Write function

Code
num_negative <- function(x){
  sum(x < 0)
}

Step 3: Apply function

Code
num_negative(df$age)
[1] 27
Code
num_negative(df$sibage)
[1] 22

Exercise 3:

In survey data, negative values often refer to reason for missing values:

  • E.g., -8 refers to “didn’t take survey”
  • E.g., -7 refers to “took survey, but didn’t answer this question”

num_missing() function

Task: Write a function called num_missing()

  • Write a function that counts the number of missing observations for a variable and allows you to specify which values are associated with missing for that variable. This function will take two arguments:
    • x: The variable (e.g., df$sibage)
    • miss_vals: Vector of values you want to associate with “missing” for that variable
      • Values to associate with missing for df$age: -97,-98,-99
      • Values to associate with missing for df$sibage: -97,-98,-99
      • Values to associate with missing for df$parage: -4,-7,-8

Steps

Recommended steps:

  • Perform task outside of function
    • HINT: sum(data_frame_name$var_name %in% c(-4,-5))
  • Write function
  • Apply/test function on variables

Step 1: Perform task outside of function

Code
sum(df$age %in% c(-97,-98,-99))
[1] 27

Step 2: Write function

Code
num_missing <- function(x, miss_vals){

  sum(x %in% miss_vals)
}

Step 3: Apply function

Code
num_missing(df$age,c(-97,-98,-99))
[1] 27
Code
num_missing(df$sibage,c(-97,-98,-99))
[1] 22
Code
num_missing(df$parage,c(-4,-7,-8))
[1] 17

purrr

purrr::map()

  • map() functions are the tidyverse alternative to for loops and chaotic lists with deep nesting structures
  • map() functions, unlike the base R *apply() functions, can take any number of inputs, which mimics nested for loops
  • map() functions can return any output type, including heterogeneous outputs (at least if you return them as a list)

Inputs

  • You control the number of inputs with the following functions (see the sketch after this list):
    • map(): one input, arguments are map(.x, .f)
    • map2(): two inputs, arguments are map2(.x, .y, .f)
    • pmap(): any number of inputs, arguments are pmap(.l, .f)
      • Note the .l because this means we have to wrap inputs in a list()
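  • A minimal sketch of the three input patterns with toy inputs:
Code
map(1:3, \(x) x^2)                               # one input
map2(1:3, 4:6, \(x, y) x + y)                    # two parallel inputs
pmap(list(1:2, 3:4, 5:6), \(a, b, c) a + b + c)  # any number of inputs, wrapped in list()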

Outputs

  • You can also control the output of purrr::map():
    • map(): outputs a list
    • map_chr(): outputs a character vector
    • map_dbl(): outputs a numeric vector
    • map_lgl(): outputs a logical vector
    • map_int(): outputs an integer vector
    • map_vec(): outputs essentially any type of vector
  • Note that if the function fails for any one input, the whole call fails and nothing is returned (see the sketch below)
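  • The same toy call with different suffixes returns different types (a sketch):
Code
map(1:3, \(x) x^2)        # list of length 3
map_dbl(1:3, \(x) x^2)    # numeric vector: 1 4 9
map_lgl(1:3, \(x) x > 1)  # logical vector: FALSE TRUE TRUE
map_chr(1:3, \(x) paste0("item_", x)) # character vector: "item_1" "item_2" "item_3"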

Error handling

  • Having everything fail because one thing went wrong is really frustrating
  • There are a number of functions in purrr to help with that:
    • possibly(.f, otherwise): returns the value you supply to otherwise whenever a call to .f fails (see the sketch after this list)
    • safely(.f): returns a list containing the result (if successful) and the error (if unsuccessful)
    • Others: see documentation.
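  • A sketch of both helpers wrapping log(), which errors on non-numeric input:
Code
safe_log <- possibly(log, otherwise = NA_real_)
map_dbl(list(100, "oops", 10), safe_log) # the failing element becomes NA instead of stopping the whole map()

safely(log)("oops") # a list with $result (NULL here) and $error (the error condition)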

List columns

  • One of the easiest ways to work with purrr is using list columns in nested data frames
  • You can create a nested data frame using tidyr::nest() or tibble() (where one column is a list itself)
Code
soep_clean |>
  group_by(trait, year, le) |>
  nest() |>
  ungroup()
  • You can then call map() within a mutate() call to modify the list column or create new columns in your data frame (a sketch follows the next code block)
Code
tibble(
  x = c(1,2,3)
  , y = list(letters[1:5], letters[6:10], letters[11:15])
)
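  • For instance (a sketch continuing the toy tibble above), map() variants inside mutate() operate on each element of the list column:
Code
tibble(
  x = c(1,2,3)
  , y = list(letters[1:5], letters[6:10], letters[11:15])
) |>
  mutate(
    n_letters = map_int(y, length)      # one integer per list element
    , first   = map_chr(y, \(l) l[[1]]) # first letter of each element
  )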

Exercise 4

Question 1:

  • Create a data frame called soep_nested that contains a list column of the data split by trait and life event.

Solution

Code
soep_nested <- soep_clean |>
  group_by(trait, le) |>
  nest() |>
  ungroup()

Question 2:

  • Using mutate(), create a new list column called model that runs the following function
Code
lmer_fun <- function(d){
  d <- d |> 
    mutate(wave = year - 2005) |>
    group_by(Proc_SID) |>
    filter(n() > 1)
  m <- lmer(p_value ~ wave + le_value + le_value:wave + (1 + wave | Proc_SID), data = d)
  return(m)
}

Solution

Code
soep_nested <- soep_nested |>
  mutate(model = map(data, lmer_fun))
soep_nested

Question 3:

  • Use the following function to extract the number of people we estimated slopes for in this model. Output the result as an integer to a new column called npeople
Code
nslopes_fun <- function(m) summary(m)$ngrps

Solution

Code
soep_nested <- soep_nested |>
  mutate(npeople = map_int(model, nslopes_fun))
soep_nested

Question 4:

  • Use the tidy() function from the broom.mixed package to extract the coefficients from the model and their confidence intervals. Save it to the column “tidy”
  • Hints:
    • Use the argument conf.int = T to get the confidence intervals
    • Additional arguments to the .f function called in map() can simply be included as additional arguments (e.g., map(.x, .f, conf.int = T))

Solution

Code
soep_nested <- soep_nested |>
  mutate(tidy = map(model, broom.mixed::tidy, conf.int = T))

Question 5:

  • Let’s practice making a super simple table. Do the following:
  1. remove the data and model columns from the data frame
  2. unnest() the tidy column
  3. Keep only fixed effects (effect == "fixed")
  4. We only care about the interaction, so only keep the interaction term
  5. round the estimate, conf.low, and conf.high columns to 2 decimal places
  6. Keep the trait, le, estimate, conf.low, and conf.high columns only
  7. pivot_wider() by trait for estimate, conf.low, and conf.high

Solution

Code
soep_tab <- soep_nested |>
  select(-data, -model) |>
  unnest(tidy) |>
  filter(effect == "fixed" & grepl(":", term)) |>
  mutate(across(c(estimate, conf.low, conf.high), \(x) round(x, 2))) |>
  select(trait, le, estimate, conf.low, conf.high) |>
  pivot_wider(
    names_from = "trait"
    , names_glue = "{trait}_{.value}"
    , values_from = c(estimate, conf.low, conf.high)
  )
soep_tab

Question 6:

Use the function below to get model predictions

Code
pred_fun <- function(m){
  d <- m@frame |>
      select(-p_value) |>
      distinct()
  bind_cols(d, pred = predict(m, newdata = d))
}

Solution

Code
soep_nested <- soep_nested |>
  mutate(pred = map(model, pred_fun))
soep_nested

Question 7:

  • Now let’s get the model predictions ready for plotting. Do the following:
  1. remove the data, model, and tidy columns from the data frame
  2. unnest() the pred column
  3. group_by() trait and nest() + ungroup()
  4. save this as soep_pred

Solution

Code
soep_pred <- soep_nested |>
  select(-data, -model, -tidy) |>
  unnest(pred) |>
  group_by(trait) |>
  nest() |>
  ungroup()
soep_pred

Question 8:

  • Use the following function to create a new column p that contains spaghetti plots
  • Note that the function takes two inputs!
Code
spag_plot_fun <- function(d, trait){
  set.seed(6)
  d |>
    group_by(le) |>
    nest() |>
    mutate(data = map(data, ~filter(., Proc_SID %in% sample(unique(.$Proc_SID), 100)))) |>
    unnest(data) |>
    ungroup() |>
    mutate(le_value = ifelse(le_value > 1, 1, le_value)) |>
    ggplot(aes(x = wave, y = pred)) + 
      geom_line(aes(group = Proc_SID, color = factor(le_value)), alpha = .3) + 
      geom_smooth(method = "lm", se = F, color = "darkblue") + 
      scale_color_manual(values = c("grey", "blue"), labels = c("No Event", "Event")) + 
      labs(x = "Wave", y = "Predicted Trait Levels", color = "Life Event", title = trait) + 
      facet_wrap(~le) + 
      theme_classic() + 
      theme(legend.position = c(.7, .1))
}

Solution

Code
soep_pred <- soep_pred |>
  mutate(p = map2(data, trait, spag_plot_fun))

soep_pred$p[[1]]

Wrap-Up

Wrap-Up

  • Today’s goal was to review the coding concepts we’ve used so far and ask you to apply them using a series of guided examples
  • The biggest takeaway I wanted you to have is chaining: how you can string tidyverse functions together to accomplish several goals at once
  • We cleaned, composited, and ran 50 models across thousands of people, including predictions and tables, in less than 100 lines of code. Doing the models, tidy(), and predict() steps one at a time would have been 150 lines of code and introduced huge opportunities for errors!

Appendix

Full Code

Data

Code
# load the codebook
(codebook <- read_csv("week6-codebook.csv") |>
    mutate(old_name = str_to_lower(old_name)))

old.names <- codebook$old_name # get old column names
new.names <- codebook$new_name # get new column names
soep <- read_csv("week6-data.csv") |>
    select(all_of(old.names))

Pivot Long

Code
soep_long <- soep |>
  setNames(new.names) |>
  pivot_longer(
    cols = c(-starts_with("Proc"), -starts_with("Dem"))
    , names_to = c("category",  "label",    "item_name",    "year")
    , names_pattern = "(.*)_(.*)_(.*)_(.*)"
    , values_to = "value"
    , values_drop_na = T
  ) |> mutate(year = as.numeric(year))
soep_long

Recode and Composite

Code
soep_big5 <- soep_long |>
  filter(category == "big5") |>
  mutate(value = mapvalues(value, seq(-8,0), rep(NA, 9))) |>
  drop_na(value) |>
  group_by(Proc_SID, label, year) |>
  summarize(value = mean(value)) |>
  ungroup()

soep_le <- soep_long |>
  filter(category == "le") |>
  mutate(value = mapvalues(value, seq(-8,1), c(rep(NA, 6), 0, NA, NA, 1))) |>
  drop_na(value) |>
  group_by(Proc_SID, label) |>
  summarize(value = sum(value)) |>
  ungroup()

soep_clean <- soep_big5 |>
  rename(trait = label, p_value = value) |>
  inner_join(
    soep_le |>
      rename(le = label, le_value = value)
  )

Models

Code
soep_nested <- soep_clean |>
  group_by(trait, le) |>
  nest() |>
  ungroup()

lmer_fun <- function(d){
  d <- d |> 
    mutate(wave = year - 2005) |>
    group_by(Proc_SID) |>
    filter(n() > 1)
  m <- lmer(p_value ~ wave + le_value + le_value:wave + (1 + wave | Proc_SID), data = d)
  return(m)
}

soep_nested <- soep_nested |>
  mutate(model = map(data, lmer_fun))
soep_nested

Results

Tables

Code
nslopes_fun <- function(m) summary(m)$ngrps

soep_nested <- soep_nested |>
  mutate(npeople = map_int(model, nslopes_fun))
soep_nested

soep_nested <- soep_nested |>
  mutate(tidy = map(model, broom.mixed::tidy, conf.int = T))

soep_tab <- soep_nested |>
  select(-data, -model) |>
  unnest(tidy) |>
  filter(effect == "fixed" & grepl(":", term)) |>
  mutate(across(c(estimate, conf.low, conf.high), \(x) round(x, 2))) |>
  select(trait, le, estimate, conf.low, conf.high) |>
  pivot_wider(
    names_from = "trait"
    , names_glue = "{trait}_{.value}"
    , values_from = c(estimate, conf.low, conf.high)
  )
soep_tab

Model Predictions

Code
pred_fun <- function(m){
  d <- m@frame |>
      select(-p_value) |>
      distinct()
  bind_cols(d, pred = predict(m, newdata = d))
}

soep_nested <- soep_nested |>
  mutate(pred = map(model, pred_fun))
soep_nested

soep_pred <- soep_nested |>
  select(-data, -model, -tidy) |>
  unnest(pred) |>
  group_by(trait) |>
  nest() |>
  ungroup()
soep_pred

spag_plot_fun <- function(d, trait){
  set.seed(6)
  d |>
    group_by(le) |>
    nest() |>
    mutate(data = map(data, ~filter(., Proc_SID %in% sample(unique(.$Proc_SID), 100)))) |>
    unnest(data) |>
    ungroup() |>
    mutate(le_value = ifelse(le_value > 1, 1, le_value)) |>
    ggplot(aes(x = wave, y = pred)) + 
      geom_line(aes(group = Proc_SID, color = factor(le_value)), alpha = .3) + 
      geom_smooth(method = "lm", se = F, color = "darkblue") + 
      scale_color_manual(values = c("grey", "blue"), labels = c("No Event", "Event")) + 
      labs(x = "Wave", y = "Predicted Trait Levels", color = "Life Event", title = trait) + 
      facet_wrap(~le) + 
      theme_classic() + 
      theme(legend.position = c(.7, .1))
}

soep_pred <- soep_pred |>
  mutate(p = map2(data, trait, spag_plot_fun))

soep_pred$p[[1]]