Intro to dplyr

Practical Computing and Data Science Tools

Announcements

  • Midterm 1 were released yesterday.
  • Lab 5 grades coming soon.

Agenda

  • dplyr verbs

Data Wrangling with dplyr

filter()

  • Subsets rows of data based on logical conditions related to one or more columns in the dataset.

Data

  • Consider our favorite toy dataset:
library(tidyverse)
pets <- tibble(
  names = c("Dude", "Pickle", "Kyle", "Nubs"),
  ages = c(6, 5, 3, 11),
  is_dog = c(TRUE, FALSE, FALSE, FALSE)
)
pets
# A tibble: 4 × 3
  names   ages is_dog
  <chr>  <dbl> <lgl> 
1 Dude       6 TRUE  
2 Pickle     5 FALSE 
3 Kyle       3 FALSE 
4 Nubs      11 FALSE 

filter() ages greater than 5

filter(pets, ages > 5)
# A tibble: 2 × 3
  names  ages is_dog
  <chr> <dbl> <lgl> 
1 Dude      6 TRUE  
2 Nubs     11 FALSE 
  • Notice the pattern: filter(dataset, logical statement)
  • The first argument in filter(), and the other dplyr function we will learn, is the dataset we’d like to operate on.

filter() dogs with ages greater than 5

filter(pets, ages > 5, is_dog)
# A tibble: 1 × 3
  names  ages is_dog
  <chr> <dbl> <lgl> 
1 Dude      6 TRUE  

filter() dogs with ages greater than 5

filter(pets, ages > 5, is_dog)
# A tibble: 1 × 3
  names  ages is_dog
  <chr> <dbl> <lgl> 
1 Dude      6 TRUE  

We can equivalently write:

filter(pets, ages > 5 & is_dog)
# A tibble: 1 × 3
  names  ages is_dog
  <chr> <dbl> <lgl> 
1 Dude      6 TRUE  
  • In filter() the comma between logical statements is the same as &.

filter() for pets named Pickle

filter(pets, names == "Pickle")
# A tibble: 1 × 3
  names   ages is_dog
  <chr>  <dbl> <lgl> 
1 Pickle     5 FALSE 

filter() for pets named Pickle or Dude

filter(pets, names %in% c("Pickle", "Dude"))
# A tibble: 2 × 3
  names   ages is_dog
  <chr>  <dbl> <lgl> 
1 Dude       6 TRUE  
2 Pickle     5 FALSE 

Your turn!

filter() for non-dogs

# A tibble: 4 × 3
  names   ages is_dog
  <chr>  <dbl> <lgl> 
1 Dude       6 TRUE  
2 Pickle     5 FALSE 
3 Kyle       3 FALSE 
4 Nubs      11 FALSE 
01:00

filter() for non-dogs

filter(pets, !is_dog)
# A tibble: 3 × 3
  names   ages is_dog
  <chr>  <dbl> <lgl> 
1 Pickle     5 FALSE 
2 Kyle       3 FALSE 
3 Nubs      11 FALSE 

filter() for non-dogs

filter(pets, !is_dog)
# A tibble: 3 × 3
  names   ages is_dog
  <chr>  <dbl> <lgl> 
1 Pickle     5 FALSE 
2 Kyle       3 FALSE 
3 Nubs      11 FALSE 
# or
filter(pets, is_dog == FALSE)
# A tibble: 3 × 3
  names   ages is_dog
  <chr>  <dbl> <lgl> 
1 Pickle     5 FALSE 
2 Kyle       3 FALSE 
3 Nubs      11 FALSE 

filter() for non-dogs with age greater than 10

# A tibble: 4 × 3
  names   ages is_dog
  <chr>  <dbl> <lgl> 
1 Dude       6 TRUE  
2 Pickle     5 FALSE 
3 Kyle       3 FALSE 
4 Nubs      11 FALSE 
01:00

filter() for non-dogs with age greater than 10

filter(pets, !is_dog & ages > 10)
# A tibble: 1 × 3
  names  ages is_dog
  <chr> <dbl> <lgl> 
1 Nubs     11 FALSE 

filter() for non-dogs with age greater than 10

filter(pets, !is_dog & ages > 10)
# A tibble: 1 × 3
  names  ages is_dog
  <chr> <dbl> <lgl> 
1 Nubs     11 FALSE 
# or 
filter(pets, !is_dog, ages > 10)
# A tibble: 1 × 3
  names  ages is_dog
  <chr> <dbl> <lgl> 
1 Nubs     11 FALSE 

filter() for non-dogs with age greater than 10

filter(pets, !is_dog & ages > 10)
# A tibble: 1 × 3
  names  ages is_dog
  <chr> <dbl> <lgl> 
1 Nubs     11 FALSE 
# or 
filter(pets, !is_dog, ages > 10)
# A tibble: 1 × 3
  names  ages is_dog
  <chr> <dbl> <lgl> 
1 Nubs     11 FALSE 
# or 
filter(pets, is_dog == FALSE, ages > 10) # etc...
# A tibble: 1 × 3
  names  ages is_dog
  <chr> <dbl> <lgl> 
1 Nubs     11 FALSE 

slice()

slice()

There are five variations of slice() that we will learn in this course:

  • slice(): for getting rows or sets of rows by index,
  • slice_head(): for getting the first few (n) rows,
  • slice_tail(): for getting the last few (n) rows,
  • slice_min(): for getting the row with the minimum value of some column,
  • slice_max(): for getting the row with the maximum value of some column.

slice()

Get the first and third rows of pets

slice(pets, 1, 3)
# A tibble: 2 × 3
  names  ages is_dog
  <chr> <dbl> <lgl> 
1 Dude      6 TRUE  
2 Kyle      3 FALSE 

Get the first and third rows of pets

slice(pets, 1, 3)
# A tibble: 2 × 3
  names  ages is_dog
  <chr> <dbl> <lgl> 
1 Dude      6 TRUE  
2 Kyle      3 FALSE 
# or, equivalently
slice(pets, c(1,3))
# A tibble: 2 × 3
  names  ages is_dog
  <chr> <dbl> <lgl> 
1 Dude      6 TRUE  
2 Kyle      3 FALSE 

Get the first two rows of pets

slice(pets, 1, 2)
# A tibble: 2 × 3
  names   ages is_dog
  <chr>  <dbl> <lgl> 
1 Dude       6 TRUE  
2 Pickle     5 FALSE 

Get the first two rows of pets

slice(pets, 1, 2)
# A tibble: 2 × 3
  names   ages is_dog
  <chr>  <dbl> <lgl> 
1 Dude       6 TRUE  
2 Pickle     5 FALSE 
# or, equivalently
slice_head(pets, n = 2)
# A tibble: 2 × 3
  names   ages is_dog
  <chr>  <dbl> <lgl> 
1 Dude       6 TRUE  
2 Pickle     5 FALSE 

Get the last row of pets

slice_tail(pets)
# A tibble: 1 × 3
  names  ages is_dog
  <chr> <dbl> <lgl> 
1 Nubs     11 FALSE 
  • Note that in slice_tail() (and slice_head()) the default for n is 1.

Get the last row of pets

slice_tail(pets)
# A tibble: 1 × 3
  names  ages is_dog
  <chr> <dbl> <lgl> 
1 Nubs     11 FALSE 
  • Note that in slice_tail() (and slice_head()) the default for n is 1.
slice_tail(pets, n = 1) # exactly the same as above
# A tibble: 1 × 3
  names  ages is_dog
  <chr> <dbl> <lgl> 
1 Nubs     11 FALSE 

Get the pet who is the youngest

slice_min(pets, ages)
# A tibble: 1 × 3
  names  ages is_dog
  <chr> <dbl> <lgl> 
1 Kyle      3 FALSE 

Get the pet who is the oldest

slice_max(pets, ages)
# A tibble: 1 × 3
  names  ages is_dog
  <chr> <dbl> <lgl> 
1 Nubs     11 FALSE 

Get the two youngest pets

slice_min(pets, ages, n = 2)
# A tibble: 2 × 3
  names   ages is_dog
  <chr>  <dbl> <lgl> 
1 Kyle       3 FALSE 
2 Pickle     5 FALSE 

arrange()

arrange()

  • arrange() orders rows based on values in one or more columns.
  • By default, arrange() puts smallest values at the top.
  • If you’d like largest values at the top, we can use the desc() function (named for descending order)
  • When we arrange() by more than one column, the following columns “break ties” in the prior columns.

arrange()

Consider a modifed pets dataset

more_pets <- tibble(
  names = c("Dude", "Pickle", "Kyle", "Nubs", "Marvin", "Figaro", "Slim"),
  ages = c(6, 5, 3, 11, 11, 3, 6),
  meals_per_day = c(2, 3, 3, 3, 1, 2, 2),
  is_dog = c(TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE)  
)
more_pets
# A tibble: 7 × 4
  names   ages meals_per_day is_dog
  <chr>  <dbl>         <dbl> <lgl> 
1 Dude       6             2 TRUE  
2 Pickle     5             3 FALSE 
3 Kyle       3             3 FALSE 
4 Nubs      11             3 FALSE 
5 Marvin    11             1 FALSE 
6 Figaro     3             2 FALSE 
7 Slim       6             2 TRUE  

arrange() more_pets by age

arrange(more_pets, ages)
# A tibble: 7 × 4
  names   ages meals_per_day is_dog
  <chr>  <dbl>         <dbl> <lgl> 
1 Kyle       3             3 FALSE 
2 Figaro     3             2 FALSE 
3 Pickle     5             3 FALSE 
4 Dude       6             2 TRUE  
5 Slim       6             2 TRUE  
6 Nubs      11             3 FALSE 
7 Marvin    11             1 FALSE 

arrange() more_pets by age, greatest first

arrange(more_pets, desc(ages))
# A tibble: 7 × 4
  names   ages meals_per_day is_dog
  <chr>  <dbl>         <dbl> <lgl> 
1 Nubs      11             3 FALSE 
2 Marvin    11             1 FALSE 
3 Dude       6             2 TRUE  
4 Slim       6             2 TRUE  
5 Pickle     5             3 FALSE 
6 Kyle       3             3 FALSE 
7 Figaro     3             2 FALSE 

arrange() more_pets by age and then meals per day

arrange(more_pets, ages, meals_per_day)
# A tibble: 7 × 4
  names   ages meals_per_day is_dog
  <chr>  <dbl>         <dbl> <lgl> 
1 Figaro     3             2 FALSE 
2 Kyle       3             3 FALSE 
3 Pickle     5             3 FALSE 
4 Dude       6             2 TRUE  
5 Slim       6             2 TRUE  
6 Marvin    11             1 FALSE 
7 Nubs      11             3 FALSE 

arrange() more_pets by is_dog

arrange(more_pets, is_dog)
# A tibble: 7 × 4
  names   ages meals_per_day is_dog
  <chr>  <dbl>         <dbl> <lgl> 
1 Pickle     5             3 FALSE 
2 Kyle       3             3 FALSE 
3 Nubs      11             3 FALSE 
4 Marvin    11             1 FALSE 
5 Figaro     3             2 FALSE 
6 Dude       6             2 TRUE  
7 Slim       6             2 TRUE  
  • When arrange()ing with a logical column, FALSE is before TRUE
  • We can switch this with desc()

arrange() more_pets by is_dog, TRUE first

arrange(more_pets, desc(is_dog))
# A tibble: 7 × 4
  names   ages meals_per_day is_dog
  <chr>  <dbl>         <dbl> <lgl> 
1 Dude       6             2 TRUE  
2 Slim       6             2 TRUE  
3 Pickle     5             3 FALSE 
4 Kyle       3             3 FALSE 
5 Nubs      11             3 FALSE 
6 Marvin    11             1 FALSE 
7 Figaro     3             2 FALSE 

arrange() more_pets by names

arrange(more_pets, names)
# A tibble: 7 × 4
  names   ages meals_per_day is_dog
  <chr>  <dbl>         <dbl> <lgl> 
1 Dude       6             2 TRUE  
2 Figaro     3             2 FALSE 
3 Kyle       3             3 FALSE 
4 Marvin    11             1 FALSE 
5 Nubs      11             3 FALSE 
6 Pickle     5             3 FALSE 
7 Slim       6             2 TRUE  
  • What happened here?
  • Alphabetical order!
  • What will happen if we use desc() on names?

arrange() more_pets by names, descending

arrange(more_pets, desc(names))
# A tibble: 7 × 4
  names   ages meals_per_day is_dog
  <chr>  <dbl>         <dbl> <lgl> 
1 Slim       6             2 TRUE  
2 Pickle     5             3 FALSE 
3 Nubs      11             3 FALSE 
4 Marvin    11             1 FALSE 
5 Kyle       3             3 FALSE 
6 Figaro     3             2 FALSE 
7 Dude       6             2 TRUE  

select()

select()

  • select() selects a subset of columns in the dataset.

select() names and ages from more_pets

select(more_pets, names, ages)
# A tibble: 7 × 2
  names   ages
  <chr>  <dbl>
1 Dude       6
2 Pickle     5
3 Kyle       3
4 Nubs      11
5 Marvin    11
6 Figaro     3
7 Slim       6

remove meals_per_day from more_pets

select(more_pets, -meals_per_day, -ages)
# A tibble: 7 × 2
  names  is_dog
  <chr>  <lgl> 
1 Dude   TRUE  
2 Pickle FALSE 
3 Kyle   FALSE 
4 Nubs   FALSE 
5 Marvin FALSE 
6 Figaro FALSE 
7 Slim   TRUE  
  • NOTE: The book uses ! instead of - in select(). We prefer - and will not use !.

select() columns that end with “s”

select(more_pets, ends_with("s"))
# A tibble: 7 × 2
  names   ages
  <chr>  <dbl>
1 Dude       6
2 Pickle     5
3 Kyle       3
4 Nubs      11
5 Marvin    11
6 Figaro     3
7 Slim       6
  • NOTE: The ends_with() function can be quite useful when selecting columns.

rename()

rename()

  • rename() renames columns that are already in your dataset.

rename() meals_per_day to meals in the more_pets tibble

rename(more_pets, meals = meals_per_day)
# A tibble: 7 × 4
  names   ages meals is_dog
  <chr>  <dbl> <dbl> <lgl> 
1 Dude       6     2 TRUE  
2 Pickle     5     3 FALSE 
3 Kyle       3     3 FALSE 
4 Nubs      11     3 FALSE 
5 Marvin    11     1 FALSE 
6 Figaro     3     2 FALSE 
7 Slim       6     2 TRUE  
  • Note the syntax of new_name = old_name to rename the old_name column with new_name.

rename() meals_per_day to meals and ages to age in the more_pets tibble

rename(more_pets, 
       meals = meals_per_day,
       age = ages)
# A tibble: 7 × 4
  names    age meals is_dog
  <chr>  <dbl> <dbl> <lgl> 
1 Dude       6     2 TRUE  
2 Pickle     5     3 FALSE 
3 Kyle       3     3 FALSE 
4 Nubs      11     3 FALSE 
5 Marvin    11     1 FALSE 
6 Figaro     3     2 FALSE 
7 Slim       6     2 TRUE  

rename_with()

  • rename_with() allows us to use a function to rename columns. For example, the toupper() function. This function takes a character vector and makes it upper case:
toupper("trees")
[1] "TREES"

rename_with()

  • We can use toupper() to rename all the columns in more_pets to upper case:
rename_with(more_pets, toupper)
# A tibble: 7 × 4
  NAMES   AGES MEALS_PER_DAY IS_DOG
  <chr>  <dbl>         <dbl> <lgl> 
1 Dude       6             2 TRUE  
2 Pickle     5             3 FALSE 
3 Kyle       3             3 FALSE 
4 Nubs      11             3 FALSE 
5 Marvin    11             1 FALSE 
6 Figaro     3             2 FALSE 
7 Slim       6             2 TRUE  

rename_with()

  • Rather than renaming all the columns to upper case, we choose which columns we’d like to change to upper case:
rename_with(more_pets, toupper, c(names, ages))
# A tibble: 7 × 4
  NAMES   AGES meals_per_day is_dog
  <chr>  <dbl>         <dbl> <lgl> 
1 Dude       6             2 TRUE  
2 Pickle     5             3 FALSE 
3 Kyle       3             3 FALSE 
4 Nubs      11             3 FALSE 
5 Marvin    11             1 FALSE 
6 Figaro     3             2 FALSE 
7 Slim       6             2 TRUE  
  • NOTE: Careful here, we need the c() function in this case to choose which columns to rename with toupper()

Next time

  • More dplyr verbs
  • Writing a sentence with pipes