Week 8: data relations

Author

Matt Crump

Published

March 20, 2023

customers <- tibble(
  id = 1:5,
  city = c("Port Ellen", "Dufftown", NA, "Aberlour", "Tobermory"),
  postcode = c("PA42 7DU", "AB55 4DH", NA, "AB38 7RY", "PA75 6NR")
)

orders <- tibble(
  id = c(2, 3, 4, 4, 5, 5, 6, 6, 7),
  items = c(10, 18, 21, 23, 9, 11, 11, 12, 3)
)

what doers dplyr mutate do?

orders_B <- tibble(
  id = c(2, 3, 4, 4, 5, 5, 6, 6, 7),
  items = c(10, 18, 21, 23, 9, 11, 11, 12, 3)
)

# add a new column
orders_B %>%
  mutate(satisfaction = NA)
# A tibble: 9 × 3
     id items satisfaction
  <dbl> <dbl> <lgl>       
1     2    10 NA          
2     3    18 NA          
3     4    21 NA          
4     4    23 NA          
5     5     9 NA          
6     5    11 NA          
7     6    11 NA          
8     6    12 NA          
9     7     3 NA          
# add mutiplie new column
orders_B %>%
  mutate(satisfaction = NA,
         numbers = 1:9)
# A tibble: 9 × 4
     id items satisfaction numbers
  <dbl> <dbl> <lgl>          <int>
1     2    10 NA                 1
2     3    18 NA                 2
3     4    21 NA                 3
4     4    23 NA                 4
5     5     9 NA                 5
6     5    11 NA                 6
7     6    11 NA                 7
8     6    12 NA                 8
9     7     3 NA                 9
# assign back to tibble
orders_B <- orders_B %>%
              mutate(satisfaction = NA,
                     numbers = 1:9)


orders_B %>%
  mutate(numbers_as_strings = as.character(numbers))
# A tibble: 9 × 5
     id items satisfaction numbers numbers_as_strings
  <dbl> <dbl> <lgl>          <int> <chr>             
1     2    10 NA                 1 1                 
2     3    18 NA                 2 2                 
3     4    21 NA                 3 3                 
4     4    23 NA                 4 4                 
5     5     9 NA                 5 5                 
6     5    11 NA                 6 6                 
7     6    11 NA                 7 7                 
8     6    12 NA                 8 8                 
9     7     3 NA                 9 9                 
orders_B %>%
  mutate(items_minus_id = items - id)
# A tibble: 9 × 5
     id items satisfaction numbers items_minus_id
  <dbl> <dbl> <lgl>          <int>          <dbl>
1     2    10 NA                 1              8
2     3    18 NA                 2             15
3     4    21 NA                 3             17
4     4    23 NA                 4             19
5     5     9 NA                 5              4
6     5    11 NA                 6              6
7     6    11 NA                 7              5
8     6    12 NA                 8              6
9     7     3 NA                 9             -4

Dplyr filter

full_data <- full_join(customers, orders, by = "id")
Warning in full_join(customers, orders, by = "id"): Each row in `x` is expected to match at most 1 row in `y`.
ℹ Row 4 of `x` matches multiple rows.
ℹ If multiple matches are expected, set `multiple = "all"` to silence this
  warning.
full_data
# A tibble: 10 × 4
      id city       postcode items
   <dbl> <chr>      <chr>    <dbl>
 1     1 Port Ellen PA42 7DU    NA
 2     2 Dufftown   AB55 4DH    10
 3     3 <NA>       <NA>        18
 4     4 Aberlour   AB38 7RY    21
 5     4 Aberlour   AB38 7RY    23
 6     5 Tobermory  PA75 6NR     9
 7     5 Tobermory  PA75 6NR    11
 8     6 <NA>       <NA>        11
 9     6 <NA>       <NA>        12
10     7 <NA>       <NA>         3
full_data %>%
  filter(items > 10)
# A tibble: 6 × 4
     id city      postcode items
  <dbl> <chr>     <chr>    <dbl>
1     3 <NA>      <NA>        18
2     4 Aberlour  AB38 7RY    21
3     4 Aberlour  AB38 7RY    23
4     5 Tobermory PA75 6NR    11
5     6 <NA>      <NA>        11
6     6 <NA>      <NA>        12
full_data %>%
  filter(postcode == "AB38 7RY")
# A tibble: 2 × 4
     id city     postcode items
  <dbl> <chr>    <chr>    <dbl>
1     4 Aberlour AB38 7RY    21
2     4 Aberlour AB38 7RY    23
full_data %>%
  filter(postcode == "AB38 7RY" & items > 21)
# A tibble: 1 × 4
     id city     postcode items
  <dbl> <chr>    <chr>    <dbl>
1     4 Aberlour AB38 7RY    23
full_data %>%
  filter(postcode == "AB38 7RY",
         items > 21)
# A tibble: 1 × 4
     id city     postcode items
  <dbl> <chr>    <chr>    <dbl>
1     4 Aberlour AB38 7RY    23
full_data %>%
  filter(city != "Aberlour")
# A tibble: 4 × 4
     id city       postcode items
  <dbl> <chr>      <chr>    <dbl>
1     1 Port Ellen PA42 7DU    NA
2     2 Dufftown   AB55 4DH    10
3     5 Tobermory  PA75 6NR     9
4     5 Tobermory  PA75 6NR    11
full_data %>%
  filter( is.na(city) == FALSE,
          is.na(items) == FALSE)
# A tibble: 5 × 4
     id city      postcode items
  <dbl> <chr>     <chr>    <dbl>
1     2 Dufftown  AB55 4DH    10
2     4 Aberlour  AB38 7RY    21
3     4 Aberlour  AB38 7RY    23
4     5 Tobermory PA75 6NR     9
5     5 Tobermory PA75 6NR    11
full_data %>%
  filter( is.na(city) == TRUE | is.na(items) == TRUE)
# A tibble: 5 × 4
     id city       postcode items
  <dbl> <chr>      <chr>    <dbl>
1     1 Port Ellen PA42 7DU    NA
2     3 <NA>       <NA>        18
3     6 <NA>       <NA>        11
4     6 <NA>       <NA>        12
5     7 <NA>       <NA>         3