Week 11: Data wrangling

Author

Matt Crump

Published

April 26, 2023

library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(tidyverse)
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2
──
✔ ggplot2 3.4.1     ✔ purrr   1.0.1
✔ tibble  3.1.8     ✔ stringr 1.5.0
✔ tidyr   1.3.0     ✔ forcats 1.0.0
✔ readr   2.1.4     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
budget <- read_csv("data/budget.csv", show_col_types = FALSE)

Select

# select single column by name
product_dat <- budget %>% 
  select(product) 

# select single column by number
product_dat <- budget %>% select(2) 

# select single column by number
product_dat <- budget %>% select(2,3) 
#print columns 2 and 3
budget %>% select(2,3)
# A tibble: 8 × 2
  product sales_2019
  <chr>        <dbl>
1 widgets       2129
2 gadgets        723
3 widgets       1123
4 gadgets       2022
5 widgets       -728
6 gadgets       -423
7 widgets        633
8 gadgets       1204
product_dat <- budget %>% select(2) 
product_dat
# A tibble: 8 × 1
  product
  <chr>  
1 widgets
2 gadgets
3 widgets
4 gadgets
5 widgets
6 gadgets
7 widgets
8 gadgets
product_dat <- budget %>% select(2,3) 
knitr::kable(product_dat)
product sales_2019
widgets 2129
gadgets 723
widgets 1123
gadgets 2022
widgets -728
gadgets -423
widgets 633
gadgets 1204
( product_dat <- budget %>% select(2,3) )
# A tibble: 8 × 2
  product sales_2019
  <chr>        <dbl>
1 widgets       2129
2 gadgets        723
3 widgets       1123
4 gadgets       2022
5 widgets       -728
6 gadgets       -423
7 widgets        633
8 gadgets       1204

colon notation

# select columns individually
sales2019 <- budget %>% select(region, product, sales_2019)

# select columns with colon
sales2019 <- budget %>% select(region:sales_2019)
sales2019
# A tibble: 8 × 3
  region product sales_2019
  <chr>  <chr>        <dbl>
1 North  widgets       2129
2 North  gadgets        723
3 South  widgets       1123
4 South  gadgets       2022
5 East   widgets       -728
6 East   gadgets       -423
7 West   widgets        633
8 West   gadgets       1204
# select columns with colon
years <- budget %>% select(3:7)
years <- budget %>% select(sales_2019:satisfaction_2020)

UNSELECT

# de-select individual columns
sales <- budget %>% select(-expenses_2019, -expenses_2020)


# de-select a range of columns
sales <- budget %>% select(-(expenses_2019:expenses_2020))

FILTER

# select all rows where region equals North
budget %>% filter(region == "North")
# A tibble: 2 × 8
  region product sales_2019 sales_2020 expenses_2019 expenses_…¹ satis…² satis…³
  <chr>  <chr>        <dbl>      <dbl>         <dbl>       <dbl> <chr>   <chr>  
1 North  widgets       2129       -517           822        -897 high    very h…
2 North  gadgets        723         77          1037        1115 very h… very h…
# … with abbreviated variable names ¹​expenses_2020, ²​satisfaction_2019,
#   ³​satisfaction_2020
# select all rows where expenses_2020 were exactly equal to 200
budget %>% filter(expenses_2020 == 200)
# A tibble: 1 × 8
  region product sales_2019 sales_2020 expenses_2019 expenses_…¹ satis…² satis…³
  <chr>  <chr>        <dbl>      <dbl>         <dbl>       <dbl> <chr>   <chr>  
1 South  gadgets       2022       -945          -610         200 low     low    
# … with abbreviated variable names ¹​expenses_2020, ²​satisfaction_2019,
#   ³​satisfaction_2020
# select all rows where sales_2019 was more than 100
budget %>% filter(sales_2019 > 100)
# A tibble: 6 × 8
  region product sales_2019 sales_2020 expenses_2019 expenses_…¹ satis…² satis…³
  <chr>  <chr>        <dbl>      <dbl>         <dbl>       <dbl> <chr>   <chr>  
1 North  widgets       2129       -517           822        -897 high    very h…
2 North  gadgets        723         77          1037        1115 very h… very h…
3 South  widgets       1123      -1450          1004         672 high    neutral
4 South  gadgets       2022       -945          -610         200 low     low    
5 West   widgets        633        790           783        -315 neutral neutral
6 West   gadgets       1204        426           433        -136 low     low    
# … with abbreviated variable names ¹​expenses_2020, ²​satisfaction_2019,
#   ³​satisfaction_2020
# everything but the North
budget %>% filter(region != "North")
# A tibble: 6 × 8
  region product sales_2019 sales_2020 expenses_2019 expenses_…¹ satis…² satis…³
  <chr>  <chr>        <dbl>      <dbl>         <dbl>       <dbl> <chr>   <chr>  
1 South  widgets       1123      -1450          1004         672 high    neutral
2 South  gadgets       2022       -945          -610         200 low     low    
3 East   widgets       -728        -51          -801        -342 very l… very l…
4 East   gadgets       -423       -354            94        2036 neutral high   
5 West   widgets        633        790           783        -315 neutral neutral
6 West   gadgets       1204        426           433        -136 low     low    
# … with abbreviated variable names ¹​expenses_2020, ²​satisfaction_2019,
#   ³​satisfaction_2020
# regions and products with profit in both 2019 and 2020
profit_both <- budget %>% 
  filter(
    sales_2019 > expenses_2019,
    sales_2020 > expenses_2020
  )

# the same as above, using & instead of a comma
profit_both <- budget %>% 
  filter(
    sales_2019 > expenses_2019 &
    sales_2020 > expenses_2020
  )

# regions and products with profit in 2019 or 2020
profit_either <- budget %>% 
  filter(
    sales_2019 > expenses_2019 |
    sales_2020 > expenses_2020
  )

# 2020 profit greater than 1000
profit_1000 <- budget %>%
  filter(sales_2020 - expenses_2020 > 1000)

in

# retain any rows where region is north or south, and where product equals widget
budget %>%
  filter(region %in% c("North", "South"),
         product == "widgets")
# A tibble: 2 × 8
  region product sales_2019 sales_2020 expenses_2019 expenses_…¹ satis…² satis…³
  <chr>  <chr>        <dbl>      <dbl>         <dbl>       <dbl> <chr>   <chr>  
1 North  widgets       2129       -517           822        -897 high    very h…
2 South  widgets       1123      -1450          1004         672 high    neutral
# … with abbreviated variable names ¹​expenses_2020, ²​satisfaction_2019,
#   ³​satisfaction_2020
# retain any rows where the region is not east or west, and where the product does not equal gadgets
budget %>%
  filter(!region %in% c("East", "West"),
         product != "gadgets")
# A tibble: 2 × 8
  region product sales_2019 sales_2020 expenses_2019 expenses_…¹ satis…² satis…³
  <chr>  <chr>        <dbl>      <dbl>         <dbl>       <dbl> <chr>   <chr>  
1 North  widgets       2129       -517           822        -897 high    very h…
2 South  widgets       1123      -1450          1004         672 high    neutral
# … with abbreviated variable names ¹​expenses_2020, ²​satisfaction_2019,
#   ³​satisfaction_2020
a <- c(1,2,3,4,5)

6 %in% a
[1] FALSE
1 %in% a
[1] TRUE
if(1 %in% a == TRUE) {
  "yes"
}
[1] "yes"
if(6 %in% a == FALSE) {
  "yes"
}
[1] "yes"
letters
 [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q" "r" "s"
[20] "t" "u" "v" "w" "x" "y" "z"
LETTERS
 [1] "A" "B" "C" "D" "E" "F" "G" "H" "I" "J" "K" "L" "M" "N" "O" "P" "Q" "R" "S"
[20] "T" "U" "V" "W" "X" "Y" "Z"
which(letters %in% "g", arr.ind = TRUE)
[1] 7

arrange

# arranging the table 
# first by product in alphabetical order
# then by "region" in reverse alphabetical order
budget %>%
  arrange(product, desc(region))
# A tibble: 8 × 8
  region product sales_2019 sales_2020 expenses_2019 expenses_…¹ satis…² satis…³
  <chr>  <chr>        <dbl>      <dbl>         <dbl>       <dbl> <chr>   <chr>  
1 West   gadgets       1204        426           433        -136 low     low    
2 South  gadgets       2022       -945          -610         200 low     low    
3 North  gadgets        723         77          1037        1115 very h… very h…
4 East   gadgets       -423       -354            94        2036 neutral high   
5 West   widgets        633        790           783        -315 neutral neutral
6 South  widgets       1123      -1450          1004         672 high    neutral
7 North  widgets       2129       -517           822        -897 high    very h…
8 East   widgets       -728        -51          -801        -342 very l… very l…
# … with abbreviated variable names ¹​expenses_2020, ²​satisfaction_2019,
#   ³​satisfaction_2020
budget %>%
  arrange(product, region)
# A tibble: 8 × 8
  region product sales_2019 sales_2020 expenses_2019 expenses_…¹ satis…² satis…³
  <chr>  <chr>        <dbl>      <dbl>         <dbl>       <dbl> <chr>   <chr>  
1 East   gadgets       -423       -354            94        2036 neutral high   
2 North  gadgets        723         77          1037        1115 very h… very h…
3 South  gadgets       2022       -945          -610         200 low     low    
4 West   gadgets       1204        426           433        -136 low     low    
5 East   widgets       -728        -51          -801        -342 very l… very l…
6 North  widgets       2129       -517           822        -897 high    very h…
7 South  widgets       1123      -1450          1004         672 high    neutral
8 West   widgets        633        790           783        -315 neutral neutral
# … with abbreviated variable names ¹​expenses_2020, ²​satisfaction_2019,
#   ³​satisfaction_2020