Data Tidying

Author

Matt Crump

Published

April 19, 2023

look at untidy data

untidy_data <- read_csv("data/untidy_data.csv", show_col_types = FALSE)

knitr::kable(untidy_data)
customer_id itemsprice_2018 itemsprice_2019 itemsprice_2020 totalprice_2018 totalprice_2019 totalprice_2020
1 2 (3.91) 8 (4.72) 10 (5.59) 7.82 37.76 55.90
2 1 (3.91) 6 (4.72) 1 (5.59) 3.91 28.32 5.59
3 4 (3.91) 5 (4.72) 5 (5.59) 15.64 23.60 27.95
4 10 (3.91) 1 (4.72) 3 (5.59) 39.10 4.72 16.77
5 3 (3.91) 9 (4.72) 8 (5.59) 11.73 42.48 44.72

look at tidy data

tidy_data <- read_csv("data/tidy_data.csv", show_col_types = FALSE)

knitr::kable(tidy_data)
customer_id year items price_per_item totalprice
1 2018 2 3.91 7.82
1 2019 8 4.72 37.76
1 2020 10 5.59 55.90
2 2018 1 3.91 3.91
2 2019 6 4.72 28.32
2 2020 1 5.59 5.59
3 2018 4 3.91 15.64
3 2019 5 4.72 23.60
3 2020 5 5.59 27.95
4 2018 10 3.91 39.10
4 2019 1 4.72 4.72
4 2020 3 5.59 16.77
5 2018 3 3.91 11.73
5 2019 9 4.72 42.48
5 2020 8 5.59 44.72

convert messy to tidy

tidy_data_2 <- read_csv(file = "data/untidy_data.csv",
                      show_col_types = FALSE) %>%
  pivot_longer(
    cols = itemsprice_2018:totalprice_2020,
    names_to = c("category", "year"),
    names_sep = "_", 
    values_to = "value", 
    names_transform = list(year = as.integer),
    values_transform = list(value = as.character) 
  ) %>%
  pivot_wider(
    id_cols = c(customer_id, year),
    names_from = category,
    values_from = value
  ) %>%
  separate(
    col = itemsprice,
    into = c("items", "price_per_item"),
    sep = " ", 
    remove = TRUE, 
    convert = TRUE
  ) %>%
  mutate(
    price_per_item = stringr::str_replace_all(
      string = price_per_item, 
      pattern = "[()]", 
      replacement = ""
    )
  ) %>%
  type_convert(
    trim_ws = TRUE
  )

── Column specification ────────────────────────────────────────────────────────
cols(
  price_per_item = col_double(),
  totalprice = col_double()
)