Week 5 data import

Author

Matt Crump

Published

February 27, 2023

Load libraries

library(tidyverse)     # includes readr & tibble
library(rio)           # for almost any data import/export
library(haven)         # for SPSS, Stata,and SAS files
library(readxl)        # for Excel files
library(googlesheets4) # for Google Sheets

Options for viewing data

data(cars) # make data from R package available in global environment
#View(cars) # view in editor pane
#print(cars) # print to console
#cars # print to console

# show columns as rows
glimpse(cars)
Rows: 50
Columns: 2
$ speed <dbl> 4, 4, 7, 7, 8, 9, 10, 10, 10, 11, 11, 12, 12, 12, 12, 13, 13, 13…
$ dist  <dbl> 2, 10, 4, 22, 16, 10, 18, 26, 34, 17, 28, 14, 20, 24, 28, 26, 34…
#show first 10 lines of table
head(cars)
  speed dist
1     4    2
2     4   10
3     7    4
4     7   22
5     8   16
6     9   10
# print nicely to web
knitr::kable(head(cars))
speed dist
4 2
4 10
7 4
7 22
8 16
9 10

Produce summaries

summary(cars)
     speed           dist       
 Min.   : 4.0   Min.   :  2.00  
 1st Qu.:12.0   1st Qu.: 26.00  
 Median :15.0   Median : 36.00  
 Mean   :15.4   Mean   : 42.98  
 3rd Qu.:19.0   3rd Qu.: 56.00  
 Max.   :25.0   Max.   :120.00  
library(summarytools)
#view(dfSummary(starwars))

Importing data

Data was downloaded from link in textbook.

library(rio)

demo_tsv <- import(file = "data/demo.tsv")

glimpse(demo_tsv)
Rows: 6
Columns: 6
$ character <chr> "A", "B", "C", "D", "E", "F"
$ factor    <chr> "high", "low", "med", "high", "low", "med"
$ integer   <int> 1, 2, 3, 4, 5, 6
$ double    <dbl> 1.5, 2.5, 3.5, 4.5, 5.5, 6.5
$ logical   <lgl> TRUE, TRUE, FALSE, FALSE, NA, TRUE
$ date      <IDate> 2022-04-04, 2022-04-03, 2022-04-02, 2022-04-01, 2022-03-31, …
demo_tsv$factor <- as.factor(demo_tsv$factor)

glimpse(demo_tsv)
Rows: 6
Columns: 6
$ character <chr> "A", "B", "C", "D", "E", "F"
$ factor    <fct> high, low, med, high, low, med
$ integer   <int> 1, 2, 3, 4, 5, 6
$ double    <dbl> 1.5, 2.5, 3.5, 4.5, 5.5, 6.5
$ logical   <lgl> TRUE, TRUE, FALSE, FALSE, NA, TRUE
$ date      <IDate> 2022-04-04, 2022-04-03, 2022-04-02, 2022-04-01, 2022-03-31, …
demo_tsv$
demo_tsv$factor
NULL
demo_tsv  <- import("data/demo.tsv")  # tab-separated values
demo_csv  <- import("data/demo.csv")  # comma-separated values
demo_xls  <- import("data/demo.xlsx") # Excel format
demo_sav  <- import("data/demo.sav")  # SPSS format
gs4_deauth() # skip authorisation for public data

demo_gs4  <- googlesheets4::read_sheet(
  ss = "16dkq0YL0J7fyAwT1pdgj1bNNrheckAU_2-DKuuM6aGI"
)

gs4_get(ss = "16dkq0YL0J7fyAwT1pdgj1bNNrheckAU_2-DKuuM6aGI")
Spreadsheet name: demo
              ID: 16dkq0YL0J7fyAwT1pdgj1bNNrheckAU_2-DKuuM6aGI
          Locale: en_US
       Time zone: Europe/London
     # of sheets: 1

(Sheet name): (Nominal extent in rows x columns)
      Sheet1: 1000 x 26
demo <- readr::read_csv("data/demo.csv")

spec(demo)
cols(
  character = col_character(),
  factor = col_character(),
  integer = col_double(),
  double = col_double(),
  logical = col_logical(),
  date = col_date(format = "")
)
corrected_cols <- cols(
  character = col_character(),
  factor = col_factor(levels = c("low", "med", "high")),
  integer = col_integer(),
  double = col_double(),
  logical = col_logical(),
  date = col_date(format = "%Y-%m-%d")
)

demo <- readr::read_csv("data/demo.csv", col_types = corrected_cols)

glimpse(demo)
Rows: 6
Columns: 6
$ character <chr> "A", "B", "C", "D", "E", "F"
$ factor    <fct> high, low, med, high, low, med
$ integer   <int> 1, 2, 3, 4, 5, 6
$ double    <dbl> 1.5, 2.5, 3.5, 4.5, 5.5, 6.5
$ logical   <lgl> TRUE, TRUE, FALSE, FALSE, NA, TRUE
$ date      <date> 2022-04-04, 2022-04-03, 2022-04-02, 2022-04-01, 2022-03-31, …

Write your own data

avatar <- tibble(
  name = c("Katara", "Toph", "Sokka"),
  bends = c("water", "earth", NA),
  friendly = TRUE,
  numbers = c(1,2,3),
  more_numbers = 1:3,
  even_more = seq(1,3,1)
)

# print it
avatar
# A tibble: 3 × 6
  name   bends friendly numbers more_numbers even_more
  <chr>  <chr> <lgl>      <dbl>        <int>     <dbl>
1 Katara water TRUE           1            1         1
2 Toph   earth TRUE           2            2         2
3 Sokka  <NA>  TRUE           3            3         3
avatar_by_row <- tribble(
  ~name,    ~bends,  ~friendly, ~numbers,
  "Katara", "water", TRUE, 1L,
  "Toph",   "earth", TRUE, 2L,
  "Sokka",  NA,      TRUE, 3L
)

avatar_by_row
# A tibble: 3 × 4
  name   bends friendly numbers
  <chr>  <chr> <lgl>      <int>
1 Katara water TRUE           1
2 Toph   earth TRUE           2
3 Sokka  <NA>  TRUE           3

Export data

export(avatar, "avatar.csv")