1. Введение

tidyverse — это набор пакетов:

  • dplyr, для преобразования данных
  • ggplot2, для визуализации
  • tidyr, для формата tidy data
  • readr, для чтения файлов в R
  • purrr, для функционального программирования
  • tibble, для работы с тибблами — современным вариантом датафрейма

Полезно также знать:

  • readxl, для чтения .xls и .xlsx
  • jsonlite, для работы с JSON
  • rvest, для веб-скрейпинга
  • lubridate, для работы с временем
  • tidytext, для работы с текстами и корпусами
  • broom, для перевода статистических моделей в tidy-формат
library(tidyverse)

2. tibble

# Preview the first rows of the built-in iris data frame.
head(iris)
# Convert the same data frame to a tibble to compare how the two print.
as_tibble(iris)
# Build a tibble from scratch. tibble() is used instead of data_frame(),
# which is deprecated in the tibble package.
tibble(id = 1:12,
       letters = month.name)

3. Чтение файлов: readr, readxl

# readr: read delimited text files from a URL.
library(readr)
df <- read_csv(file = "https://goo.gl/v7nvho")
head(x = df)
df <- read_tsv(file = "https://goo.gl/33r2Ut")
head(x = df)
# read_delim() reads the same tab-separated file with an explicit delimiter.
df <- read_delim(file = "https://goo.gl/33r2Ut", delim = "\t")
head(x = df)
# readxl: read Excel workbooks, using an example file shipped with the package.
library(readxl)
xlsx_example <- readxl_example(path = "datasets.xlsx")
df <- read_excel(path = xlsx_example)
head(x = df)
# List the sheets in the workbook, then read one of them by name.
excel_sheets(path = xlsx_example)
## [1] "iris"     "mtcars"   "chickwts" "quakes"
df <- read_excel(path = xlsx_example, sheet = "mtcars")
head(x = df)
# Remove the temporary object so it does not linger in the workspace.
rm(list = "df")

4. dplyr

# Download the data set into `homo` and print it.
homo <- read_csv(file = "http://goo.gl/Zjr9aF")
print(homo)

Most of the examples in this presentation are based on Hau (2007). The experiment consisted of a perception and judgment test aimed at measuring the correlation between acoustic cues and perceived sexual orientation. Naïve Cantonese speakers were asked to listen to the Cantonese speech samples collected in the experiment and judge whether the speakers were gay or heterosexual. There are 14 speakers and the following parameters:

  • [s] duration (s.duration.ms)
  • vowel duration (vowel.duration.ms)
  • fundamental frequencies mean (F0) (average.f0.Hz)
  • fundamental frequencies range (f0.range.Hz)
  • percentage of homosexual impression (perceived.as.homo)
  • percentage of heterosexual impression (perceived.as.hetero)
  • speaker's orientation (orientation)
  • speaker's age (age)

4.1 dplyr::filter()

Which speakers are older than 28 and have an [s] duration shorter than 60 ms?

# Keep only the rows for which both conditions hold.
homo %>%
  filter(age > 28 & s.duration.ms < 60)

%>% — конвейер (pipe), который отправляет результат работы одной функции в другую.

# Nested form: the calls are evaluated from the inside out
# (sin, then abs, then sqrt, then sort).
sort(sqrt(abs(sin(1:22))), decreasing = TRUE)
##  [1] 0.9999951 0.9952926 0.9946649 0.9805088 0.9792468 0.9554817 0.9535709
##  [8] 0.9173173 0.9146888 0.8699440 0.8665952 0.8105471 0.8064043 0.7375779
## [15] 0.7325114 0.6482029 0.6419646 0.5365662 0.5285977 0.3871398 0.3756594
## [22] 0.0940814
# Piped form of the same computation: each step receives the result of the
# previous one. In the last step the dot is the %>% placeholder that marks
# where the piped value is inserted.
1:22 %>% 
  sin() %>% 
  abs() %>% 
  sqrt() %>% 
  sort(., decreasing = TRUE) # why is the dot here?
##  [1] 0.9999951 0.9952926 0.9946649 0.9805088 0.9792468 0.9554817 0.9535709
##  [8] 0.9173173 0.9146888 0.8699440 0.8665952 0.8105471 0.8064043 0.7375779
## [15] 0.7325114 0.6482029 0.6419646 0.5365662 0.5285977 0.3871398 0.3756594
## [22] 0.0940814

Конвейеры в tidyverse пришли из пакета magrittr. Иногда они работают некорректно с функциями не из tidyverse.

4.2 dplyr::slice()

# Pick rows by position: rows 3 through 7.
homo %>% slice(3:7)

4.3 dplyr::select()

# Select columns by position.
homo %>% select(8:10)
# Select a contiguous range of columns by name.
homo %>% select(speaker:average.f0.Hz)
# Drop a single column.
homo %>% select(-speaker)
# Drop several columns at once.
homo %>%
  select(
    -speaker,
    -perceived.as.hetero,
    -perceived.as.homo,
    -perceived.as.homo.percent
  )
# Keep only the listed columns, in the given order.
homo %>% select(speaker, age, s.duration.ms)

4.4 dplyr::arrange()

# Sort by orientation, then by age in descending order within each orientation.
homo %>% arrange(orientation, desc(age))

4.5 dplyr::distinct()

# Unique values of a single column.
homo %>% distinct(orientation)
# Unique combinations of a column and a computed logical expression.
homo %>% distinct(orientation, age > 20)

4.6 dplyr::mutate()

# Add two derived columns: the mean F0 minus / plus half of the F0 range.
homo %>%
  mutate(
    f0.mn = average.f0.Hz - f0.range.Hz / 2,
    f0.mx = average.f0.Hz + f0.range.Hz / 2
  )

4.7 dplyr::group_by(...) %>% summarise(...)

# Summaries over the whole table, without grouping.
homo %>% summarise(min(age), mean(s.duration.ms))
# Mean [s] duration per orientation, stored in a column named my_mean.
homo %>% group_by(orientation) %>% summarise(my_mean = mean(s.duration.ms))
# The same grouped mean with the summary column left unnamed.
homo %>% group_by(orientation) %>% summarise(mean(s.duration.ms))
# The same grouped mean again under a more descriptive column name.
homo %>%
  group_by(orientation) %>%
  summarise(
    mean_by_orientation = mean(s.duration.ms)
  )

Если нужно посчитать количество вхождений, то можно использовать функцию n() в summarise() или же функцию count():

# Mean [s] duration and number of rows for each combination of
# orientation and the age > 20 flag.
homo %>%
  group_by(orientation, age > 20) %>%
  summarise(
    my_mean = mean(s.duration.ms),
    n_observations = n()
  )
# count() tallies the rows for the same combinations in a single call.
homo %>% count(orientation, age > 20)

4.8 dplyr::.._join()

# Two small lookup tables that share the `countries` column.
# tibble() is used instead of data_frame(), which is deprecated.
languages <- tibble(
  languages = c("Selkup", "French", "Chukchi", "Kashubian"),
  countries = c("Russia", "France", "Russia", "Poland"),
  iso = c("sel", "fra", "ckt", "pol")
)
languages
country_population <- tibble(
  countries = c("Russia", "Poland", "Finland"),
  population_mln = c(143, 38, 5)
)
country_population
# The join key is spelled out with `by` so that the result does not depend on
# whichever column names the two tables happen to have in common.
# Rows with a match in both tables.
inner_join(languages, country_population, by = "countries")
# All rows of the first table, with population filled in where available.
left_join(languages, country_population, by = "countries")
# All rows of the second table, with language data filled in where available.
right_join(languages, country_population, by = "countries")
# Rows of the first table that have no match in the second.
anti_join(languages, country_population, by = "countries")
anti_join(country_population, languages, by = "countries")
# All rows of both tables.
full_join(country_population, languages, by = "countries")

Существует достаточно забавный трюк, который позволяет использовать .._join() вместе с group_by() и summarise():

# Per-group summary: mean [s] duration and number of rows for each
# combination of orientation and the age > 20 flag.
homo %>%
  group_by(orientation, age > 20) %>%
  summarise(my_mean = mean(s.duration.ms), n_observations = n())
# Attach the per-group summary back to the individual rows.
# The age flag is stored as a real column (older_than_20) on both sides and
# the join uses both grouping keys. Joining the summary to the raw `homo`
# table can only match on `orientation`, because `homo` has no `age > 20`
# column, so every speaker would be paired with the summary rows of all age
# groups of the same orientation.
homo %>%
  mutate(older_than_20 = age > 20) %>%
  group_by(orientation, older_than_20) %>%
  summarise(my_mean = mean(s.duration.ms), n_observations = n()) %>%
  ungroup() %>%
  left_join(
    mutate(homo, older_than_20 = age > 20),
    by = c("orientation", "older_than_20")
  )

5. tidyr package

  • Short format
# Short (wide) layout: one row per consonant class, one column per position.
df.short <- data.frame(
  consonant = c("stops", "fricatives", "affricates", "nasals"),
  initial = c(123, 87, 73, 7),
  intervocal = c(57, 77, 82, 78),
  final = c(30, 69, 12, 104)
)
print(df.short)
  • Long format
  • Short format → Long format: tidyr::gather()
# Rebuild the short (wide) table so this example is self-contained.
df.short <- data.frame(
  consonant = c("stops", "fricatives", "affricates", "nasals"),
  initial = c(123, 87, 73, 7),
  intervocal = c(57, 77, 82, 78),
  final = c(30, 69, 12, 104)
)
print(df.short)
# Stack the columns initial..final into key/value pairs:
# `position` receives the former column name, `number` the cell value.
df.long <- df.short %>%
  gather(key = position, value = number, initial:final)
print(df.long)
  • Long format → Short format: tidyr::spread()
# Spread the key/value pairs back out: one column per value of `position`,
# filled from `number`. The result overwrites df.short.
df.short <- df.long %>%
  spread(key = position, value = number)
print(df.short)