1. Введение

tidyverse — это набор пакетов:

dplyr, для преобразования данных
ggplot2, для визуализации
tidyr, для формата tidy data
readr, для чтения файлов в R
purrr, для функционального программирования
tibble, для работы с тибблами — современным вариантом датафрейма

Полезно также знать:

readxl, для чтения .xls и .xlsx
jsonlite, для работы с JSON
rvest, для веб-скреппинга
lubridate, для работы с временем
tidytext, для работы с текстами и корпусами
broom, для перевода статистических моделей в tidy-формат

library(tidyverse)

2. tibble

# Print the same data as a base data.frame (first rows) and as a tibble.
head(iris)
as_tibble(iris)

# Build a tibble from scratch. tibble() replaces the deprecated data_frame().
tibble(id = 1:12,
       letters = month.name)

3. Чтение файлов: readr, readxl

# readr: read delimited text files (here, straight from a URL).
library(readr)
# Comma-separated file.
df <- read_csv("https://goo.gl/v7nvho")
head(df)

# Tab-separated file.
df <- read_tsv("https://goo.gl/33r2Ut")
head(df)

# The same file read with read_delim() and an explicit tab delimiter.
df <- read_delim("https://goo.gl/33r2Ut", delim = "\t")
head(df)

# readxl: read Excel workbooks.
library(readxl)
# Path to an example workbook, as returned by readxl_example().
xlsx_example <- readxl_example("datasets.xlsx")
# No sheet is named here, so read_excel() uses its default sheet.
df <- read_excel(xlsx_example)
head(df)

# List the sheet names available in the workbook.
excel_sheets(xlsx_example)

## [1] "iris"     "mtcars"   "chickwts" "quakes"

# Read one specific sheet by name.
df <- read_excel(xlsx_example, sheet = "mtcars")
head(df)

# Drop the scratch data frame from the workspace.
rm(df)

4. `dplyr`

# Read the dataset into `homo` from a URL and print it.
homo <- read_csv("http://goo.gl/Zjr9aF")
homo

Most of the examples in this presentation are based on data from Hau 2007. The experiment consisted of a perception and judgment test aimed at measuring the correlation between acoustic cues and perceived sexual orientation. Naïve Cantonese speakers were asked to listen to the Cantonese speech samples collected in the experiment and to judge whether the speakers were gay or heterosexual. There are 14 speakers and the following parameters:

[s] duration (s.duration.ms)
vowel duration (vowel.duration.ms)
mean fundamental frequency (F0) (average.f0.Hz)
fundamental frequency range (f0.range.Hz)
percentage of homosexual impression (perceived.as.homo)
percentage of heterosexual impression (perceived.as.hetero)
speaker's orientation (orientation)
speaker's age (age)

4.1 `dplyr::filter()`

Which speakers are older than 28 and have an [s] duration shorter than 60 ms?

# Keep only the rows that satisfy both conditions: comma-separated
# conditions in filter() are combined with a logical AND.
homo %>%
  filter(age > 28, s.duration.ms < 60)

%>% — конвейер (pipe) отправляет результат работы одной функции в другую.

# Nested calls are evaluated from the inside out: sin, abs, sqrt, then sort.
sort(sqrt(abs(sin(1:22))), decreasing = TRUE)

##  [1] 0.9999951 0.9952926 0.9946649 0.9805088 0.9792468 0.9554817 0.9535709
##  [8] 0.9173173 0.9146888 0.8699440 0.8665952 0.8105471 0.8064043 0.7375779
## [15] 0.7325114 0.6482029 0.6419646 0.5365662 0.5285977 0.3871398 0.3756594
## [22] 0.0940814

# The same computation written as a pipeline, read from top to bottom.
1:22 %>% 
  sin() %>% 
  abs() %>% 
  sqrt() %>% 
  sort(., decreasing = TRUE) # why is the dot here?

##  [1] 0.9999951 0.9952926 0.9946649 0.9805088 0.9792468 0.9554817 0.9535709
##  [8] 0.9173173 0.9146888 0.8699440 0.8665952 0.8105471 0.8064043 0.7375779
## [15] 0.7325114 0.6482029 0.6419646 0.5365662 0.5285977 0.3871398 0.3756594
## [22] 0.0940814

Конвейеры в tidyverse пришли из пакета magrittr. Иногда они работают некорректно с функциями не из tidyverse.

4.2 `dplyr::slice()`

# Pick rows 3 through 7 by position.
homo %>%
  slice(3:7)

4.3 `dplyr::select()`

# Select columns by position (8th through 10th).
homo %>%
  select(8:10)

# Select a contiguous range of columns by name.
homo %>%
  select(speaker:average.f0.Hz)

# Drop a single column.
homo %>%
  select(-speaker)

# Drop several columns at once.
homo %>%
  select(-c(speaker, perceived.as.hetero, perceived.as.homo, perceived.as.homo.percent))

# Keep only the listed columns, in the given order.
homo %>%
  select(speaker, age, s.duration.ms)

4.4 `dplyr::arrange()`

# Sort by orientation, then by age in descending order within each
# orientation.
homo %>%
  arrange(orientation, desc(age))

4.5 `dplyr::distinct()`

# Unique values of a single column.
homo %>%
  distinct(orientation)

# Unique combinations of a column and a computed logical expression.
homo %>%
  distinct(orientation, age > 20)

4.6 `dplyr::mutate()`

# Add two derived columns: the average f0 minus, and plus, half of the
# f0 range.
homo %>%
  mutate(
    f0.mn = average.f0.Hz - f0.range.Hz / 2,
    f0.mx = average.f0.Hz + f0.range.Hz / 2
  )

4.7 `dplyr::group_by(...) %>% summarise(...)`

# Collapse the whole table into one row; the summary expressions are left
# unnamed here.
homo %>%
  summarise(min(age), mean(s.duration.ms))

# One summary row per orientation, with an explicit result column name.
homo %>%
  group_by(orientation) %>% 
  summarise(my_mean = mean(s.duration.ms))

# The same grouped summary without naming the result column.
homo %>%
  group_by(orientation) %>% 
  summarise(mean(s.duration.ms))

# The same grouped summary with a more descriptive column name.
homo %>%
  group_by(orientation) %>% 
  summarise(mean_by_orientation = mean(s.duration.ms))

Если нужно посчитать количество вхождений, то можно использовать функцию n() в summarise() или же функцию count():

# n() gives the number of rows in each group. Grouping by an expression
# (age > 20) groups on the logical result of that expression.
homo %>% 
  group_by(orientation, age > 20) %>% 
  summarise(my_mean = mean(s.duration.ms), n_observations = n())

# count() produces the per-group row counts in a single call.
homo %>% 
  count(orientation, age > 20)

4.8 `dplyr::.._join()`

# Two small tables that share the key column `countries`.
# tibble() replaces the deprecated data_frame().
languages <- tibble(
  languages = c("Selkup", "French", "Chukchi", "Kashubian"),
  countries = c("Russia", "France", "Russia", "Poland"),
  # ISO 639-3 codes; Kashubian is "csb" ("pol" is the code for Polish).
  iso = c("sel", "fra", "ckt", "csb")
)
languages

country_population <- tibble(
  countries = c("Russia", "Poland", "Finland"),
  population_mln = c(143, 38, 5)
)
country_population

# The key is spelled out with `by`, so the joins do not silently depend on
# whichever column names the two tables happen to share.

# Only rows whose country appears in both tables.
inner_join(languages, country_population, by = "countries")

# Every row of `languages`; population is NA where no country matches.
left_join(languages, country_population, by = "countries")

# Every row of `country_population`; language columns are NA where no
# language matches.
right_join(languages, country_population, by = "countries")

# Languages whose country has no entry in `country_population`.
anti_join(languages, country_population, by = "countries")

# Countries that have no entry in `languages`.
anti_join(country_population, languages, by = "countries")

# Every row from both tables, matched where possible.
full_join(country_population, languages, by = "countries")

Существует достаточно забавный трюк, который позволяет использовать .._join() вместе с group_by() и summarise():

# Per-group summary: mean [s] duration and group size for every combination
# of orientation and the age > 20 flag.
homo %>% 
  group_by(orientation, age > 20) %>% 
  summarise(my_mean = mean(s.duration.ms), n_observations = n())

# To attach the group summaries back to the individual rows, the age flag
# must exist as a real column in both tables. Otherwise the only shared
# column is `orientation`, and the join pairs every speaker with the summary
# rows of every age group of that orientation (duplicated rows, wrong means).
homo_by_age <- homo %>% 
  mutate(older_than_20 = age > 20)

homo_by_age %>% 
  group_by(orientation, older_than_20) %>% 
  summarise(my_mean = mean(s.duration.ms), n_observations = n()) %>% 
  # Drop the remaining grouping before joining.
  ungroup() %>% 
  left_join(homo_by_age, by = c("orientation", "older_than_20"))

5. tidyr package

Short format

# Short (wide) table: one row per consonant class, one column per position.
df.short <- data.frame(
  consonant  = c("stops", "fricatives", "affricates", "nasals"),
  initial    = c(123, 87, 73, 7),
  intervocal = c(57, 77, 82, 78),
  final      = c(30, 69, 12, 104)
)
df.short

Long format

Short format → Long format: tidyr::gather()

# Re-create the short (wide) table: one row per consonant class, one column
# per position.
df.short <- data.frame(
  consonant  = c("stops", "fricatives", "affricates", "nasals"),
  initial    = c(123, 87, 73, 7),
  intervocal = c(57, 77, 82, 78),
  final      = c(30, 69, 12, 104)
)
df.short

# Reshape to long format: the columns initial..final are stacked into a key
# column `position` and a value column `number`.
# The assignment is written with a leading `<-` so the target name is
# visible at the start of the statement.
df.long <- df.short %>% 
  gather(position, number, initial:final)
df.long

Long format → Short format: tidyr::spread()

# Reshape back to short (wide) format: each value of `position` becomes a
# column filled from `number`.
# The assignment is written with a leading `<-` so the target name is
# visible at the start of the statement.
df.short <- df.long %>% 
  spread(position, number)
df.short

tidyverse

Г. Мороз

1. Введение

2. tibble

3. Чтение файлов: readr, readxl

4. dplyr

4.1 dplyr::filter()

4.2 dplyr::slice()

4.3 dplyr::select()

4.4 dplyr::arrange()

4.5 dplyr::distinct()

4.6 dplyr::mutate()

4.7 dplyr::group_by(...) %>% summarise(...)

4.8 dplyr::.._join()