tidyverse — это набор пакетов:
Полезно также знать:
# Attach the tidyverse packages used throughout this file.
library(tidyverse)

# iris is a base data.frame; head() prints its first rows.
head(iris)

# The same data converted to a tibble.
as_tibble(iris)

# Build a tibble from scratch. tibble() replaces the deprecated data_frame().
tibble(
  id = 1:12,
  letters = month.name
)
library(readr)

# Comma-separated file read straight from a URL.
df <- read_csv(file = "https://goo.gl/v7nvho")
head(df)

# Tab-separated file read with the dedicated reader.
df <- read_tsv(file = "https://goo.gl/33r2Ut")
head(df)

# The same URL read with the general reader and an explicit tab delimiter.
df <- read_delim(file = "https://goo.gl/33r2Ut", delim = "\t")
head(df)
library(readxl)

# Path to the example workbook "datasets.xlsx" obtained from readxl_example().
xlsx_example <- readxl_example("datasets.xlsx")

# No sheet is named here, so read_excel() falls back to its default sheet.
df <- read_excel(path = xlsx_example)
head(df)

# List the sheet names contained in the workbook.
excel_sheets(path = xlsx_example)
## [1] "iris" "mtcars" "chickwts" "quakes"
# Read one specific sheet, selected by name.
df <- read_excel(path = xlsx_example, sheet = "mtcars")
head(df)

# Remove the temporary object from the workspace.
rm(df)
dplyr
# Load the data set that the dplyr examples operate on, then print it.
# Fetched over HTTPS, like the other short links in this file.
homo <- read_csv("https://goo.gl/Zjr9aF")
homo
Most of the examples in this presentation are based on Hau (2007). The experiment consisted of a perception and judgment test aimed at measuring the correlation between acoustic cues and perceived sexual orientation. Naïve Cantonese speakers were asked to listen to the Cantonese speech samples collected in the experiment and to judge whether the speakers were gay or heterosexual. There are 14 speakers and the following parameters:
dplyr::filter()
How many speakers are older than 28 and have s.duration.ms below 60?
# Keep only rows where both conditions hold:
# age above 28 and s.duration.ms below 60.
homo %>%
  filter(age > 28 & s.duration.ms < 60)
%>% — конвейер (pipe): он отправляет результат работы одной функции в другую.
# Nested-call form: evaluation runs from the innermost call, sin(), outwards.
sort(
  sqrt(
    abs(
      sin(1:22)
    )
  ),
  decreasing = TRUE
)
## [1] 0.9999951 0.9952926 0.9946649 0.9805088 0.9792468 0.9554817 0.9535709
## [8] 0.9173173 0.9146888 0.8699440 0.8665952 0.8105471 0.8064043 0.7375779
## [15] 0.7325114 0.6482029 0.6419646 0.5365662 0.5285977 0.3871398 0.3756594
## [22] 0.0940814
# Pipeline form of the same computation: each step receives the previous result.
1:22 %>%
  sin() %>%
  abs() %>%
  sqrt() %>%
  sort(., decreasing = TRUE) # why is the dot here?
## [1] 0.9999951 0.9952926 0.9946649 0.9805088 0.9792468 0.9554817 0.9535709
## [8] 0.9173173 0.9146888 0.8699440 0.8665952 0.8105471 0.8064043 0.7375779
## [15] 0.7325114 0.6482029 0.6419646 0.5365662 0.5285977 0.3871398 0.3756594
## [22] 0.0940814
Конвейеры в tidyverse пришли из пакета magrittr. Иногда они работают некорректно с функциями не из tidyverse.
dplyr::slice()
# Pick rows 3 through 7 by position.
homo %>%
  slice(3:7)
dplyr::select()
# Columns by position.
homo %>%
  select(8:10)

# A contiguous range of columns by name.
homo %>%
  select(speaker:average.f0.Hz)

# Everything except one column.
homo %>%
  select(-speaker)

# Everything except several columns.
homo %>%
  select(
    -c(
      speaker,
      perceived.as.hetero,
      perceived.as.homo,
      perceived.as.homo.percent
    )
  )

# An explicit list of columns, in the order given.
homo %>%
  select(speaker, age, s.duration.ms)
dplyr::arrange()
# Sort by orientation first, then by age in descending order.
homo %>%
  arrange(orientation, desc(age))
dplyr::distinct()
# Unique values of a single column.
homo %>%
  distinct(orientation)

# Unique combinations of a column and a computed logical expression.
homo %>%
  distinct(orientation, age > 20)
dplyr::mutate()
# Add two derived columns: the average f0 minus and plus half of the f0 range.
homo %>%
  mutate(
    f0.mn = average.f0.Hz - f0.range.Hz / 2,
    f0.mx = average.f0.Hz + f0.range.Hz / 2
  )
dplyr::group_by(...) %>% summarise(...)
# Summaries over the whole table; unnamed results get their column names
# from the expressions themselves.
homo %>%
  summarise(min(age), mean(s.duration.ms))

# One summary row per orientation, with an explicit column name.
homo %>%
  group_by(orientation) %>%
  summarise(my_mean = mean(s.duration.ms))

# The same grouped summary without naming the result column.
homo %>%
  group_by(orientation) %>%
  summarise(mean(s.duration.ms))

# The same grouped summary under a more descriptive column name.
homo %>%
  group_by(orientation) %>%
  summarise(mean_by_orientation = mean(s.duration.ms))
Если нужно посчитать количество вхождений, то можно использовать функцию n() в summarise() или же функцию count():
# n() inside summarise() gives the number of rows in each group.
homo %>%
  group_by(orientation, age > 20) %>%
  summarise(
    my_mean = mean(s.duration.ms),
    n_observations = n()
  )

# count() tallies rows for each combination of the given expressions.
homo %>%
  count(orientation, age > 20)
dplyr::.._join()
# Left-hand table for the join examples: one row per language.
# tibble() replaces the deprecated data_frame().
languages <- tibble(
  languages = c("Selkup", "French", "Chukchi", "Kashubian"),
  countries = c("Russia", "France", "Russia", "Poland"),
  iso = c("sel", "fra", "ckt", "pol")
)
languages
# Right-hand table for the join examples: one row per country.
# tibble() replaces the deprecated data_frame().
country_population <- tibble(
  countries = c("Russia", "Poland", "Finland"),
  population_mln = c(143, 38, 5)
)
country_population
# Both tables share the column `countries`. Naming it in `by` makes the key
# explicit instead of relying on automatic detection of common columns.

# Rows whose key is present in both tables.
inner_join(languages, country_population, by = "countries")

# All rows of the first table, with matches from the second where available.
left_join(languages, country_population, by = "countries")

# All rows of the second table, with matches from the first where available.
right_join(languages, country_population, by = "countries")

# Rows of the first table that have no match in the second.
anti_join(languages, country_population, by = "countries")
anti_join(country_population, languages, by = "countries")

# All rows from both tables.
full_join(country_population, languages, by = "countries")
Существует достаточно забавный трюк, который позволяет использовать .._join() вместе с group_by() и summarise():
# Grouped summary on its own.
homo %>%
  group_by(orientation, age > 20) %>%
  summarise(
    my_mean = mean(s.duration.ms),
    n_observations = n()
  )

# The same summary joined back onto the original rows, so that every row
# carries the summary values of its group.
homo %>%
  group_by(orientation, age > 20) %>%
  summarise(
    my_mean = mean(s.duration.ms),
    n_observations = n()
  ) %>%
  left_join(homo)
# Wide-format table: one row per consonant class and one numeric column
# per position (initial, intervocal, final).
df.short <- data.frame(
  consonant  = c("stops", "fricatives", "affricates", "nasals"),
  initial    = c(123, 87, 73, 7),
  intervocal = c(57, 77, 82, 78),
  final      = c(30, 69, 12, 104)
)
df.short
tidyr::gather()
# Wide-format input for the reshaping example: one row per consonant class,
# one numeric column per position.
df.short <- data.frame(
  consonant  = c("stops", "fricatives", "affricates", "nasals"),
  initial    = c(123, 87, 73, 7),
  intervocal = c(57, 77, 82, 78),
  final      = c(30, 69, 12, 104)
)
df.short
# Reshape wide -> long: the columns initial..final are stacked into a key
# column `position` and a value column `number`.
# NOTE: gather() is superseded in tidyr; pivot_longer() is its current equivalent.
df.long <- df.short %>%
  gather(position, number, initial:final)
df.long
tidyr::spread()
# Reshape long -> wide: each distinct value of `position` becomes a column
# filled with the matching values of `number`.
# NOTE: spread() is superseded in tidyr; pivot_wider() is its current equivalent.
df.short <- df.long %>%
  spread(position, number)
df.short