tidyverse — это набор пакетов:
Полезно также знать:
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1 ✔ purrr 0.2.4
## ✔ tibble 1.4.2 ✔ dplyr 0.7.4
## ✔ tidyr 0.8.0 ✔ stringr 1.3.0
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ───────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Preview the first rows of the `iris` data frame.
head(iris)

# Convert the same data frame to a tibble.
as_tibble(iris)

# Build a tibble from scratch. `tibble()` is used instead of `data_frame()`,
# which is deprecated in current tibble releases.
tibble(
  id = 1:12,
  letters = month.name
)
library(readr)
# Read a comma-separated file from a URL and preview its first rows.
df <- read_csv("https://goo.gl/v7nvho")
head(df)
# Read a tab-separated file from a URL.
df <- read_tsv("https://goo.gl/33r2Ut")
head(df)
# Read the same URL again, this time naming the tab delimiter explicitly.
df <- read_delim("https://goo.gl/33r2Ut", delim = "\t")
head(df)
library(readxl)
# Path to an example workbook; presumably one shipped with readxl -- confirm.
xlsx_example <- readxl_example("datasets.xlsx")
# No `sheet` argument is given here, so read_excel() picks the sheet itself.
df <- read_excel(xlsx_example)
head(df)
# List the sheet names available in the workbook.
excel_sheets(xlsx_example)
## [1] "iris" "mtcars" "chickwts" "quakes"
# Read one specific sheet, selected by name.
df <- read_excel(xlsx_example, sheet = "mtcars")
head(df)
# Remove the temporary data frame from the workspace.
rm(df)
dplyr
# Load the data set used by the examples that follow, then print it.
homo <- read_csv("http://goo.gl/Zjr9aF")
homo
Most of the examples in this presentation are based on Hau 2007. The experiment consisted of a perception and judgment test aimed at measuring the correlation between acoustic cues and perceived sexual orientation. Naïve Cantonese speakers were asked to listen to the Cantonese speech samples collected in the experiment and judge whether the speakers were gay or heterosexual. There are 14 speakers and the following parameters:
dplyr::filter()
How many speakers are older than 28 and have an [s] duration shorter than 60 ms?
# Keep only the rows where both conditions hold: age above 28 and
# s.duration.ms below 60.
homo %>%
  filter(age > 28, s.duration.ms < 60)
%>%
— конвейер (pipe) отправляет результат работы одной функции в другую.
# Nested form: the calls are read from the inside out.
sort(sqrt(abs(sin(1:22))), decreasing = TRUE)
## [1] 0.9999951 0.9952926 0.9946649 0.9805088 0.9792468 0.9554817 0.9535709
## [8] 0.9173173 0.9146888 0.8699440 0.8665952 0.8105471 0.8064043 0.7375779
## [15] 0.7325114 0.6482029 0.6419646 0.5365662 0.5285977 0.3871398 0.3756594
## [22] 0.0940814

# Piped form of the same computation: each step feeds the next one.
1:22 %>%
  sin() %>%
  abs() %>%
  sqrt() %>%
  sort(., decreasing = TRUE) # why is there a dot here?
## [1] 0.9999951 0.9952926 0.9946649 0.9805088 0.9792468 0.9554817 0.9535709
## [8] 0.9173173 0.9146888 0.8699440 0.8665952 0.8105471 0.8064043 0.7375779
## [15] 0.7325114 0.6482029 0.6419646 0.5365662 0.5285977 0.3871398 0.3756594
## [22] 0.0940814
Конвейеры в tidyverse пришли из пакета magrittr. Иногда они работают некорректно с функциями не из tidyverse.
dplyr::slice()
# Pick rows 3 through 7 by position.
homo %>%
  slice(3:7)
dplyr::select()
# Columns chosen by position.
homo %>%
  select(8:10)

# A contiguous range of columns, given by name.
homo %>%
  select(speaker:average.f0.Hz)

# Everything except one column.
homo %>%
  select(-speaker)

# Everything except several columns.
homo %>%
  select(
    -c(
      speaker,
      perceived.as.hetero,
      perceived.as.homo,
      perceived.as.homo.percent
    )
  )

# An explicit list of columns.
homo %>%
  select(speaker, age, s.duration.ms)
dplyr::arrange()
# Sort by orientation, then by descending age within each orientation.
homo %>%
  arrange(orientation, desc(age))
dplyr::distinct()
# Unique values of a single column.
homo %>%
  distinct(orientation)

# Unique combinations of a column and a computed logical expression.
homo %>%
  distinct(orientation, age > 20)
dplyr::mutate()
# Add two columns: the average f0 minus and plus half of the f0 range.
homo %>%
  mutate(
    f0.mn = average.f0.Hz - f0.range.Hz / 2,
    f0.mx = average.f0.Hz + f0.range.Hz / 2
  )
dplyr::group_by(...) %>% summarise(...)
# Summaries computed over the whole table.
homo %>%
  summarise(min(age), mean(s.duration.ms))

# A named summary computed separately for each orientation.
homo %>%
  group_by(orientation) %>%
  summarise(my_mean = mean(s.duration.ms))

# The same summary without an explicit name for the result column.
homo %>%
  group_by(orientation) %>%
  summarise(mean(s.duration.ms))

# And once more with a different result-column name.
homo %>%
  group_by(orientation) %>%
  summarise(mean_by_orientation = mean(s.duration.ms))
Если нужно посчитать количество вхождений, то можно использовать функцию n()
в summarise()
или же функцию count()
:
# Group means together with the number of rows in each group.
homo %>%
  group_by(orientation, age > 20) %>%
  summarise(my_mean = mean(s.duration.ms), n_observations = n())

# count() tallies the rows for the same grouping in a single call.
homo %>%
  count(orientation, age > 20)
dplyr::.._join()
# Two small tables that share the `countries` column.
# `tibble()` is used instead of `data_frame()`, which is deprecated in current
# tibble releases.
languages <- tibble(
  languages = c("Selkup", "French", "Chukchi", "Kashubian"),
  countries = c("Russia", "France", "Russia", "Poland"),
  iso = c("sel", "fra", "ckt", "pol")
)
languages

country_population <- tibble(
  countries = c("Russia", "Poland", "Finland"),
  population_mln = c(143, 38, 5)
)
country_population

# No `by` is given, so each join matches on the column the two tables have in
# common (`countries`).
inner_join(languages, country_population)
left_join(languages, country_population)
right_join(languages, country_population)
anti_join(languages, country_population)
anti_join(country_population, languages)
full_join(country_population, languages)
Существует достаточно забавный трюк, который позволяет использовать .._join()
вместе с group_by()
и summarise()
:
# Summary per group.
homo %>%
  group_by(orientation, age > 20) %>%
  summarise(my_mean = mean(s.duration.ms), n_observations = n())

# The same summary joined back onto the original rows. No `by` is given, so
# the join uses whatever columns the two tables share.
homo %>%
  group_by(orientation, age > 20) %>%
  summarise(my_mean = mean(s.duration.ms), n_observations = n()) %>%
  left_join(homo)
# Wide table: one row per consonant class, one column per position.
df.short <- data.frame(
  consonant = c("stops", "fricatives", "affricates", "nasals"),
  initial = c(123, 87, 73, 7),
  intervocal = c(57, 77, 82, 78),
  final = c(30, 69, 12, 104)
)
df.short
tidyr::gather()
# Wide table: one row per consonant class, one column per position.
df.short <- data.frame(
  consonant = c("stops", "fricatives", "affricates", "nasals"),
  initial = c(123, 87, 73, 7),
  intervocal = c(57, 77, 82, 78),
  final = c(30, 69, 12, 104)
)
df.short

# Wide to long: the columns initial..final are stacked into a `position` /
# `number` pair of columns.
df.long <- df.short %>%
  gather(position, number, initial:final)
df.long
tidyr::spread()
# Long back to wide: the values of `position` become column names again, and
# the result overwrites df.short.
df.short <- df.long %>%
  spread(position, number)
df.short
Anscombe, F. J. (1973), “Graphs in Statistical Analysis”, presented the following sets of data:
# Load the data and print it.
quartet <- read.csv("https://goo.gl/KuuzYy")
quartet

# Summary statistics per dataset, rounded to two decimals. The grouping column
# is dropped before round() so that only the remaining columns are rounded.
quartet %>%
  group_by(dataset) %>%
  summarise(
    mean_X = mean(x),
    mean_Y = mean(y),
    sd_X = sd(x),
    sd_Y = sd(y),
    cor = cor(x, y),
    n_obs = n()
  ) %>%
  select(-dataset) %>%
  round(., 2)
Matejka and Fitzmaurice (2017), “Same Stats, Different Graphs”, presented the following sets of data:
# Load the tab-separated data and preview its first rows.
datasaurus <- read_tsv("https://goo.gl/gtaunr")
head(datasaurus)

# Summary statistics per dataset, rounded to one decimal. The grouping column
# is dropped before round() so that only the remaining columns are rounded.
datasaurus %>%
  group_by(dataset) %>%
  summarise(
    mean_X = mean(x),
    mean_Y = mean(y),
    sd_X = sd(x),
    sd_Y = sd(y),
    cor = cor(x, y),
    n_obs = n()
  ) %>%
  select(-dataset) %>%
  round(., 1)
# Basic scatter plot, with the data passed directly to ggplot().
ggplot(data = homo, aes(s.duration.ms, vowel.duration.ms)) +
  geom_point()

# Linear-model smoother, with the points coloured by orientation.
homo %>%
  ggplot(aes(average.f0.Hz, age)) +
  geom_smooth(method = "lm") +
  geom_point(aes(color = orientation))

# Colour mapped to a variable.
homo %>%
  ggplot(
    aes(s.duration.ms, vowel.duration.ms, color = orientation)
  ) +
  geom_point()

# Shape mapped to a variable; the colour is a fixed value set outside aes().
homo %>%
  ggplot(
    aes(s.duration.ms, vowel.duration.ms, shape = orientation)
  ) +
  geom_point(color = "green")

# Point size mapped to a variable.
homo %>%
  ggplot(
    aes(s.duration.ms, vowel.duration.ms, size = age)
  ) +
  geom_point()
# Draw a symbol chosen by orientation as a filled label box.
homo %>%
  mutate(label = ifelse(orientation == "homo", "⚣", "⚤")) %>%
  ggplot(
    aes(
      s.duration.ms,
      vowel.duration.ms,
      label = label,
      fill = orientation
    )
  ) +
  geom_label()

# The same symbols drawn as plain coloured text.
homo %>%
  mutate(label = ifelse(orientation == "homo", "⚣", "⚤")) %>%
  ggplot(
    aes(
      s.duration.ms,
      vowel.duration.ms,
      label = label,
      color = orientation
    )
  ) +
  geom_text()
# Title, subtitle and caption set through labs().
homo %>%
  ggplot(aes(s.duration.ms, vowel.duration.ms)) +
  geom_point() +
  labs(
    title = "length of [s] vs. length of vowels",
    subtitle = "based on 14 speakers of Cantonese",
    caption = "data from [Hau 2007]"
  )

# Axis labels set through xlab() and ylab().
homo %>%
  ggplot(aes(s.duration.ms, vowel.duration.ms)) +
  geom_point() +
  xlab("duration of [s] in ms") +
  ylab("vowel duration in ms")
Let's use the frequency dictionary for Russian
# Load the tab-separated frequency table.
freq <- read.csv("https://goo.gl/TlX7xW", sep = "\t")

# The 200 rows with the highest Freq.ipm., plotted against Rank.
freq %>%
  arrange(desc(Freq.ipm.)) %>%
  slice(1:200) %>%
  ggplot(aes(Rank, Freq.ipm.)) +
  geom_point() +
  xlab("") +
  ylab("ipm")

# All rows on a log-scaled y axis. The x position is the row index, derived
# from the data itself instead of a hard-coded row count (previously 1:52138),
# so the plot keeps working if the number of rows changes.
freq %>%
  ggplot(aes(seq_along(Freq.ipm.), Freq.ipm.)) +
  geom_point() +
  xlab("") +
  ylab("ipm") +
  scale_y_log10()
# Scatter plot with a rug layer added.
homo %>%
  ggplot(
    aes(s.duration.ms, vowel.duration.ms, color = orientation)
  ) +
  geom_point() +
  geom_rug()

# Reference lines: horizontal at the mean vowel duration, vertical at 60.
homo %>%
  ggplot(aes(s.duration.ms, vowel.duration.ms)) +
  geom_point() +
  geom_hline(yintercept = mean(homo$vowel.duration.ms)) +
  geom_vline(xintercept = 60)

# Line type and line size set per reference line.
homo %>%
  ggplot(aes(s.duration.ms, vowel.duration.ms)) +
  geom_point() +
  geom_hline(yintercept = 120, linetype = 2) +
  geom_vline(xintercept = 60, size = 5)

# A different line type and a fixed colour.
homo %>%
  ggplot(aes(s.duration.ms, vowel.duration.ms)) +
  geom_point() +
  geom_hline(yintercept = 120, linetype = 4) +
  geom_vline(xintercept = 60, color = "blue")
Функция annotate
добавляет geom
к графику.
# Highlight a region with a translucent red rectangle and put a text note
# above it.
homo %>%
  ggplot(aes(s.duration.ms, vowel.duration.ms)) +
  geom_point() +
  annotate(
    geom = "rect",
    xmin = 77, xmax = 79,
    ymin = 117, ymax = 122,
    fill = "red", alpha = 0.2
  ) +
  annotate(
    geom = "text",
    x = 78, y = 125,
    label = "Who is that?\n Outlier?"
  )
There are two possible situations:
# Preview two pairs of columns, selected by position.
head(homo[, c(1, 9)])
head(homo[, c(1, 10)])

# Bar chart with only x mapped: geom_bar() derives the bar heights itself.
homo %>%
  ggplot(aes(orientation)) +
  geom_bar()

# Bar chart with both x and y mapped: geom_col() takes the heights from `age`.
homo %>%
  ggplot(aes(speaker, age)) +
  geom_col()

# The same chart with the bars filled by orientation.
homo %>%
  ggplot(aes(speaker, age, fill = orientation)) +
  geom_col()
# Box plot of s.duration.ms per orientation.
homo %>%
  ggplot(aes(orientation, s.duration.ms)) +
  geom_boxplot()

# Box plot with the raw points drawn on top.
homo %>%
  ggplot(aes(orientation, s.duration.ms)) +
  geom_boxplot() +
  geom_point()

# Box plot with jittered points, using an explicit jitter width.
homo %>%
  ggplot(aes(orientation, s.duration.ms)) +
  geom_boxplot() +
  geom_jitter(width = 0.5)

# Violin plot with jittered points at the default jitter settings.
homo %>%
  ggplot(aes(orientation, s.duration.ms)) +
  geom_violin() +
  geom_jitter()
# Two logical flags derived from mpg, shown as jittered points.
mtcars %>%
  mutate(
    newvar = mpg > 22,
    newvr = mpg < 17
  ) %>%
  ggplot(aes(newvr, newvar, color = newvar)) +
  geom_jitter(width = 0.2)

# The same flags aggregated: one point per combination, sized and labelled by
# the number of rows in it. The size legend is hidden with "none" instead of
# `F`: `F` is an ordinary variable that can be reassigned, and passing a
# logical to guides() is deprecated in current ggplot2.
mtcars %>%
  mutate(
    newvar = mpg > 22,
    newvr = mpg < 17
  ) %>%
  group_by(newvar, newvr) %>%
  summarise(number = n()) %>%
  ggplot(aes(newvr, newvar, label = number)) +
  geom_point(aes(size = number, color = newvar)) +
  geom_text() +
  scale_size(range = c(10, 30)) +
  guides(size = "none")
# Histogram of s.duration.ms with the default binning.
homo %>%
  ggplot(aes(s.duration.ms)) +
  geom_histogram()
How many histogram bins do we need?
# Three functions that each suggest a number of histogram classes.
nclass.Sturges(homo$s.duration.ms)
nclass.scott(homo$s.duration.ms)
nclass.FD(homo$s.duration.ms)

# Histogram using the number of bins suggested by nclass.FD().
homo %>%
  ggplot(aes(s.duration.ms)) +
  geom_histogram(bins = nclass.FD(homo$s.duration.ms))

# Histogram with a fixed fill colour.
homo %>%
  ggplot(aes(s.duration.ms)) +
  geom_histogram(fill = "lightblue")

# Density curve.
homo %>%
  ggplot(aes(s.duration.ms)) +
  geom_density()

# Density curve with a coloured outline.
homo %>%
  ggplot(aes(s.duration.ms)) +
  geom_density(color = "blue")

# Density curve with a filled area.
homo %>%
  ggplot(aes(s.duration.ms)) +
  geom_density(fill = "lightblue")

# One density per orientation, distinguished by fill.
homo %>%
  ggplot(aes(s.duration.ms, fill = orientation)) +
  geom_density()

# The same densities made translucent.
homo %>%
  ggplot(aes(s.duration.ms, fill = orientation)) +
  geom_density(alpha = 0.2)

# Ridge-line version, one ridge per orientation.
library(ggridges)
homo %>%
  ggplot(aes(s.duration.ms, orientation, fill = orientation)) +
  geom_density_ridges()
Фасетизация — наиболее сильное оружие ggplot2
, позволяющее разбить данные по одной или нескольким переменным и нанести на график получившиеся подгруппы.
ggplot2::facet_wrap()
# One panel per orientation.
homo %>%
  ggplot(aes(speaker, s.duration.ms)) +
  geom_point() +
  facet_wrap(~orientation)

# The same panels with scales = "free".
homo %>%
  ggplot(aes(speaker, s.duration.ms)) +
  geom_point() +
  facet_wrap(~orientation, scales = "free")

# The same panels with scales = "free_x".
homo %>%
  ggplot(aes(speaker, s.duration.ms)) +
  geom_point() +
  facet_wrap(~orientation, scales = "free_x")
ggplot2::facet_grid()
# Two faceting variables passed to facet_wrap(): an age flag and orientation.
homo %>%
  mutate(older_than_28 = ifelse(age > 28, "older", "younger")) %>%
  ggplot(aes(speaker, s.duration.ms)) +
  geom_point() +
  facet_wrap(older_than_28 ~ orientation, scales = "free_x")

# The same two variables passed to facet_grid(): the age flag on the left of
# the formula, orientation on the right.
homo %>%
  mutate(older_than_28 = ifelse(age > 28, "older", "younger")) %>%
  ggplot(aes(speaker, s.duration.ms)) +
  geom_point() +
  facet_grid(older_than_28 ~ orientation, scales = "free_x")
Существует еще славный аргумент margins
.
# The age-flag by orientation grid again, this time with margins = TRUE.
homo %>%
  mutate(older_than_28 = ifelse(age > 28, "older", "younger")) %>%
  ggplot(aes(speaker, s.duration.ms)) +
  geom_point() +
  facet_grid(
    older_than_28 ~ orientation,
    scales = "free_x",
    margins = TRUE
  )
Иногда очень хорошо показывать все данные на каждом фасете:
# Show the whole data set in grey on every facet, with each facet's own points
# drawn on top. The grey layer gets a copy of the data without the faceting
# variable, so it is not split across panels. The column is dropped by name
# rather than by position (previously homo[,-9]), so the plot does not depend
# on the column order of `homo`.
homo %>%
  ggplot(aes(speaker, s.duration.ms)) +
  geom_point(
    data = select(homo, -orientation),
    aes(speaker, s.duration.ms),
    color = "grey"
  ) +
  geom_point() +
  facet_wrap(~orientation) +
  theme_bw()