tidyverse — это набор пакетов:
Полезно также знать:
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1 ✔ purrr 0.2.4
## ✔ tibble 1.4.2 ✔ dplyr 0.7.4
## ✔ tidyr 0.8.0 ✔ stringr 1.3.0
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ───────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# Compare how the same data prints as a base data.frame and as a tibble.
head(iris)
as_tibble(iris)

# Build a tibble from scratch. tibble() is the supported constructor;
# data_frame() is a deprecated alias for it.
tibble(id = 1:12,
       letters = month.name)
library(readr)
# Read a comma-separated file directly from a URL and preview its first rows.
df <- read_csv(file = "https://goo.gl/v7nvho")
df %>%
  head()
Gender <chr> | Age <int> | AgeGroup <chr> | Education <chr> | City <chr> | SubjectCode <chr> | Score <chr> | GivenScore <int> | Stimulus <chr> | Prefix <chr> | |
---|---|---|---|---|---|---|---|---|---|---|
female | 16 | child | school | Izhevsk | AA | A | 5 | utochnit | u | |
female | 16 | child | school | Izhevsk | AA | A | 5 | ozhestochit | o | |
female | 16 | child | school | Izhevsk | AA | E | 1 | ovneshnit | o | |
female | 16 | child | school | Izhevsk | AA | E | 1 | oblusit | o | |
female | 16 | child | school | Izhevsk | AA | A | 5 | osvezhit | o | |
female | 16 | child | school | Izhevsk | AA | A | 5 | oschastlivit | o |
# Read a tab-separated file directly from a URL and preview its first rows.
df <- read_tsv(file = "https://goo.gl/33r2Ut")
df %>%
  head()
speaker <chr> | year <int> | gender <chr> | cons <int> | inn <int> | total <int> |
---|---|---|---|---|---|
ait1954 | 1954 | f | 222 | 254 | 476 |
anp1929 | 1929 | m | 15 | 16 | 31 |
ans1925 | 1925 | f | 788 | 379 | 1167 |
avch1930 | 1930 | f | 43 | 6 | 49 |
avi1958 | 1958 | f | 1 | 31 | 32 |
avm1922 | 1922 | f | 778 | 235 | 1013 |
# Same file again, this time through the generic reader with an explicit
# tab delimiter.
df <- read_delim(file = "https://goo.gl/33r2Ut", delim = "\t")
df %>%
  head()
speaker <chr> | year <int> | gender <chr> | cons <int> | inn <int> | total <int> |
---|---|---|---|---|---|
ait1954 | 1954 | f | 222 | 254 | 476 |
anp1929 | 1929 | m | 15 | 16 | 31 |
ans1925 | 1925 | f | 788 | 379 | 1167 |
avch1930 | 1930 | f | 43 | 6 | 49 |
avi1958 | 1958 | f | 1 | 31 | 32 |
avm1922 | 1922 | f | 778 | 235 | 1013 |
library(readxl)
# Locate the example workbook bundled with readxl, read it with the default
# sheet selection, and preview the result.
xlsx_example <- readxl_example(path = "datasets.xlsx")
df <- read_excel(path = xlsx_example)
df %>%
  head()
Sepal.Length <dbl> | Sepal.Width <dbl> | Petal.Length <dbl> | Petal.Width <dbl> | Species <chr> |
---|---|---|---|---|
5.1 | 3.5 | 1.4 | 0.2 | setosa |
4.9 | 3.0 | 1.4 | 0.2 | setosa |
4.7 | 3.2 | 1.3 | 0.2 | setosa |
4.6 | 3.1 | 1.5 | 0.2 | setosa |
5.0 | 3.6 | 1.4 | 0.2 | setosa |
5.4 | 3.9 | 1.7 | 0.4 | setosa |
# List the names of all sheets in the workbook.
xlsx_example %>%
  excel_sheets()
## [1] "iris" "mtcars" "chickwts" "quakes"
# Read one specific sheet, selected by name, and preview it.
df <- read_excel(path = xlsx_example, sheet = "mtcars")
df %>%
  head()
mpg <dbl> | cyl <dbl> | disp <dbl> | hp <dbl> | drat <dbl> | wt <dbl> | qsec <dbl> | vs <dbl> | am <dbl> | gear <dbl> | |
---|---|---|---|---|---|---|---|---|---|---|
21.0 | 6 | 160 | 110 | 3.90 | 2.620 | 16.46 | 0 | 1 | 4 | |
21.0 | 6 | 160 | 110 | 3.90 | 2.875 | 17.02 | 0 | 1 | 4 | |
22.8 | 4 | 108 | 93 | 3.85 | 2.320 | 18.61 | 1 | 1 | 4 | |
21.4 | 6 | 258 | 110 | 3.08 | 3.215 | 19.44 | 1 | 0 | 3 | |
18.7 | 8 | 360 | 175 | 3.15 | 3.440 | 17.02 | 0 | 0 | 3 | |
18.1 | 6 | 225 | 105 | 2.76 | 3.460 | 20.22 | 1 | 0 | 3 |
# Remove the temporary `df` object from the workspace.
rm(df)
dplyr
# Load the data set used throughout the dplyr and ggplot2 examples and
# print it.
homo <- read_csv(file = "http://goo.gl/Zjr9aF")
homo
speaker <chr> | s.duration.ms <dbl> | vowel.duration.ms <dbl> | average.f0.Hz <dbl> | f0.range.Hz <dbl> | perceived.as.homo <int> | perceived.as.hetero <int> | |
---|---|---|---|---|---|---|---|
A | 61.40 | 112.60 | 119.51 | 52.5 | 7 | 18 | |
B | 63.90 | 126.49 | 100.29 | 114.0 | 20 | 5 | |
C | 55.08 | 126.81 | 114.90 | 103.2 | 9 | 16 | |
D | 78.11 | 119.17 | 126.61 | 58.8 | 15 | 10 | |
E | 64.71 | 93.68 | 130.76 | 37.4 | 10 | 15 | |
F | 67.00 | 127.87 | 150.79 | 42.0 | 17 | 8 | |
G | 65.39 | 147.52 | 128.96 | 118.2 | 20 | 5 | |
H | 62.46 | 120.13 | 105.26 | 55.7 | 21 | 4 | |
I | 60.45 | 140.44 | 109.86 | 96.4 | 20 | 5 | |
J | 59.59 | 121.01 | 123.90 | 111.7 | 8 | 17 |
The majority of examples in this presentation are based on Hau 2007. The experiment consisted of a perception and judgment test aimed at measuring the correlation between acoustic cues and perceived sexual orientation. Naïve Cantonese speakers were asked to listen to the Cantonese speech samples collected in the experiment and judge whether the speakers were gay or heterosexual. There are 14 speakers and the following parameters:
dplyr::filter()
Which speakers are older than 28 and have an [s] duration shorter than 60 ms?
# Keep rows that satisfy both conditions: age above 28 and [s] duration
# below 60 ms.
homo %>%
  filter(age > 28 & s.duration.ms < 60)
speaker <chr> | s.duration.ms <dbl> | vowel.duration.ms <dbl> | average.f0.Hz <dbl> | f0.range.Hz <dbl> | perceived.as.homo <int> | perceived.as.hetero <int> | |
---|---|---|---|---|---|---|---|
C | 55.08 | 126.81 | 114.90 | 103.2 | 9 | 16 | |
J | 59.59 | 121.01 | 123.90 | 111.7 | 8 | 17 | |
N | 57.67 | 118.02 | 121.48 | 37.4 | 4 | 21 |
%>%
— конвейер (pipe) отправляет результат работы одной функции в другую.
# Nested-call form: the innermost call runs first, so it is read inside-out.
sort(
  sqrt(abs(sin(1:22))),
  decreasing = TRUE
)
## [1] 0.9999951 0.9952926 0.9946649 0.9805088 0.9792468 0.9554817 0.9535709
## [8] 0.9173173 0.9146888 0.8699440 0.8665952 0.8105471 0.8064043 0.7375779
## [15] 0.7325114 0.6482029 0.6419646 0.5365662 0.5285977 0.3871398 0.3756594
## [22] 0.0940814
# The same computation written as a pipeline: each step receives the result
# of the previous one, so it is read top to bottom.
1:22 %>%
  sin() %>%
  abs() %>%
  sqrt() %>%
  sort(., decreasing = TRUE) # why is the dot here?
## [1] 0.9999951 0.9952926 0.9946649 0.9805088 0.9792468 0.9554817 0.9535709
## [8] 0.9173173 0.9146888 0.8699440 0.8665952 0.8105471 0.8064043 0.7375779
## [15] 0.7325114 0.6482029 0.6419646 0.5365662 0.5285977 0.3871398 0.3756594
## [22] 0.0940814
Конвейеры в tidyverse пришли из пакета magrittr. Иногда они работают некорректно с функциями не из tidyverse.
dplyr::slice()
# Pick rows by position: the third through the seventh.
homo %>%
  slice(seq(3, 7))
speaker <chr> | s.duration.ms <dbl> | vowel.duration.ms <dbl> | average.f0.Hz <dbl> | f0.range.Hz <dbl> | perceived.as.homo <int> | perceived.as.hetero <int> | |
---|---|---|---|---|---|---|---|
C | 55.08 | 126.81 | 114.90 | 103.2 | 9 | 16 | |
D | 78.11 | 119.17 | 126.61 | 58.8 | 15 | 10 | |
E | 64.71 | 93.68 | 130.76 | 37.4 | 10 | 15 | |
F | 67.00 | 127.87 | 150.79 | 42.0 | 17 | 8 | |
G | 65.39 | 147.52 | 128.96 | 118.2 | 20 | 5 |
dplyr::select()
# Pick columns by position: the eighth through the tenth.
homo %>%
  select(8, 9, 10)
perceived.as.homo.percent <dbl> | orientation <chr> | age <int> | ||
---|---|---|---|---|
0.28 | hetero | 30 | ||
0.80 | hetero | 19 | ||
0.36 | homo | 29 | ||
0.60 | homo | 36 | ||
0.40 | homo | 27 | ||
0.68 | homo | 33 | ||
0.80 | hetero | 28 | ||
0.84 | hetero | 22 | ||
0.80 | homo | 22 | ||
0.32 | homo | 40 |
# Pick a contiguous run of columns by naming its first and last member.
select(homo, speaker:average.f0.Hz)
speaker <chr> | s.duration.ms <dbl> | vowel.duration.ms <dbl> | average.f0.Hz <dbl> | |
---|---|---|---|---|
A | 61.40 | 112.60 | 119.51 | |
B | 63.90 | 126.49 | 100.29 | |
C | 55.08 | 126.81 | 114.90 | |
D | 78.11 | 119.17 | 126.61 | |
E | 64.71 | 93.68 | 130.76 | |
F | 67.00 | 127.87 | 150.79 | |
G | 65.39 | 147.52 | 128.96 | |
H | 62.46 | 120.13 | 105.26 | |
I | 60.45 | 140.44 | 109.86 | |
J | 59.59 | 121.01 | 123.90 |
# A minus sign drops the named column and keeps everything else.
select(homo, -speaker)
s.duration.ms <dbl> | vowel.duration.ms <dbl> | average.f0.Hz <dbl> | f0.range.Hz <dbl> | perceived.as.homo <int> | perceived.as.hetero <int> | |
---|---|---|---|---|---|---|
61.40 | 112.60 | 119.51 | 52.5 | 7 | 18 | |
63.90 | 126.49 | 100.29 | 114.0 | 20 | 5 | |
55.08 | 126.81 | 114.90 | 103.2 | 9 | 16 | |
78.11 | 119.17 | 126.61 | 58.8 | 15 | 10 | |
64.71 | 93.68 | 130.76 | 37.4 | 10 | 15 | |
67.00 | 127.87 | 150.79 | 42.0 | 17 | 8 | |
65.39 | 147.52 | 128.96 | 118.2 | 20 | 5 | |
62.46 | 120.13 | 105.26 | 55.7 | 21 | 4 | |
60.45 | 140.44 | 109.86 | 96.4 | 20 | 5 | |
59.59 | 121.01 | 123.90 | 111.7 | 8 | 17 |
# Drop several columns at once.
homo %>%
  select(
    -speaker,
    -perceived.as.hetero,
    -perceived.as.homo,
    -perceived.as.homo.percent
  )
s.duration.ms <dbl> | vowel.duration.ms <dbl> | average.f0.Hz <dbl> | f0.range.Hz <dbl> | orientation <chr> | age <int> |
---|---|---|---|---|---|
61.40 | 112.60 | 119.51 | 52.5 | hetero | 30 |
63.90 | 126.49 | 100.29 | 114.0 | hetero | 19 |
55.08 | 126.81 | 114.90 | 103.2 | homo | 29 |
78.11 | 119.17 | 126.61 | 58.8 | homo | 36 |
64.71 | 93.68 | 130.76 | 37.4 | homo | 27 |
67.00 | 127.87 | 150.79 | 42.0 | homo | 33 |
65.39 | 147.52 | 128.96 | 118.2 | hetero | 28 |
62.46 | 120.13 | 105.26 | 55.7 | hetero | 22 |
60.45 | 140.44 | 109.86 | 96.4 | homo | 22 |
59.59 | 121.01 | 123.90 | 111.7 | homo | 40 |
# Pick columns by name; the result follows the order given here.
select(homo, speaker, age, s.duration.ms)
speaker <chr> | age <int> | s.duration.ms <dbl> | ||
---|---|---|---|---|
A | 30 | 61.40 | ||
B | 19 | 63.90 | ||
C | 29 | 55.08 | ||
D | 36 | 78.11 | ||
E | 27 | 64.71 | ||
F | 33 | 67.00 | ||
G | 28 | 65.39 | ||
H | 22 | 62.46 | ||
I | 22 | 60.45 | ||
J | 40 | 59.59 |
dplyr::arrange()
# Sort by orientation first, then by age from oldest to youngest.
homo %>%
  arrange(
    orientation,
    desc(age)
  )
speaker <chr> | s.duration.ms <dbl> | vowel.duration.ms <dbl> | average.f0.Hz <dbl> | f0.range.Hz <dbl> | perceived.as.homo <int> | perceived.as.hetero <int> | |
---|---|---|---|---|---|---|---|
A | 61.40 | 112.60 | 119.51 | 52.5 | 7 | 18 | |
N | 57.67 | 118.02 | 121.48 | 37.4 | 4 | 21 | |
G | 65.39 | 147.52 | 128.96 | 118.2 | 20 | 5 | |
L | 53.31 | 112.05 | 146.20 | 57.8 | 8 | 17 | |
H | 62.46 | 120.13 | 105.26 | 55.7 | 21 | 4 | |
M | 45.13 | 133.74 | 155.34 | 100.5 | 9 | 16 | |
B | 63.90 | 126.49 | 100.29 | 114.0 | 20 | 5 | |
J | 59.59 | 121.01 | 123.90 | 111.7 | 8 | 17 | |
D | 78.11 | 119.17 | 126.61 | 58.8 | 15 | 10 | |
F | 67.00 | 127.87 | 150.79 | 42.0 | 17 | 8 |
dplyr::distinct()
# Unique values of a single column.
distinct(homo, orientation)
orientation <chr> | ||||
---|---|---|---|---|
hetero | ||||
homo |
# Unique combinations of a column and a computed logical expression.
distinct(homo, orientation, age > 20)
orientation <chr> | age > 20 <lgl> | |||
---|---|---|---|---|
hetero | TRUE | |||
hetero | FALSE | |||
homo | TRUE |
dplyr::mutate()
# Add two derived columns: the mean f0 minus and plus half of the f0 range.
homo %>%
  mutate(
    f0.mn = average.f0.Hz - f0.range.Hz / 2,
    f0.mx = average.f0.Hz + f0.range.Hz / 2
  )
speaker <chr> | s.duration.ms <dbl> | vowel.duration.ms <dbl> | average.f0.Hz <dbl> | f0.range.Hz <dbl> | perceived.as.homo <int> | perceived.as.hetero <int> | |
---|---|---|---|---|---|---|---|
A | 61.40 | 112.60 | 119.51 | 52.5 | 7 | 18 | |
B | 63.90 | 126.49 | 100.29 | 114.0 | 20 | 5 | |
C | 55.08 | 126.81 | 114.90 | 103.2 | 9 | 16 | |
D | 78.11 | 119.17 | 126.61 | 58.8 | 15 | 10 | |
E | 64.71 | 93.68 | 130.76 | 37.4 | 10 | 15 | |
F | 67.00 | 127.87 | 150.79 | 42.0 | 17 | 8 | |
G | 65.39 | 147.52 | 128.96 | 118.2 | 20 | 5 | |
H | 62.46 | 120.13 | 105.26 | 55.7 | 21 | 4 | |
I | 60.45 | 140.44 | 109.86 | 96.4 | 20 | 5 | |
J | 59.59 | 121.01 | 123.90 | 111.7 | 8 | 17 |
dplyr::group_by(...) %>% summarise(...)
# Collapse the whole table into one row of summary values.
homo %>%
  summarise(
    min(age),
    mean(s.duration.ms)
  )
min(age) <dbl> | mean(s.duration.ms) <dbl> | |||
---|---|---|---|---|
19 | 61.22429 |
# One summary row per orientation, with an explicit name for the new column.
homo %>%
  group_by(orientation) %>%
  summarise(
    my_mean = mean(s.duration.ms)
  )
orientation <chr> | my_mean <dbl> | |||
---|---|---|---|---|
hetero | 58.46571 | |||
homo | 63.98286 |
# Without an explicit name, the summary column is named after its expression.
homo %>%
  group_by(orientation) %>%
  summarise(
    mean(s.duration.ms)
  )
orientation <chr> | mean(s.duration.ms) <dbl> | |||
---|---|---|---|---|
hetero | 58.46571 | |||
homo | 63.98286 |
# The same grouped mean, stored under a descriptive column name.
homo %>%
  group_by(orientation) %>%
  summarise(
    mean_by_orientation = mean(s.duration.ms)
  )
orientation <chr> | mean_by_orientation <dbl> | |||
---|---|---|---|---|
hetero | 58.46571 | |||
homo | 63.98286 |
Если нужно посчитать количество вхождений, то можно использовать функцию n()
в summarise()
или же функцию count()
:
# Group by orientation and by an age flag, then report the mean [s] duration
# and the number of rows in each group.
homo %>%
  group_by(orientation, age > 20) %>%
  summarise(
    my_mean = mean(s.duration.ms),
    n_observations = n()
  )
orientation <chr> | age > 20 <lgl> | my_mean <dbl> | n_observations <int> | |
---|---|---|---|---|
hetero | FALSE | 54.51500 | 2 | |
hetero | TRUE | 60.04600 | 5 | |
homo | TRUE | 63.98286 | 7 |
# count() groups and counts rows in a single step.
count(homo, orientation, age > 20)
orientation <chr> | age > 20 <lgl> | n <int> | ||
---|---|---|---|---|
hetero | FALSE | 2 | ||
hetero | TRUE | 5 | ||
homo | TRUE | 7 |
dplyr::.._join()
# Lookup table of languages with the country and ISO code given for each.
# Built with tibble(), which replaces the deprecated data_frame() alias.
languages <- tibble(
  languages = c("Selkup", "French", "Chukchi", "Kashubian"),
  countries = c("Russia", "France", "Russia", "Poland"),
  iso = c("sel", "fra", "ckt", "pol")
)
languages
languages <chr> | countries <chr> | iso <chr> | ||
---|---|---|---|---|
Selkup | Russia | sel | ||
French | France | fra | ||
Chukchi | Russia | ckt | ||
Kashubian | Poland | pol |
# Lookup table of country populations, in millions.
# Built with tibble(), which replaces the deprecated data_frame() alias.
country_population <- tibble(
  countries = c("Russia", "Poland", "Finland"),
  population_mln = c(143, 38, 5)
)
country_population
countries <chr> | population_mln <dbl> | |||
---|---|---|---|---|
Russia | 143 | |||
Poland | 38 | |||
Finland | 5 |
# Keep only rows whose country appears in both tables. The key is spelled out
# instead of relying on the implicit match on all shared column names.
inner_join(languages, country_population, by = "countries")
languages <chr> | countries <chr> | iso <chr> | population_mln <dbl> | |
---|---|---|---|---|
Selkup | Russia | sel | 143 | |
Chukchi | Russia | ckt | 143 | |
Kashubian | Poland | pol | 38 |
# Keep every row of `languages`; unmatched countries get NA for population.
# The key is spelled out instead of relying on the implicit match.
left_join(languages, country_population, by = "countries")
languages <chr> | countries <chr> | iso <chr> | population_mln <dbl> | |
---|---|---|---|---|
Selkup | Russia | sel | 143 | |
French | France | fra | NA | |
Chukchi | Russia | ckt | 143 | |
Kashubian | Poland | pol | 38 |
# Keep every row of `country_population`; countries with no language row get
# NA in the language columns. The key is spelled out explicitly.
right_join(languages, country_population, by = "countries")
languages <chr> | countries <chr> | iso <chr> | population_mln <dbl> | |
---|---|---|---|---|
Selkup | Russia | sel | 143 | |
Chukchi | Russia | ckt | 143 | |
Kashubian | Poland | pol | 38 | |
NA | Finland | NA | 5 |
# Rows of `languages` whose country has NO match in `country_population`.
anti_join(languages, country_population, by = "countries")
languages <chr> | countries <chr> | iso <chr> | ||
---|---|---|---|---|
French | France | fra |
# Rows of `country_population` whose country has NO match in `languages`.
anti_join(country_population, languages, by = "countries")
countries <chr> | population_mln <dbl> | |||
---|---|---|---|---|
Finland | 5 |
# Keep all rows from both tables, filling the gaps with NA.
# The key is spelled out instead of relying on the implicit match.
full_join(country_population, languages, by = "countries")
countries <chr> | population_mln <dbl> | languages <chr> | iso <chr> | |
---|---|---|---|---|
Russia | 143 | Selkup | sel | |
Russia | 143 | Chukchi | ckt | |
Poland | 38 | Kashubian | pol | |
Finland | 5 | NA | NA | |
France | NA | French | fra |
Существует достаточно забавный трюк, который позволяет использовать .._join()
вместе с group_by()
и summarise()
:
# Per-group summary: mean [s] duration and group size for each combination
# of orientation and the age flag.
homo %>%
  group_by(orientation, age > 20) %>%
  summarise(
    my_mean = mean(s.duration.ms),
    n_observations = n()
  )
orientation <chr> | age > 20 <lgl> | my_mean <dbl> | n_observations <int> | |
---|---|---|---|---|
hetero | FALSE | 54.51500 | 2 | |
hetero | TRUE | 60.04600 | 5 | |
homo | TRUE | 63.98286 | 7 |
# Attach the per-group summary back onto the individual speakers.
# The age flag is stored under a regular column name on both sides so that the
# join can match on orientation AND age group. Matching on orientation alone
# would pair every speaker with every age group of that orientation.
homo %>%
  mutate(older_than_20 = age > 20) %>%
  group_by(orientation, older_than_20) %>%
  summarise(my_mean = mean(s.duration.ms), n_observations = n()) %>%
  left_join(
    mutate(homo, older_than_20 = age > 20),
    by = c("orientation", "older_than_20")
  )
orientation <chr> | age > 20 <lgl> | my_mean <dbl> | n_observations <int> | speaker <chr> | s.duration.ms <dbl> | vowel.duration.ms <dbl> | average.f0.Hz <dbl> | f0.range.Hz <dbl> | |
---|---|---|---|---|---|---|---|---|---|
hetero | FALSE | 54.51500 | 2 | A | 61.40 | 112.60 | 119.51 | 52.5 | |
hetero | FALSE | 54.51500 | 2 | B | 63.90 | 126.49 | 100.29 | 114.0 | |
hetero | FALSE | 54.51500 | 2 | G | 65.39 | 147.52 | 128.96 | 118.2 | |
hetero | FALSE | 54.51500 | 2 | H | 62.46 | 120.13 | 105.26 | 55.7 | |
hetero | FALSE | 54.51500 | 2 | L | 53.31 | 112.05 | 146.20 | 57.8 | |
hetero | FALSE | 54.51500 | 2 | M | 45.13 | 133.74 | 155.34 | 100.5 | |
hetero | FALSE | 54.51500 | 2 | N | 57.67 | 118.02 | 121.48 | 37.4 | |
hetero | TRUE | 60.04600 | 5 | A | 61.40 | 112.60 | 119.51 | 52.5 | |
hetero | TRUE | 60.04600 | 5 | B | 63.90 | 126.49 | 100.29 | 114.0 | |
hetero | TRUE | 60.04600 | 5 | G | 65.39 | 147.52 | 128.96 | 118.2 |
# Wide layout: one row per consonant class, one count column per position.
df.short <- data.frame(
  consonant  = c("stops", "fricatives", "affricates", "nasals"),
  initial    = c(123, 87, 73, 7),
  intervocal = c(57, 77, 82, 78),
  final      = c(30, 69, 12, 104)
)
df.short
consonant <fctr> | initial <dbl> | intervocal <dbl> | final <dbl> | |
---|---|---|---|---|
stops | 123 | 57 | 30 | |
fricatives | 87 | 77 | 69 | |
affricates | 73 | 82 | 12 | |
nasals | 7 | 78 | 104 |
consonant <fctr> | position <chr> | number <dbl> | ||
---|---|---|---|---|
stops | initial | 123 | ||
fricatives | initial | 87 | ||
affricates | initial | 73 | ||
nasals | initial | 7 | ||
stops | intervocal | 57 | ||
fricatives | intervocal | 77 | ||
affricates | intervocal | 82 | ||
nasals | intervocal | 78 | ||
stops | final | 30 | ||
fricatives | final | 69 |
tidyr::gather()
# Wide table used as the input for gather(): one count column per position.
df.short <- data.frame(
  consonant  = c("stops", "fricatives", "affricates", "nasals"),
  initial    = c(123, 87, 73, 7),
  intervocal = c(57, 77, 82, 78),
  final      = c(30, 69, 12, 104)
)
df.short
consonant <fctr> | initial <dbl> | intervocal <dbl> | final <dbl> | |
---|---|---|---|---|
stops | 123 | 57 | 30 | |
fricatives | 87 | 77 | 69 | |
affricates | 73 | 82 | 12 | |
nasals | 7 | 78 | 104 |
# Wide -> long: the columns from `initial` to `final` are stacked into a
# `position` column (former column name) and a `number` column (its value).
df.long <- df.short %>%
  gather(key = position, value = number, initial:final)
df.long
consonant <fctr> | position <chr> | number <dbl> | ||
---|---|---|---|---|
stops | initial | 123 | ||
fricatives | initial | 87 | ||
affricates | initial | 73 | ||
nasals | initial | 7 | ||
stops | intervocal | 57 | ||
fricatives | intervocal | 77 | ||
affricates | intervocal | 82 | ||
nasals | intervocal | 78 | ||
stops | final | 30 | ||
fricatives | final | 69 |
tidyr::spread()
# Long -> wide: each distinct `position` becomes a column filled from `number`.
df.short <- df.long %>%
  spread(key = position, value = number)
df.short
consonant <fctr> | final <dbl> | initial <dbl> | intervocal <dbl> | |
---|---|---|---|---|
affricates | 12 | 73 | 82 | |
fricatives | 69 | 87 | 77 | |
nasals | 104 | 7 | 78 | |
stops | 30 | 123 | 57 |
Anscombe, F. J. (1973), “Graphs in Statistical Analysis”, presented the following data sets:
# Load the quartet data set and print it.
quartet <- read.csv(file = "https://goo.gl/KuuzYy")
quartet
x <int> | y <dbl> | dataset <int> | ||
---|---|---|---|---|
10 | 8.04 | 1 | ||
8 | 6.95 | 1 | ||
13 | 7.58 | 1 | ||
9 | 8.81 | 1 | ||
11 | 8.33 | 1 | ||
14 | 9.96 | 1 | ||
6 | 7.24 | 1 | ||
4 | 4.26 | 1 | ||
12 | 10.84 | 1 | ||
7 | 4.82 | 1 |
# Summary statistics per data set, with the grouping column dropped and all
# values rounded to two decimal places.
quartet %>%
  group_by(dataset) %>%
  summarise(
    mean_X = mean(x),
    mean_Y = mean(y),
    sd_X = sd(x),
    sd_Y = sd(y),
    cor = cor(x, y),
    n_obs = n()
  ) %>%
  select(-dataset) %>%
  round(2)
mean_X <dbl> | mean_Y <dbl> | sd_X <dbl> | sd_Y <dbl> | cor <dbl> | n_obs <dbl> |
---|---|---|---|---|---|
9 | 7.5 | 3.32 | 2.03 | 0.82 | 11 |
9 | 7.5 | 3.32 | 2.03 | 0.82 | 11 |
9 | 7.5 | 3.32 | 2.03 | 0.82 | 11 |
9 | 7.5 | 3.32 | 2.03 | 0.82 | 11 |
Matejka and Fitzmaurice (2017), “Same Stats, Different Graphs”, presented the following data sets:
# Load the datasaurus data set and preview its first rows.
datasaurus <- read_tsv(file = "https://goo.gl/gtaunr")
datasaurus %>%
  head()
dataset <chr> | x <dbl> | y <dbl> | ||
---|---|---|---|---|
dino | 55.3846 | 97.1795 | ||
dino | 51.5385 | 96.0256 | ||
dino | 46.1538 | 94.4872 | ||
dino | 42.8205 | 91.4103 | ||
dino | 40.7692 | 88.3333 | ||
dino | 38.7179 | 84.8718 |
# Summary statistics per data set, with the grouping column dropped and all
# values rounded to one decimal place.
datasaurus %>%
  group_by(dataset) %>%
  summarise(
    mean_X = mean(x),
    mean_Y = mean(y),
    sd_X = sd(x),
    sd_Y = sd(y),
    cor = cor(x, y),
    n_obs = n()
  ) %>%
  select(-dataset) %>%
  round(1)
mean_X <dbl> | mean_Y <dbl> | sd_X <dbl> | sd_Y <dbl> | cor <dbl> | n_obs <dbl> |
---|---|---|---|---|---|
54.3 | 47.8 | 16.8 | 26.9 | -0.1 | 142 |
54.3 | 47.8 | 16.8 | 26.9 | -0.1 | 142 |
54.3 | 47.8 | 16.8 | 26.9 | -0.1 | 142 |
54.3 | 47.8 | 16.8 | 26.9 | -0.1 | 142 |
54.3 | 47.8 | 16.8 | 26.9 | -0.1 | 142 |
54.3 | 47.8 | 16.8 | 26.9 | -0.1 | 142 |
54.3 | 47.8 | 16.8 | 26.9 | -0.1 | 142 |
54.3 | 47.8 | 16.8 | 26.9 | -0.1 | 142 |
54.3 | 47.8 | 16.8 | 26.9 | -0.1 | 142 |
54.3 | 47.8 | 16.8 | 26.9 | -0.1 | 142 |
# Basic scatterplot: [s] duration against vowel duration.
homo %>%
  ggplot(aes(x = s.duration.ms, y = vowel.duration.ms)) +
  geom_point()

# Linear-model smoother for the whole data set, with points coloured by
# orientation; the colour is mapped only inside the point layer.
homo %>%
  ggplot(aes(x = average.f0.Hz, y = age)) +
  geom_smooth(method = "lm") +
  geom_point(aes(color = orientation))

# Map orientation to point colour.
homo %>%
  ggplot(aes(x = s.duration.ms, y = vowel.duration.ms, color = orientation)) +
  geom_point()

# Map orientation to point shape; the colour is a fixed constant, not a mapping.
homo %>%
  ggplot(aes(x = s.duration.ms, y = vowel.duration.ms, shape = orientation)) +
  geom_point(color = "green")

# Map age to point size.
homo %>%
  ggplot(aes(x = s.duration.ms, y = vowel.duration.ms, size = age)) +
  geom_point()

# Draw a symbol label instead of a point; orientation sets the label fill.
homo %>%
  mutate(label = ifelse(orientation == "homo", "⚣", "⚤")) %>%
  ggplot(aes(
    x = s.duration.ms,
    y = vowel.duration.ms,
    label = label,
    fill = orientation
  )) +
  geom_label()

# The same symbols as plain text, coloured by orientation.
homo %>%
  mutate(label = ifelse(orientation == "homo", "⚣", "⚤")) %>%
  ggplot(aes(
    x = s.duration.ms,
    y = vowel.duration.ms,
    label = label,
    color = orientation
  )) +
  geom_text()

# Title, subtitle and caption.
homo %>%
  ggplot(aes(x = s.duration.ms, y = vowel.duration.ms)) +
  geom_point() +
  labs(
    title = "length of [s] vs. length of vowels",
    subtitle = "based on 14 speakers of Cantonese",
    caption = "data from [Hau 2007]"
  )

# Custom axis labels.
homo %>%
  ggplot(aes(x = s.duration.ms, y = vowel.duration.ms)) +
  geom_point() +
  labs(
    x = "duration of [s] in ms",
    y = "vowel duration in ms"
  )
Let's use the frequency dictionary for Russian:
# Frequency dictionary: tab-separated file read with base read.csv().
freq <- read.csv("https://goo.gl/TlX7xW", sep = "\t")

# The 200 most frequent entries: rank against frequency (ipm).
freq %>%
  arrange(desc(Freq.ipm.)) %>%
  slice(1:200) %>%
  ggplot(aes(Rank, Freq.ipm.)) +
  geom_point() +
  xlab("") +
  ylab("ipm")

# All entries in file order, on a log-scaled frequency axis.
# The x position is the row index derived from the data itself, so the plot
# does not depend on a hard-coded number of rows.
freq %>%
  ggplot(aes(seq_along(Freq.ipm.), Freq.ipm.)) +
  geom_point() +
  xlab("") +
  ylab("ipm") +
  scale_y_log10()
# Rug marks along the axes show the marginal distribution of each variable.
homo %>%
  ggplot(aes(x = s.duration.ms, y = vowel.duration.ms, color = orientation)) +
  geom_point() +
  geom_rug()

# Reference lines: a horizontal line at the mean vowel duration and a
# vertical line at 60 ms.
homo %>%
  ggplot(aes(x = s.duration.ms, y = vowel.duration.ms)) +
  geom_point() +
  geom_hline(yintercept = mean(homo$vowel.duration.ms)) +
  geom_vline(xintercept = 60)

# Line type and line size can be set per layer.
homo %>%
  ggplot(aes(x = s.duration.ms, y = vowel.duration.ms)) +
  geom_point() +
  geom_hline(yintercept = 120, linetype = 2) +
  geom_vline(xintercept = 60, size = 5)

# A different line type, plus a coloured vertical line.
homo %>%
  ggplot(aes(x = s.duration.ms, y = vowel.duration.ms)) +
  geom_point() +
  geom_hline(yintercept = 120, linetype = 4) +
  geom_vline(xintercept = 60, color = "blue")
Функция annotate
добавляет geom
к графику.
# Highlight one region with a semi-transparent rectangle and put a text note
# above it. annotate() takes fixed coordinates rather than aesthetic mappings.
homo %>%
  ggplot(aes(x = s.duration.ms, y = vowel.duration.ms)) +
  geom_point() +
  annotate(
    geom = "rect",
    xmin = 77, xmax = 79,
    ymin = 117, ymax = 122,
    fill = "red", alpha = 0.2
  ) +
  annotate(
    geom = "text",
    x = 78, y = 125,
    label = "Who is that?\n Outlier?"
  )
There are two possible situations:
# Preview the first and the ninth column.
homo %>%
  select(1, 9) %>%
  head()
speaker <chr> | orientation <chr> | |||
---|---|---|---|---|
A | hetero | |||
B | hetero | |||
C | homo | |||
D | homo | |||
E | homo | |||
F | homo |
# Preview the first and the tenth column.
homo %>%
  select(1, 10) %>%
  head()
speaker <chr> | age <int> | |||
---|---|---|---|---|
A | 30 | |||
B | 19 | |||
C | 29 | |||
D | 36 | |||
E | 27 | |||
F | 33 |
# geom_bar() counts the rows in each category itself.
homo %>%
  ggplot(aes(x = orientation)) +
  geom_bar()

# geom_col() draws bars whose heights come straight from a column.
homo %>%
  ggplot(aes(x = speaker, y = age)) +
  geom_col()

# The same bars, filled by orientation.
homo %>%
  ggplot(aes(x = speaker, y = age, fill = orientation)) +
  geom_col()

# Boxplot of [s] duration per orientation.
homo %>%
  ggplot(aes(x = orientation, y = s.duration.ms)) +
  geom_boxplot()

# Boxplot with the raw observations drawn on top.
homo %>%
  ggplot(aes(x = orientation, y = s.duration.ms)) +
  geom_boxplot() +
  geom_point()

# Jittering spreads the points horizontally.
homo %>%
  ggplot(aes(x = orientation, y = s.duration.ms)) +
  geom_boxplot() +
  geom_jitter(width = 0.5)

# Violin plot with jittered observations.
homo %>%
  ggplot(aes(x = orientation, y = s.duration.ms)) +
  geom_violin() +
  geom_jitter()
# Two logical variables against each other: jittered raw observations.
mtcars %>%
  mutate(newvar = mpg > 22,
         newvr = mpg < 17) %>%
  ggplot(aes(newvr, newvar, color = newvar)) +
  geom_jitter(width = 0.2)

# The same relation aggregated: one bubble per combination, sized and
# labelled by the number of observations in it.
mtcars %>%
  mutate(newvar = mpg > 22,
         newvr = mpg < 17) %>%
  group_by(newvar, newvr) %>%
  summarise(number = n()) %>%
  ggplot(aes(newvr, newvar, label = number)) +
  geom_point(aes(size = number, color = newvar)) +
  geom_text() +
  scale_size(range = c(10, 30)) +
  # Spell out FALSE: the shorthand F is an ordinary variable that can be
  # reassigned.
  guides(size = FALSE)
# Histogram of [s] duration with the default binning.
ggplot(homo, aes(x = s.duration.ms)) +
  geom_histogram()
How many histogram bins do we need?
# Three rules for choosing the number of histogram bins.
nclass.Sturges(homo$s.duration.ms)
nclass.scott(homo$s.duration.ms)
nclass.FD(homo$s.duration.ms)

# Histogram with the bin count taken from nclass.FD().
homo %>%
  ggplot(aes(x = s.duration.ms)) +
  geom_histogram(bins = nclass.FD(homo$s.duration.ms))

# Histogram with a fixed fill colour.
homo %>%
  ggplot(aes(x = s.duration.ms)) +
  geom_histogram(fill = "lightblue")

# Density estimate.
homo %>%
  ggplot(aes(x = s.duration.ms)) +
  geom_density()

# Density with a coloured outline.
homo %>%
  ggplot(aes(x = s.duration.ms)) +
  geom_density(color = "blue")

# Density with a filled area.
homo %>%
  ggplot(aes(x = s.duration.ms)) +
  geom_density(fill = "lightblue")

# One density per orientation, distinguished by fill.
homo %>%
  ggplot(aes(x = s.duration.ms, fill = orientation)) +
  geom_density()

# Transparency keeps overlapping densities visible.
homo %>%
  ggplot(aes(x = s.duration.ms, fill = orientation)) +
  geom_density(alpha = 0.2)

# Ridgeline plot: one density per orientation, placed along the y axis.
library(ggridges)
homo %>%
  ggplot(aes(x = s.duration.ms, y = orientation, fill = orientation)) +
  geom_density_ridges()
Фасетизация — наиболее сильное оружие ggplot2
, позволяющее разбить данные по одной или нескольким переменным и нанести на график получившиеся подгруппы.
ggplot2::facet_wrap()
# One panel per orientation, with shared axes.
homo %>%
  ggplot(aes(x = speaker, y = s.duration.ms)) +
  geom_point() +
  facet_wrap(~orientation)

# Each panel gets its own x and y scales.
homo %>%
  ggplot(aes(x = speaker, y = s.duration.ms)) +
  geom_point() +
  facet_wrap(~orientation, scales = "free")

# Only the x scale is free; the y axis stays shared across panels.
homo %>%
  ggplot(aes(x = speaker, y = s.duration.ms)) +
  geom_point() +
  facet_wrap(~orientation, scales = "free_x")
ggplot2::facet_grid()
# Facet by two variables with facet_wrap(), for comparison with facet_grid()
# below. The helper column splits speakers at age 28.
homo %>%
  mutate(older_than_28 = ifelse(age > 28, "older", "younger")) %>%
  ggplot(aes(speaker, s.duration.ms)) +
  geom_point() +
  facet_wrap(older_than_28 ~ orientation, scales = "free_x")

# The same split with facet_grid(): the age group is given on the left of the
# formula and orientation on the right.
homo %>%
  mutate(older_than_28 = ifelse(age > 28, "older", "younger")) %>%
  ggplot(aes(speaker, s.duration.ms)) +
  geom_point() +
  facet_grid(older_than_28 ~ orientation, scales = "free_x")
Существует еще славный аргумент margins
.
# facet_grid() with the `margins` argument switched on.
# The helper column splits speakers at age 28.
homo %>%
  mutate(older_than_28 = ifelse(age > 28, "older", "younger")) %>%
  ggplot(aes(speaker, s.duration.ms)) +
  geom_point() +
  facet_grid(older_than_28 ~ orientation, scales = "free_x", margins = TRUE)
Иногда очень полезно показывать все данные на каждом фасете:
# Show the complete data set in grey behind every facet.
homo %>%
  ggplot(aes(speaker, s.duration.ms)) +
  # The background layer gets a copy of the data WITHOUT the faceting variable.
  # The column is dropped by name rather than by position, so the code keeps
  # working if the column order changes.
  geom_point(data = select(homo, -orientation),
             aes(speaker, s.duration.ms), color = "grey") +
  geom_point() +
  facet_wrap(~orientation) +
  theme_bw()