• 1. Введение
  • 2. tibble
  • 3. Чтение файлов: readr, readxl
  • 4. dplyr
    • 4.1 dplyr::filter()
    • 4.2 dplyr::slice()
    • 4.3 dplyr::select()
    • 4.4 dplyr::arrange()
    • 4.5 dplyr::distinct()
    • 4.6 dplyr::mutate()
    • 4.7 dplyr::group_by(...) %>% summarise(...)
    • 4.8 dplyr::.._join()
  • 5. tidyr package
  • 6.1 Anscombe’s quartet
  • 5.2 Datasaurus
  • 6.1 Scatterplot
  • 6.1.1 Scatterplot: color
  • 6.1.2 Scatterplot: shape
  • 6.1.3 Scatterplot: size
  • 6.1.4 Scatterplot: text
  • 6.1.5 Scatterplot: title
  • 6.1.6 Scatterplot: axis
  • 6.1.7 Log scales
  • 6.1.8 Scatterplot: rug
  • 6.1.9 Scatterplot: lines
  • 6.1.10 Scatterplot: annotate
  • 6.2.1 Barplots: basics
  • 6.3.1 Boxplots: basics, points, jitter
  • 6. Preliminary summary: two variables
  • 6.4.1 Histogram: basics
  • 6.5.1 Density plot
  • 6.7 Facets
    • 6.7.1 ggplot2::facet_wrap()
    • 6.7.2 ggplot2::facet_grid()

1. Введение

tidyverse — это набор пакетов:

  • dplyr, для преобразования данных
  • ggplot2, для визуализации
  • tidyr, для формата tidy data
  • readr, для чтения файлов в R
  • purrr, для функционального программирования
  • tibble, для работы с тибблами — современным вариантом датафрейма

Полезно также знать:

  • readxl, для чтения .xls и .xlsx
  • jsonlite, для работы с JSON
  • rvest, для веб-скреппинга
  • lubridate, для работы с временем
  • tidytext, для работы с текстами и корпусами
  • broom, для перевода статистических моделей в tidy формат
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1     ✔ purrr   0.2.4
## ✔ tibble  1.4.2     ✔ dplyr   0.7.4
## ✔ tidyr   0.8.0     ✔ stringr 1.3.0
## ✔ readr   1.1.1     ✔ forcats 0.3.0
## ── Conflicts ───────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

2. tibble

head(iris)
as_tibble(iris)
# Build a small tibble by hand: one row per month, with a numeric id and
# the month name. tibble() is the current constructor; data_frame() is a
# deprecated alias for it.
tibble(id = 1:12,
       letters = month.name)

3. Чтение файлов: readr, readxl

library(readr)
df <- read_csv("https://goo.gl/v7nvho")
head(df)
ABCDEFGHIJ0123456789
Gender
<chr>
Age
<int>
AgeGroup
<chr>
Education
<chr>
City
<chr>
SubjectCode
<chr>
Score
<chr>
GivenScore
<int>
Stimulus
<chr>
Prefix
<chr>
female16childschoolIzhevskAAA5utochnitu
female16childschoolIzhevskAAA5ozhestochito
female16childschoolIzhevskAAE1ovneshnito
female16childschoolIzhevskAAE1oblusito
female16childschoolIzhevskAAA5osvezhito
female16childschoolIzhevskAAA5oschastlivito
df <- read_tsv("https://goo.gl/33r2Ut")
head(df)
ABCDEFGHIJ0123456789
speaker
<chr>
year
<int>
gender
<chr>
cons
<int>
inn
<int>
total
<int>
ait19541954f222254476
anp19291929m151631
ans19251925f7883791167
avch19301930f43649
avi19581958f13132
avm19221922f7782351013
df <- read_delim("https://goo.gl/33r2Ut", delim = "\t")
head(df)
ABCDEFGHIJ0123456789
speaker
<chr>
year
<int>
gender
<chr>
cons
<int>
inn
<int>
total
<int>
ait19541954f222254476
anp19291929m151631
ans19251925f7883791167
avch19301930f43649
avi19581958f13132
avm19221922f7782351013
library(readxl)
xlsx_example <- readxl_example("datasets.xlsx")
df <- read_excel(xlsx_example)
head(df)
ABCDEFGHIJ0123456789
Sepal.Length
<dbl>
Sepal.Width
<dbl>
Petal.Length
<dbl>
Petal.Width
<dbl>
Species
<chr>
5.13.51.40.2setosa
4.93.01.40.2setosa
4.73.21.30.2setosa
4.63.11.50.2setosa
5.03.61.40.2setosa
5.43.91.70.4setosa
excel_sheets(xlsx_example)
## [1] "iris"     "mtcars"   "chickwts" "quakes"
df <- read_excel(xlsx_example, sheet = "mtcars")
head(df)
ABCDEFGHIJ0123456789
mpg
<dbl>
cyl
<dbl>
disp
<dbl>
hp
<dbl>
drat
<dbl>
wt
<dbl>
qsec
<dbl>
vs
<dbl>
am
<dbl>
gear
<dbl>
21.061601103.902.62016.46014
21.061601103.902.87517.02014
22.84108933.852.32018.61114
21.462581103.083.21519.44103
18.783601753.153.44017.02003
18.162251052.763.46020.22103
rm(df)

4. dplyr

homo <- read_csv("http://goo.gl/Zjr9aF")
homo
ABCDEFGHIJ0123456789
speaker
<chr>
s.duration.ms
<dbl>
vowel.duration.ms
<dbl>
average.f0.Hz
<dbl>
f0.range.Hz
<dbl>
perceived.as.homo
<int>
perceived.as.hetero
<int>
A61.40112.60119.5152.5718
B63.90126.49100.29114.0205
C55.08126.81114.90103.2916
D78.11119.17126.6158.81510
E64.7193.68130.7637.41015
F67.00127.87150.7942.0178
G65.39147.52128.96118.2205
H62.46120.13105.2655.7214
I60.45140.44109.8696.4205
J59.59121.01123.90111.7817

Most of the examples in this presentation are based on Hau 2007. The experiment consisted of a perception and judgment test aimed at measuring the correlation between acoustic cues and perceived sexual orientation. Naïve Cantonese speakers were asked to listen to the Cantonese speech samples collected in the experiment and judge whether the speakers were gay or heterosexual. There are 14 speakers and the following parameters:

  • [s] duration (s.duration.ms)
  • vowel duration (vowel.duration.ms)
  • mean fundamental frequency (F0) (average.f0.Hz)
  • fundamental frequency range (f0.range.Hz)
  • percentage of homosexual impression (perceived.as.homo)
  • percentage of heterosexual impression (perceived.as.hetero)
  • speaker's orientation (orientation)
  • speaker's age (age)

4.1 dplyr::filter()

Which speakers are older than 28 and have an [s] duration shorter than 60 ms?

homo %>%
  filter(age > 28, s.duration.ms < 60)
ABCDEFGHIJ0123456789
speaker
<chr>
s.duration.ms
<dbl>
vowel.duration.ms
<dbl>
average.f0.Hz
<dbl>
f0.range.Hz
<dbl>
perceived.as.homo
<int>
perceived.as.hetero
<int>
C55.08126.81114.90103.2916
J59.59121.01123.90111.7817
N57.67118.02121.4837.4421

%>% — конвейер (pipe) отправляет результат работы одной функции в другую.

sort(sqrt(abs(sin(1:22))), decreasing = TRUE)
##  [1] 0.9999951 0.9952926 0.9946649 0.9805088 0.9792468 0.9554817 0.9535709
##  [8] 0.9173173 0.9146888 0.8699440 0.8665952 0.8105471 0.8064043 0.7375779
## [15] 0.7325114 0.6482029 0.6419646 0.5365662 0.5285977 0.3871398 0.3756594
## [22] 0.0940814
1:22 %>% 
  sin() %>% 
  abs() %>% 
  sqrt() %>% 
  sort(., decreasing = TRUE) # зачем здесь точка?
##  [1] 0.9999951 0.9952926 0.9946649 0.9805088 0.9792468 0.9554817 0.9535709
##  [8] 0.9173173 0.9146888 0.8699440 0.8665952 0.8105471 0.8064043 0.7375779
## [15] 0.7325114 0.6482029 0.6419646 0.5365662 0.5285977 0.3871398 0.3756594
## [22] 0.0940814

Конвейеры в tidyverse пришли из пакета magrittr. Иногда они работают некорректно с функциями не из tidyverse.

4.2 dplyr::slice()

homo %>%
  slice(3:7)
ABCDEFGHIJ0123456789
speaker
<chr>
s.duration.ms
<dbl>
vowel.duration.ms
<dbl>
average.f0.Hz
<dbl>
f0.range.Hz
<dbl>
perceived.as.homo
<int>
perceived.as.hetero
<int>
C55.08126.81114.90103.2916
D78.11119.17126.6158.81510
E64.7193.68130.7637.41015
F67.00127.87150.7942.0178
G65.39147.52128.96118.2205

4.3 dplyr::select()

homo %>%
  select(8:10)
ABCDEFGHIJ0123456789
perceived.as.homo.percent
<dbl>
orientation
<chr>
age
<int>
0.28hetero30
0.80hetero19
0.36homo29
0.60homo36
0.40homo27
0.68homo33
0.80hetero28
0.84hetero22
0.80homo22
0.32homo40
homo %>%
  select(speaker:average.f0.Hz)
ABCDEFGHIJ0123456789
speaker
<chr>
s.duration.ms
<dbl>
vowel.duration.ms
<dbl>
average.f0.Hz
<dbl>
A61.40112.60119.51
B63.90126.49100.29
C55.08126.81114.90
D78.11119.17126.61
E64.7193.68130.76
F67.00127.87150.79
G65.39147.52128.96
H62.46120.13105.26
I60.45140.44109.86
J59.59121.01123.90
homo %>%
  select(-speaker)
ABCDEFGHIJ0123456789
s.duration.ms
<dbl>
vowel.duration.ms
<dbl>
average.f0.Hz
<dbl>
f0.range.Hz
<dbl>
perceived.as.homo
<int>
perceived.as.hetero
<int>
61.40112.60119.5152.5718
63.90126.49100.29114.0205
55.08126.81114.90103.2916
78.11119.17126.6158.81510
64.7193.68130.7637.41015
67.00127.87150.7942.0178
65.39147.52128.96118.2205
62.46120.13105.2655.7214
60.45140.44109.8696.4205
59.59121.01123.90111.7817
homo %>%
  select(-c(speaker, perceived.as.hetero, perceived.as.homo, perceived.as.homo.percent))
ABCDEFGHIJ0123456789
s.duration.ms
<dbl>
vowel.duration.ms
<dbl>
average.f0.Hz
<dbl>
f0.range.Hz
<dbl>
orientation
<chr>
age
<int>
61.40112.60119.5152.5hetero30
63.90126.49100.29114.0hetero19
55.08126.81114.90103.2homo29
78.11119.17126.6158.8homo36
64.7193.68130.7637.4homo27
67.00127.87150.7942.0homo33
65.39147.52128.96118.2hetero28
62.46120.13105.2655.7hetero22
60.45140.44109.8696.4homo22
59.59121.01123.90111.7homo40
homo %>%
  select(speaker, age, s.duration.ms)
ABCDEFGHIJ0123456789
speaker
<chr>
age
<int>
s.duration.ms
<dbl>
A3061.40
B1963.90
C2955.08
D3678.11
E2764.71
F3367.00
G2865.39
H2262.46
I2260.45
J4059.59

4.4 dplyr::arrange()

homo %>%
  arrange(orientation, desc(age))
ABCDEFGHIJ0123456789
speaker
<chr>
s.duration.ms
<dbl>
vowel.duration.ms
<dbl>
average.f0.Hz
<dbl>
f0.range.Hz
<dbl>
perceived.as.homo
<int>
perceived.as.hetero
<int>
A61.40112.60119.5152.5718
N57.67118.02121.4837.4421
G65.39147.52128.96118.2205
L53.31112.05146.2057.8817
H62.46120.13105.2655.7214
M45.13133.74155.34100.5916
B63.90126.49100.29114.0205
J59.59121.01123.90111.7817
D78.11119.17126.6158.81510
F67.00127.87150.7942.0178

4.5 dplyr::distinct()

homo %>%
  distinct(orientation)
ABCDEFGHIJ0123456789
orientation
<chr>
hetero
homo
homo %>%
  distinct(orientation, age > 20)
ABCDEFGHIJ0123456789
orientation
<chr>
age > 20
<lgl>
heteroTRUE
heteroFALSE
homoTRUE

4.6 dplyr::mutate()

homo %>%
  mutate(f0.mn = average.f0.Hz - f0.range.Hz/2,
         f0.mx = (average.f0.Hz + f0.range.Hz/2))
ABCDEFGHIJ0123456789
speaker
<chr>
s.duration.ms
<dbl>
vowel.duration.ms
<dbl>
average.f0.Hz
<dbl>
f0.range.Hz
<dbl>
perceived.as.homo
<int>
perceived.as.hetero
<int>
A61.40112.60119.5152.5718
B63.90126.49100.29114.0205
C55.08126.81114.90103.2916
D78.11119.17126.6158.81510
E64.7193.68130.7637.41015
F67.00127.87150.7942.0178
G65.39147.52128.96118.2205
H62.46120.13105.2655.7214
I60.45140.44109.8696.4205
J59.59121.01123.90111.7817

4.7 dplyr::group_by(...) %>% summarise(...)

homo %>%
  summarise(min(age), mean(s.duration.ms))
ABCDEFGHIJ0123456789
min(age)
<dbl>
mean(s.duration.ms)
<dbl>
1961.22429
homo %>%
  group_by(orientation) %>% 
  summarise(my_mean = mean(s.duration.ms))
ABCDEFGHIJ0123456789
orientation
<chr>
my_mean
<dbl>
hetero58.46571
homo63.98286
homo %>%
  group_by(orientation) %>% 
  summarise(mean(s.duration.ms))
ABCDEFGHIJ0123456789
orientation
<chr>
mean(s.duration.ms)
<dbl>
hetero58.46571
homo63.98286
homo %>%
  group_by(orientation) %>% 
  summarise(mean_by_orientation = mean(s.duration.ms))
ABCDEFGHIJ0123456789
orientation
<chr>
mean_by_orientation
<dbl>
hetero58.46571
homo63.98286

Если нужно посчитать количество вхождений, то можно использовать функцию n() в summarise() или же функцию count():

homo %>% 
  group_by(orientation, age > 20) %>% 
  summarise(my_mean = mean(s.duration.ms), n_observations = n())
ABCDEFGHIJ0123456789
orientation
<chr>
age > 20
<lgl>
my_mean
<dbl>
n_observations
<int>
heteroFALSE54.515002
heteroTRUE60.046005
homoTRUE63.982867
homo %>% 
  count(orientation, age > 20)
ABCDEFGHIJ0123456789
orientation
<chr>
age > 20
<lgl>
n
<int>
heteroFALSE2
heteroTRUE5
homoTRUE7

4.8 dplyr::.._join()

# Toy lookup table: one row per language, with the country where it is
# spoken and its ISO code. The `countries` column is the key shared with
# `country_population` in the join examples.
# tibble() is used instead of its deprecated alias data_frame().
languages <- tibble(
  languages = c("Selkup", "French", "Chukchi", "Kashubian"),
  countries = c("Russia", "France", "Russia", "Poland"),
  iso = c("sel", "fra", "ckt", "pol")
)
languages
ABCDEFGHIJ0123456789
languages
<chr>
countries
<chr>
iso
<chr>
SelkupRussiasel
FrenchFrancefra
ChukchiRussiackt
KashubianPolandpol
# Toy table of country populations (in millions). It deliberately covers
# a different set of countries than `languages` (Finland has no language
# row, France has no population row) so that the join variants differ.
# tibble() is used instead of its deprecated alias data_frame().
country_population <- tibble(
  countries = c("Russia", "Poland", "Finland"),
  population_mln = c(143, 38, 5)
)
country_population
ABCDEFGHIJ0123456789
countries
<chr>
population_mln
<dbl>
Russia143
Poland38
Finland5
inner_join(languages, country_population)
ABCDEFGHIJ0123456789
languages
<chr>
countries
<chr>
iso
<chr>
population_mln
<dbl>
SelkupRussiasel143
ChukchiRussiackt143
KashubianPolandpol38
left_join(languages, country_population)
ABCDEFGHIJ0123456789
languages
<chr>
countries
<chr>
iso
<chr>
population_mln
<dbl>
SelkupRussiasel143
FrenchFrancefraNA
ChukchiRussiackt143
KashubianPolandpol38
right_join(languages, country_population)
ABCDEFGHIJ0123456789
languages
<chr>
countries
<chr>
iso
<chr>
population_mln
<dbl>
SelkupRussiasel143
ChukchiRussiackt143
KashubianPolandpol38
NAFinlandNA5
anti_join(languages, country_population)
ABCDEFGHIJ0123456789
languages
<chr>
countries
<chr>
iso
<chr>
FrenchFrancefra
anti_join(country_population, languages)
ABCDEFGHIJ0123456789
countries
<chr>
population_mln
<dbl>
Finland5
full_join(country_population, languages)
ABCDEFGHIJ0123456789
countries
<chr>
population_mln
<dbl>
languages
<chr>
iso
<chr>
Russia143Selkupsel
Russia143Chukchickt
Poland38Kashubianpol
Finland5NANA
FranceNAFrenchfra

Существует достаточно забавный трюк, который позволяет использовать .._join() вместе с group_by() и summarise():

homo %>% 
  group_by(orientation, age > 20) %>% 
  summarise(my_mean = mean(s.duration.ms), n_observations = n())
ABCDEFGHIJ0123456789
orientation
<chr>
age > 20
<lgl>
my_mean
<dbl>
n_observations
<int>
heteroFALSE54.515002
heteroTRUE60.046005
homoTRUE63.982867
# Attach the group-level statistics back to the speaker-level rows.
#
# The `age > 20` flag is first stored as a real column on both sides of
# the join. Without it, `homo` has no `age > 20` column, so the join would
# match on `orientation` alone and pair every speaker with every age-group
# summary of their orientation (e.g. a 30-year-old speaker with the
# `age > 20 == FALSE` statistics).
homo %>%
  mutate(`age > 20` = age > 20) %>%
  group_by(orientation, `age > 20`) %>%
  summarise(my_mean = mean(s.duration.ms), n_observations = n()) %>%
  # summarise() drops only the last grouping level; remove the rest too.
  ungroup() %>%
  left_join(mutate(homo, `age > 20` = age > 20),
            by = c("orientation", "age > 20"))
ABCDEFGHIJ0123456789
orientation
<chr>
age > 20
<lgl>
my_mean
<dbl>
n_observations
<int>
speaker
<chr>
s.duration.ms
<dbl>
vowel.duration.ms
<dbl>
average.f0.Hz
<dbl>
f0.range.Hz
<dbl>
heteroFALSE54.515002A61.40112.60119.5152.5
heteroFALSE54.515002B63.90126.49100.29114.0
heteroFALSE54.515002G65.39147.52128.96118.2
heteroFALSE54.515002H62.46120.13105.2655.7
heteroFALSE54.515002L53.31112.05146.2057.8
heteroFALSE54.515002M45.13133.74155.34100.5
heteroFALSE54.515002N57.67118.02121.4837.4
heteroTRUE60.046005A61.40112.60119.5152.5
heteroTRUE60.046005B63.90126.49100.29114.0
heteroTRUE60.046005G65.39147.52128.96118.2

5. tidyr package

  • Short format
df.short <- data.frame(
                   consonant = c("stops", "fricatives", "affricates", "nasals"),
                   initial = c(123, 87, 73, 7),
                   intervocal = c(57, 77, 82, 78),
                   final = c(30, 69, 12, 104))
df.short
ABCDEFGHIJ0123456789
consonant
<fctr>
initial
<dbl>
intervocal
<dbl>
final
<dbl>
stops1235730
fricatives877769
affricates738212
nasals778104
  • Long format
ABCDEFGHIJ0123456789
consonant
<fctr>
position
<chr>
number
<dbl>
stopsinitial123
fricativesinitial87
affricatesinitial73
nasalsinitial7
stopsintervocal57
fricativesintervocal77
affricatesintervocal82
nasalsintervocal78
stopsfinal30
fricativesfinal69
  • Short format → Long format: tidyr::gather()
df.short <- data.frame(
                   consonant = c("stops", "fricatives", "affricates", "nasals"),
                   initial = c(123, 87, 73, 7),
                   intervocal = c(57, 77, 82, 78),
                   final = c(30, 69, 12, 104))
df.short
ABCDEFGHIJ0123456789
consonant
<fctr>
initial
<dbl>
intervocal
<dbl>
final
<dbl>
stops1235730
fricatives877769
affricates738212
nasals778104
# Wide -> long: stack the columns initial..final into key/value pairs,
# with the former column name in `position` and the cell value in `number`.
df.long <- gather(df.short, key = position, value = number, initial:final)
df.long
ABCDEFGHIJ0123456789
consonant
<fctr>
position
<chr>
number
<dbl>
stopsinitial123
fricativesinitial87
affricatesinitial73
nasalsinitial7
stopsintervocal57
fricativesintervocal77
affricatesintervocal82
nasalsintervocal78
stopsfinal30
fricativesfinal69
  • Long format → Short format: tidyr::spread()
# Long -> wide: turn each distinct value of `position` into its own column
# filled with the matching `number`.
df.short <- spread(df.long, key = position, value = number)
df.short
ABCDEFGHIJ0123456789
consonant
<fctr>
final
<dbl>
initial
<dbl>
intervocal
<dbl>
affricates127382
fricatives698777
nasals104778
stops3012357

6.1 Anscombe’s quartet

The following datasets were presented in Anscombe, F. J. (1973), “Graphs in Statistical Analysis”:

quartet <- read.csv("https://goo.gl/KuuzYy")
quartet
ABCDEFGHIJ0123456789
x
<int>
y
<dbl>
dataset
<int>
108.041
86.951
137.581
98.811
118.331
149.961
67.241
44.261
1210.841
74.821
quartet %>% 
  group_by(dataset) %>% 
  summarise(mean_X = mean(x),
            mean_Y = mean(y),
            sd_X = sd(x),
            sd_Y = sd(y),
            cor = cor(x, y),
            n_obs = n()) %>% 
  select(-dataset) %>% 
  round(., 2)
ABCDEFGHIJ0123456789
mean_X
<dbl>
mean_Y
<dbl>
sd_X
<dbl>
sd_Y
<dbl>
cor
<dbl>
n_obs
<dbl>
97.53.322.030.8211
97.53.322.030.8211
97.53.322.030.8211
97.53.322.030.8211

5.2 Datasaurus

The following datasets were presented in Matejka and Fitzmaurice (2017), “Same Stats, Different Graphs”:

datasaurus <- read_tsv("https://goo.gl/gtaunr")
head(datasaurus)
ABCDEFGHIJ0123456789
dataset
<chr>
x
<dbl>
y
<dbl>
dino55.384697.1795
dino51.538596.0256
dino46.153894.4872
dino42.820591.4103
dino40.769288.3333
dino38.717984.8718

datasaurus %>% 
  group_by(dataset) %>% 
  summarise(mean_X = mean(x),
            mean_Y = mean(y),
            sd_X = sd(x),
            sd_Y = sd(y),
            cor = cor(x, y),
            n_obs = n()) %>% 
  select(-dataset) %>% 
  round(., 1)
ABCDEFGHIJ0123456789
mean_X
<dbl>
mean_Y
<dbl>
sd_X
<dbl>
sd_Y
<dbl>
cor
<dbl>
n_obs
<dbl>
54.347.816.826.9-0.1142
54.347.816.826.9-0.1142
54.347.816.826.9-0.1142
54.347.816.826.9-0.1142
54.347.816.826.9-0.1142
54.347.816.826.9-0.1142
54.347.816.826.9-0.1142
54.347.816.826.9-0.1142
54.347.816.826.9-0.1142
54.347.816.826.9-0.1142

6.1 Scatterplot

  • ggplot2
ggplot(data = homo, aes(s.duration.ms, vowel.duration.ms)) +
  geom_point()

  • dplyr, ggplot2
homo %>%
  ggplot(aes(average.f0.Hz, age))+
  geom_smooth(method = "lm")+
  geom_point(aes(color = orientation))

6.1.1 Scatterplot: color

homo %>%
  ggplot(aes(s.duration.ms, vowel.duration.ms,
             color = orientation)) +
  geom_point()

6.1.2 Scatterplot: shape

homo %>%
  ggplot(aes(s.duration.ms, vowel.duration.ms,
             shape = orientation)) +
  geom_point(color = "green")

6.1.3 Scatterplot: size

homo %>%
  ggplot(aes(s.duration.ms, vowel.duration.ms,
             size = age)) +
  geom_point()

6.1.4 Scatterplot: text

homo %>%
  mutate(label = ifelse(orientation == "homo","⚣", "⚤")) %>% 
  ggplot(aes(s.duration.ms, vowel.duration.ms, label = label, fill = orientation)) +
  geom_label()

homo %>%
  mutate(label = ifelse(orientation == "homo","⚣", "⚤")) %>% 
  ggplot(aes(s.duration.ms, vowel.duration.ms, label = label, color = orientation)) +
  geom_text()

6.1.5 Scatterplot: title

homo %>%
  ggplot(aes(s.duration.ms, vowel.duration.ms)) +
  geom_point()+
  labs(title = "length of [s] vs. length of vowels",
       subtitle = "based on 14 speakers of Cantonese",
       caption = "data from [Hau 2007]")

6.1.6 Scatterplot: axis

homo %>%
  ggplot(aes(s.duration.ms, vowel.duration.ms)) +
  geom_point()+
  xlab("duration of [s] in ms")+
  ylab("vowel duration in ms")

6.1.7 Log scales

Let's use the frequency dictionary of Russian:

freq <- read.csv("https://goo.gl/TlX7xW", sep = "\t")
freq %>%
  arrange(desc(Freq.ipm.)) %>% 
  slice(1:200) %>% 
  ggplot(aes(Rank, Freq.ipm.)) +
  geom_point() +
  xlab("") +
  ylab("ipm")

# Plot every word of the dictionary with a log-scaled frequency axis.
# The x position is the row index, derived from the data itself with
# seq_along() instead of a hard-coded 1:52138, so the plot keeps working
# if the number of rows in `freq` changes.
freq %>%
  ggplot(aes(seq_along(Freq.ipm.), Freq.ipm.)) +
  geom_point() +
  xlab("") +
  ylab("ipm") +
  scale_y_log10()

6.1.8 Scatterplot: rug

homo %>%
  ggplot(aes(s.duration.ms, vowel.duration.ms, color = orientation)) +
  geom_point() +
  geom_rug()

6.1.9 Scatterplot: lines

homo %>%
  ggplot(aes(s.duration.ms, vowel.duration.ms)) +
  geom_point() +
  geom_hline(yintercept = mean(homo$vowel.duration.ms))+
  geom_vline(xintercept = 60)

homo %>%
  ggplot(aes(s.duration.ms, vowel.duration.ms)) +
  geom_point() +
  geom_hline(yintercept = 120, linetype = 2)+
  geom_vline(xintercept = 60, size = 5)

homo %>%
  ggplot(aes(s.duration.ms, vowel.duration.ms)) +
  geom_point() +
  geom_hline(yintercept = 120, linetype = 4)+
  geom_vline(xintercept = 60, color = "blue")

6.1.10 Scatterplot: annotate

Функция annotate добавляет geom к графику.

homo %>%
  ggplot(aes(s.duration.ms, vowel.duration.ms)) +
  geom_point()+
  annotate(geom = "rect", xmin = 77, xmax = 79,
           ymin = 117, ymax = 122, fill = "red", alpha = 0.2) + 
  annotate(geom = "text", x = 78, y = 125,
           label = "Who is that?\n Outlier?")

6.2.1 Barplots: basics

There are two possible situations:

  • non-aggregated data
head(homo[, c(1, 9)])
ABCDEFGHIJ0123456789
speaker
<chr>
orientation
<chr>
Ahetero
Bhetero
Chomo
Dhomo
Ehomo
Fhomo
  • aggregated data
head(homo[, c(1, 10)])
ABCDEFGHIJ0123456789
speaker
<chr>
age
<int>
A30
B19
C29
D36
E27
F33
homo %>%
  ggplot(aes(orientation)) +
  geom_bar()

homo %>%
  ggplot(aes(speaker, age)) +
  geom_col()

homo %>%
  ggplot(aes(speaker, age, fill = orientation)) +
  geom_col()

6.3.1 Boxplots: basics, points, jitter

homo %>%
  ggplot(aes(orientation, s.duration.ms)) +
  geom_boxplot()

homo %>%
  ggplot(aes(orientation, s.duration.ms)) +
  geom_boxplot()+
  geom_point()

homo %>%
  ggplot(aes(orientation, s.duration.ms)) +
  geom_boxplot() +
  geom_jitter(width = 0.5)

homo %>%
  ggplot(aes(orientation, s.duration.ms)) +
  geom_violin() +
  geom_jitter()

6. Preliminary summary: two variables

  • scatterplot: two quantitative variables
  • barplot: nominal variable and one number
  • boxplot: nominal variable and quantitative variable
  • jittered points or sized points: two nominal variables
mtcars %>% 
  mutate(newvar = mpg > 22,
         newvr = mpg < 17) %>% 
  ggplot(aes(newvr, newvar, color = newvar))+
  geom_jitter(width = 0.2)

# Count the cars in each combination of the two logical flags and draw
# every combination as a point whose size and label show the count.
mtcars %>%
  mutate(newvar = mpg > 22,
         newvr = mpg < 17) %>%
  group_by(newvar, newvr) %>%
  summarise(number = n()) %>%
  ggplot(aes(newvr, newvar, label = number)) +
  geom_point(aes(size = number, color = newvar)) +
  geom_text() +
  scale_size(range = c(10, 30)) +
  # Hide the size legend (the counts are already printed as labels).
  # "none" is used rather than the reassignable shorthand `F`.
  guides(size = "none")

6.4.1 Histogram: basics

homo %>%
  ggplot(aes(s.duration.ms)) +
  geom_histogram()

How many histogram bins do we need?

  • [Sturges 1926] nclass.Sturges(homo$s.duration.ms)
  • [Scott 1979] nclass.scott(homo$s.duration.ms)
  • [Freedman, Diaconis 1981] nclass.FD(homo$s.duration.ms)
homo %>%
  ggplot(aes(s.duration.ms)) +
  geom_histogram(bins = nclass.FD(homo$s.duration.ms))

homo %>%
  ggplot(aes(s.duration.ms)) +
  geom_histogram(fill = "lightblue")

6.5.1 Density plot

homo %>%
  ggplot(aes(s.duration.ms)) +
  geom_density()

homo %>%
  ggplot(aes(s.duration.ms)) +
  geom_density(color = "blue")

homo %>%
  ggplot(aes(s.duration.ms)) +
  geom_density(fill = "lightblue")

homo %>%
  ggplot(aes(s.duration.ms, fill = orientation)) +
  geom_density()

homo %>%
  ggplot(aes(s.duration.ms, fill = orientation)) +
  geom_density(alpha = 0.2)

library(ggridges)
homo %>%
  ggplot(aes(s.duration.ms, orientation, fill = orientation)) +
  geom_density_ridges()

6.7 Facets

Фасетизация — наиболее сильное оружие ggplot2, позволяющее разбить данные по одной или нескольким переменным и нанести на график получившиеся подгруппы.

6.7.1 ggplot2::facet_wrap()

homo %>% 
  ggplot(aes(speaker, s.duration.ms))+
  geom_point() + 
  facet_wrap(~orientation)

homo %>% 
  ggplot(aes(speaker, s.duration.ms))+
  geom_point() + 
  facet_wrap(~orientation, scales = "free")

homo %>% 
  ggplot(aes(speaker, s.duration.ms))+
  geom_point() + 
  facet_wrap(~orientation, scales = "free_x")

6.7.2 ggplot2::facet_grid()

homo %>% 
  mutate(older_then_28 = ifelse(age > 28, "older", "younger")) %>% 
  ggplot(aes(speaker, s.duration.ms))+
  geom_point() + 
  facet_wrap(older_then_28~orientation, scales = "free_x")

homo %>% 
  mutate(older_then_28 = ifelse(age > 28, "older", "younger")) %>% 
  ggplot(aes(speaker, s.duration.ms))+
  geom_point() + 
  facet_grid(older_then_28~orientation, scales = "free_x")

Существует еще славный аргумент margins.

homo %>% 
  mutate(older_then_28 = ifelse(age > 28, "older", "younger")) %>% 
  ggplot(aes(speaker, s.duration.ms))+
  geom_point() + 
  facet_grid(older_then_28~orientation, scales = "free_x", margins = TRUE)

Иногда очень хорошо показывать все данные на каждом фасете:

# Show all observations in grey on every facet, with the facet's own
# observations drawn on top in the default colour.
homo %>%
  ggplot(aes(speaker, s.duration.ms)) +
  # Background layer: the same data without the faceting variable, so the
  # points are not split by facet and appear on every panel. The column is
  # dropped by name rather than by position (homo[, -9]) so the code does
  # not depend on the column order of `homo`.
  geom_point(data = select(homo, -orientation),
             aes(speaker, s.duration.ms), color = "grey") +
  geom_point() +
  facet_wrap(~orientation) +
  theme_bw()




© Г. Мороз 2018 с помощью RMarkdown. Исходный код на GitHub