Data manipulation and visualization

tidyverse: dplyr, tidyr, ggplot2

02.02.2017

1.1 Data Types

c(42, 99, 43)
## [1] 42 99 43
matrix(1:12, nrow=3,ncol=4)
##      [,1] [,2] [,3] [,4]
## [1,]    1    4    7   10
## [2,]    2    5    8   11
## [3,]    3    6    9   12
list(
  n_lectures = 12,
  l_topics = c("begining", "data manipulation", "descriptive stats"),
  h_topics = c("data manipulations", "descriptive stats")
  )
## $n_lectures
## [1] 12
## 
## $l_topics
## [1] "begining"          "data manipulation" "descriptive stats"
## 
## $h_topics
## [1] "data manipulations" "descriptive stats"
data.frame(
  names = c("Olya", "Ilya", "Sasha", "George"),
  lecturer = c(TRUE, TRUE, FALSE, TRUE),
  lecturer_experience = c(19, 6, 0, 3)
  )
names lecturer lecturer_experience
Olya TRUE 19
Ilya TRUE 6
Sasha FALSE 0
George TRUE 3

See Data Type Conversion page

1.2 Data Frame exploration

R ships with several built-in data frames (e.g. mtcars, cars, iris). How many rows and columns do they have?

nrow(iris) # returns the number of rows
## [1] 150
ncol(mtcars) #  returns the number of columns
## [1] 11
head(cars) # returns the first 6 rows
speed dist
4 2
4 10
7 4
7 22
8 16
9 10
head(cars, 4) # returns the first 4 rows
speed dist
4 2
4 10
7 4
7 22
tail(cars) # returns the last 6 rows
speed dist
45 23 54
46 24 70
47 24 92
48 24 93
49 24 120
50 25 85
summary(cars)  # produce some stats
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00
str(cars)  # shows the structure: variables, their type
## 'data.frame':    50 obs. of  2 variables:
##  $ speed: num  4 4 7 7 8 9 10 10 10 11 ...
##  $ dist : num  2 10 4 22 16 10 18 26 34 17 ...
#View(cars)

1.3 Data Frame Indexing

mtcars$mpg # extract the mpg column as a vector
##  [1] 21.0 21.0 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 17.8 16.4 17.3 15.2
## [15] 10.4 10.4 14.7 32.4 30.4 33.9 21.5 15.5 15.2 13.3 19.2 27.3 26.0 30.4
## [29] 15.8 19.7 15.0 21.4
mtcars[3,7] # single value at row 3, column 7
## [1] 18.61
mtcars[3,] # row 3 with all of its columns
mpg cyl disp hp drat wt qsec vs am gear carb
Datsun 710 22.8 4 108 93 3.85 2.32 18.61 1 1 4 1
mtcars[,7] # column 7 as a vector
##  [1] 16.46 17.02 18.61 19.44 17.02 20.22 15.84 20.00 22.90 18.30 18.90
## [12] 17.40 17.60 18.00 17.98 17.82 17.42 19.47 18.52 19.90 20.01 16.87
## [23] 17.30 15.41 17.05 18.90 16.70 16.90 14.50 15.50 14.60 18.60
mtcars[mtcars$mpg < 20, ] # all rows whose mpg value is lower than 20
mpg cyl disp hp drat wt qsec vs am gear carb
Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
# ?mtcars

2. Tidyverse

The tidyverse is a set of packages:

Install the tidyverse package using install.packages("tidyverse")

Load tidyverse package

library(tidyverse)

In this presentation the following versions of tidyverse, dplyr and ggplot2 are used:

packageVersion("tidyverse")
## [1] '1.1.1'
packageVersion("dplyr")
## [1] '0.7.2'
packageVersion("ggplot2")
## [1] '2.2.1'

3.1 Join dataframe by row or column

# Stack two row subsets of mtcars on top of each other.
# The ranges 5:20 and 17:29 overlap in rows 17-20, so those rows occur twice
# in the combined data frame.
my_data_1 <- mtcars[5:20, ]   # rows 5 to 20
my_data_2 <- mtcars[17:29, ]  # rows 17 to 29
# Call the rbind() generic; for data frames it dispatches to the
# data frame method, so there is no need to name the method explicitly.
combine_rows <- rbind(my_data_1, my_data_2)
nrow(my_data_1)
## [1] 16
nrow(my_data_2)
## [1] 13
nrow(combine_rows)
## [1] 29
# Glue two column subsets of mtcars side by side.
# The ranges 3:7 and 6:11 overlap in columns 6 and 7 (wt and qsec), so the
# combined data frame contains those two column names twice.
my_data_3 <- mtcars[, 3:7]   # columns 3 to 7
my_data_4 <- mtcars[, 6:11]  # columns 6 to 11
combine_cols <- cbind(my_data_3, my_data_4)
ncol(my_data_3)
## [1] 5
ncol(my_data_4)
## [1] 6
ncol(combine_cols)
## [1] 11

3.2 Joins (dplyr)

languages <- data.frame(
  languages = c("Selkup", "French", "Chukchi", "Kashubian"),
  countries = c("Russia", "France", "Russia", "Poland"),
  iso = c("sel", "fra", "ckt", "pol")
  )
languages
languages countries iso
Selkup Russia sel
French France fra
Chukchi Russia ckt
Kashubian Poland pol
country_population <- data.frame(
  countries = c("Russia", "Poland", "Finland"),
  population_mln = c(143, 38, 5))
country_population
countries population_mln
Russia 143
Poland 38
Finland 5
inner_join(languages, country_population)
## Joining, by = "countries"
## Warning: Column `countries` joining factors with different levels, coercing
## to character vector
languages countries iso population_mln
Selkup Russia sel 143
Chukchi Russia ckt 143
Kashubian Poland pol 38
left_join(languages, country_population)
## Joining, by = "countries"
## Warning: Column `countries` joining factors with different levels, coercing
## to character vector
languages countries iso population_mln
Selkup Russia sel 143
French France fra NA
Chukchi Russia ckt 143
Kashubian Poland pol 38
right_join(languages, country_population)
## Joining, by = "countries"
## Warning: Column `countries` joining factors with different levels, coercing
## to character vector
languages countries iso population_mln
Selkup Russia sel 143
Chukchi Russia ckt 143
Kashubian Poland pol 38
NA Finland NA 5
anti_join(languages, country_population)
## Joining, by = "countries"
## Warning: Column `countries` joining factors with different levels, coercing
## to character vector
languages countries iso
French France fra
anti_join(country_population, languages)
## Joining, by = "countries"
## Warning: Column `countries` joining factors with different levels, coercing
## to character vector
countries population_mln
Finland 5
full_join(country_population, languages)
## Joining, by = "countries"
## Warning: Column `countries` joining factors with different levels, coercing
## to character vector
countries population_mln languages iso
Russia 143 Selkup sel
Russia 143 Chukchi ckt
Poland 38 Kashubian pol
Finland 5 NA NA
France NA French fra

3.3 Data

The majority of examples in this presentation are based on Chi-kuk (2007). The experiment consisted of a perception and judgment test aimed at measuring the correlation between acoustic cues and perceived sexual orientation. Naïve Cantonese speakers were asked to listen to the Cantonese speech samples collected in the experiment and judge whether the speakers were gay or heterosexual. There are 14 speakers and the following parameters:

Download data

homo <- read.csv("http://goo.gl/Zjr9aF")
homo
speaker s.duration.ms vowel.duration.ms average.f0.Hz f0.range.Hz perceived.as.homo perceived.as.hetero perceived.as.homo.percent orientation age
A 61.40 112.60 119.51 52.5 7 18 0.28 hetero 30
B 63.90 126.49 100.29 114.0 20 5 0.80 hetero 19
C 55.08 126.81 114.90 103.2 9 16 0.36 homo 29
D 78.11 119.17 126.61 58.8 15 10 0.60 homo 36
E 64.71 93.68 130.76 37.4 10 15 0.40 homo 27
F 67.00 127.87 150.79 42.0 17 8 0.68 homo 33
G 65.39 147.52 128.96 118.2 20 5 0.80 hetero 28
H 62.46 120.13 105.26 55.7 21 4 0.84 hetero 22
I 60.45 140.44 109.86 96.4 20 5 0.80 homo 22
J 59.59 121.01 123.90 111.7 8 17 0.32 homo 40
K 62.94 137.37 119.48 87.6 21 4 0.84 homo 30
L 53.31 112.05 146.20 57.8 8 17 0.32 hetero 25
M 45.13 133.74 155.34 100.5 9 16 0.36 hetero 20
N 57.67 118.02 121.48 37.4 4 21 0.16 hetero 29

3.4 Data Frame → Tibble (dplyr)

Tibble is a useful modification of Data Frame.

library(tidyverse)
# Convert the data frame into a tibble. as_tibble() is used instead of
# tbl_df(), which newer dplyr releases have deprecated.
homo <- as_tibble(homo)
homo
speaker s.duration.ms vowel.duration.ms average.f0.Hz f0.range.Hz perceived.as.homo perceived.as.hetero perceived.as.homo.percent orientation age
A 61.40 112.60 119.51 52.5 7 18 0.28 hetero 30
B 63.90 126.49 100.29 114.0 20 5 0.80 hetero 19
C 55.08 126.81 114.90 103.2 9 16 0.36 homo 29
D 78.11 119.17 126.61 58.8 15 10 0.60 homo 36
E 64.71 93.68 130.76 37.4 10 15 0.40 homo 27
F 67.00 127.87 150.79 42.0 17 8 0.68 homo 33
G 65.39 147.52 128.96 118.2 20 5 0.80 hetero 28
H 62.46 120.13 105.26 55.7 21 4 0.84 hetero 22
I 60.45 140.44 109.86 96.4 20 5 0.80 homo 22
J 59.59 121.01 123.90 111.7 8 17 0.32 homo 40
K 62.94 137.37 119.48 87.6 21 4 0.84 homo 30
L 53.31 112.05 146.20 57.8 8 17 0.32 hetero 25
M 45.13 133.74 155.34 100.5 9 16 0.36 hetero 20
N 57.67 118.02 121.48 37.4 4 21 0.16 hetero 29

3.5 Filter (dplyr)

How many speakers are older than 28?

homo[homo$age > 28,]
speaker s.duration.ms vowel.duration.ms average.f0.Hz f0.range.Hz perceived.as.homo perceived.as.hetero perceived.as.homo.percent orientation age
A 61.40 112.60 119.51 52.5 7 18 0.28 hetero 30
C 55.08 126.81 114.90 103.2 9 16 0.36 homo 29
D 78.11 119.17 126.61 58.8 15 10 0.60 homo 36
F 67.00 127.87 150.79 42.0 17 8 0.68 homo 33
J 59.59 121.01 123.90 111.7 8 17 0.32 homo 40
K 62.94 137.37 119.48 87.6 21 4 0.84 homo 30
N 57.67 118.02 121.48 37.4 4 21 0.16 hetero 29
homo %>%
  filter(age > 28, s.duration.ms < 60)
speaker s.duration.ms vowel.duration.ms average.f0.Hz f0.range.Hz perceived.as.homo perceived.as.hetero perceived.as.homo.percent orientation age
C 55.08 126.81 114.90 103.2 9 16 0.36 homo 29
J 59.59 121.01 123.90 111.7 8 17 0.32 homo 40
N 57.67 118.02 121.48 37.4 4 21 0.16 hetero 29

%>% is called the pipe. A pipe passes the result of one function on to the next function as its argument.

# Nested calls are evaluated inside-out: sin, then abs, then sqrt, then sort.
sort(sqrt(abs(sin(1:22))), decreasing = TRUE)
##  [1] 0.9999951 0.9952926 0.9946649 0.9805088 0.9792468 0.9554817 0.9535709
##  [8] 0.9173173 0.9146888 0.8699440 0.8665952 0.8105471 0.8064043 0.7375779
## [15] 0.7325114 0.6482029 0.6419646 0.5365662 0.5285977 0.3871398 0.3756594
## [22] 0.0940814
# The same computation written as a pipeline, read from top to bottom.
1:22 %>% 
  sin() %>% 
  abs() %>% 
  sqrt() %>% 
  sort(., decreasing = TRUE) # the dot marks where the piped value is inserted
##  [1] 0.9999951 0.9952926 0.9946649 0.9805088 0.9792468 0.9554817 0.9535709
##  [8] 0.9173173 0.9146888 0.8699440 0.8665952 0.8105471 0.8064043 0.7375779
## [15] 0.7325114 0.6482029 0.6419646 0.5365662 0.5285977 0.3871398 0.3756594
## [22] 0.0940814

Pipes in the tidyverse come from the magrittr package. Sometimes they work incorrectly with non-tidyverse functions.

3.6 Slice (dplyr)

homo[3:7, ]
speaker s.duration.ms vowel.duration.ms average.f0.Hz f0.range.Hz perceived.as.homo perceived.as.hetero perceived.as.homo.percent orientation age
C 55.08 126.81 114.90 103.2 9 16 0.36 homo 29
D 78.11 119.17 126.61 58.8 15 10 0.60 homo 36
E 64.71 93.68 130.76 37.4 10 15 0.40 homo 27
F 67.00 127.87 150.79 42.0 17 8 0.68 homo 33
G 65.39 147.52 128.96 118.2 20 5 0.80 hetero 28
homo %>%
  slice(3:7)
speaker s.duration.ms vowel.duration.ms average.f0.Hz f0.range.Hz perceived.as.homo perceived.as.hetero perceived.as.homo.percent orientation age
C 55.08 126.81 114.90 103.2 9 16 0.36 homo 29
D 78.11 119.17 126.61 58.8 15 10 0.60 homo 36
E 64.71 93.68 130.76 37.4 10 15 0.40 homo 27
F 67.00 127.87 150.79 42.0 17 8 0.68 homo 33
G 65.39 147.52 128.96 118.2 20 5 0.80 hetero 28

3.7 Select (dplyr)

homo[, 8:10]
perceived.as.homo.percent orientation age
0.28 hetero 30
0.80 hetero 19
0.36 homo 29
0.60 homo 36
0.40 homo 27
0.68 homo 33
0.80 hetero 28
0.84 hetero 22
0.80 homo 22
0.32 homo 40
0.84 homo 30
0.32 hetero 25
0.36 hetero 20
0.16 hetero 29
homo %>%
  select(8:10)
perceived.as.homo.percent orientation age
0.28 hetero 30
0.80 hetero 19
0.36 homo 29
0.60 homo 36
0.40 homo 27
0.68 homo 33
0.80 hetero 28
0.84 hetero 22
0.80 homo 22
0.32 homo 40
0.84 homo 30
0.32 hetero 25
0.36 hetero 20
0.16 hetero 29
homo %>%
  select(speaker:average.f0.Hz)
speaker s.duration.ms vowel.duration.ms average.f0.Hz
A 61.40 112.60 119.51
B 63.90 126.49 100.29
C 55.08 126.81 114.90
D 78.11 119.17 126.61
E 64.71 93.68 130.76
F 67.00 127.87 150.79
G 65.39 147.52 128.96
H 62.46 120.13 105.26
I 60.45 140.44 109.86
J 59.59 121.01 123.90
K 62.94 137.37 119.48
L 53.31 112.05 146.20
M 45.13 133.74 155.34
N 57.67 118.02 121.48

It is possible to use select() function to remove columns:

homo %>%
  select(-c(speaker, perceived.as.hetero, perceived.as.homo, perceived.as.homo.percent))
s.duration.ms vowel.duration.ms average.f0.Hz f0.range.Hz orientation age
61.40 112.60 119.51 52.5 hetero 30
63.90 126.49 100.29 114.0 hetero 19
55.08 126.81 114.90 103.2 homo 29
78.11 119.17 126.61 58.8 homo 36
64.71 93.68 130.76 37.4 homo 27
67.00 127.87 150.79 42.0 homo 33
65.39 147.52 128.96 118.2 hetero 28
62.46 120.13 105.26 55.7 hetero 22
60.45 140.44 109.86 96.4 homo 22
59.59 121.01 123.90 111.7 homo 40
62.94 137.37 119.48 87.6 homo 30
53.31 112.05 146.20 57.8 hetero 25
45.13 133.74 155.34 100.5 hetero 20
57.67 118.02 121.48 37.4 hetero 29
# When you want to remove one column you can write it without 
# c() function, e. g. -speaker

It is possible to reorder columns using select() function:

homo %>%
  select(speaker, age, s.duration.ms)
speaker age s.duration.ms
A 30 61.40
B 19 63.90
C 29 55.08
D 36 78.11
E 27 64.71
F 33 67.00
G 28 65.39
H 22 62.46
I 22 60.45
J 40 59.59
K 30 62.94
L 25 53.31
M 20 45.13
N 29 57.67

3.8 arrange (dplyr)

homo[order(homo$orientation, homo$age), ]
speaker s.duration.ms vowel.duration.ms average.f0.Hz f0.range.Hz perceived.as.homo perceived.as.hetero perceived.as.homo.percent orientation age
B 63.90 126.49 100.29 114.0 20 5 0.80 hetero 19
M 45.13 133.74 155.34 100.5 9 16 0.36 hetero 20
H 62.46 120.13 105.26 55.7 21 4 0.84 hetero 22
L 53.31 112.05 146.20 57.8 8 17 0.32 hetero 25
G 65.39 147.52 128.96 118.2 20 5 0.80 hetero 28
N 57.67 118.02 121.48 37.4 4 21 0.16 hetero 29
A 61.40 112.60 119.51 52.5 7 18 0.28 hetero 30
I 60.45 140.44 109.86 96.4 20 5 0.80 homo 22
E 64.71 93.68 130.76 37.4 10 15 0.40 homo 27
C 55.08 126.81 114.90 103.2 9 16 0.36 homo 29
K 62.94 137.37 119.48 87.6 21 4 0.84 homo 30
F 67.00 127.87 150.79 42.0 17 8 0.68 homo 33
D 78.11 119.17 126.61 58.8 15 10 0.60 homo 36
J 59.59 121.01 123.90 111.7 8 17 0.32 homo 40
homo %>%
  arrange(orientation, desc(age))
speaker s.duration.ms vowel.duration.ms average.f0.Hz f0.range.Hz perceived.as.homo perceived.as.hetero perceived.as.homo.percent orientation age
A 61.40 112.60 119.51 52.5 7 18 0.28 hetero 30
N 57.67 118.02 121.48 37.4 4 21 0.16 hetero 29
G 65.39 147.52 128.96 118.2 20 5 0.80 hetero 28
L 53.31 112.05 146.20 57.8 8 17 0.32 hetero 25
H 62.46 120.13 105.26 55.7 21 4 0.84 hetero 22
M 45.13 133.74 155.34 100.5 9 16 0.36 hetero 20
B 63.90 126.49 100.29 114.0 20 5 0.80 hetero 19
J 59.59 121.01 123.90 111.7 8 17 0.32 homo 40
D 78.11 119.17 126.61 58.8 15 10 0.60 homo 36
F 67.00 127.87 150.79 42.0 17 8 0.68 homo 33
K 62.94 137.37 119.48 87.6 21 4 0.84 homo 30
C 55.08 126.81 114.90 103.2 9 16 0.36 homo 29
E 64.71 93.68 130.76 37.4 10 15 0.40 homo 27
I 60.45 140.44 109.86 96.4 20 5 0.80 homo 22

3.9 distinct

unique(homo$orientation)
## [1] hetero homo  
## Levels: hetero homo
homo %>%
  distinct(orientation, age > 20)
orientation age > 20
hetero TRUE
hetero FALSE
homo TRUE
unique(homo[c("orientation", "perceived.as.homo")])
orientation perceived.as.homo
hetero 7
hetero 20
homo 9
homo 15
homo 10
homo 17
hetero 21
homo 20
homo 8
homo 21
hetero 8
hetero 9
hetero 4
homo %>%
  distinct(orientation, perceived.as.homo)
perceived.as.homo orientation
7 hetero
20 hetero
9 homo
15 homo
10 homo
17 homo
21 hetero
20 homo
8 homo
21 homo
8 hetero
9 hetero
4 hetero

3.10 mutate (dplyr)

# Base R: new columns holding the average f0 minus / plus half of the f0 range.
homo[["f0.min"]] <- homo[["average.f0.Hz"]] - homo[["f0.range.Hz"]] / 2
homo[["f0.min"]]
##  [1]  93.26  43.29  63.30  97.21 112.06 129.79  69.86  77.41  61.66  68.05
## [11]  75.68 117.30 105.09 102.78
homo[["f0.max"]] <- homo[["average.f0.Hz"]] + homo[["f0.range.Hz"]] / 2
homo[["f0.max"]]
##  [1] 145.76 157.29 166.50 156.01 149.46 171.79 188.06 133.11 158.06 179.75
## [11] 163.28 175.10 205.59 140.18
# dplyr: the same two quantities computed with mutate() and stored under the
# names f0.mn and f0.mx; the result overwrites homo.
homo <- mutate(
  homo,
  f0.mn = average.f0.Hz - f0.range.Hz / 2,
  f0.mx = average.f0.Hz + f0.range.Hz / 2
)
homo
speaker s.duration.ms vowel.duration.ms average.f0.Hz f0.range.Hz perceived.as.homo perceived.as.hetero perceived.as.homo.percent orientation age f0.min f0.max f0.mn f0.mx
A 61.40 112.60 119.51 52.5 7 18 0.28 hetero 30 93.26 145.76 93.26 145.76
B 63.90 126.49 100.29 114.0 20 5 0.80 hetero 19 43.29 157.29 43.29 157.29
C 55.08 126.81 114.90 103.2 9 16 0.36 homo 29 63.30 166.50 63.30 166.50
D 78.11 119.17 126.61 58.8 15 10 0.60 homo 36 97.21 156.01 97.21 156.01
E 64.71 93.68 130.76 37.4 10 15 0.40 homo 27 112.06 149.46 112.06 149.46
F 67.00 127.87 150.79 42.0 17 8 0.68 homo 33 129.79 171.79 129.79 171.79
G 65.39 147.52 128.96 118.2 20 5 0.80 hetero 28 69.86 188.06 69.86 188.06
H 62.46 120.13 105.26 55.7 21 4 0.84 hetero 22 77.41 133.11 77.41 133.11
I 60.45 140.44 109.86 96.4 20 5 0.80 homo 22 61.66 158.06 61.66 158.06
J 59.59 121.01 123.90 111.7 8 17 0.32 homo 40 68.05 179.75 68.05 179.75
K 62.94 137.37 119.48 87.6 21 4 0.84 homo 30 75.68 163.28 75.68 163.28
L 53.31 112.05 146.20 57.8 8 17 0.32 hetero 25 117.30 175.10 117.30 175.10
M 45.13 133.74 155.34 100.5 9 16 0.36 hetero 20 105.09 205.59 105.09 205.59
N 57.67 118.02 121.48 37.4 4 21 0.16 hetero 29 102.78 140.18 102.78 140.18

3.11 group_by and summarise (dplyr)

homo %>%
  summarise(min(age), mean(s.duration.ms))
min(age) mean(s.duration.ms)
19 61.22429
homo %>%
  group_by(orientation) %>% 
  summarise(my_mean = mean(s.duration.ms))
orientation my_mean
hetero 58.46571
homo 63.98286
homo %>%
  group_by(orientation) %>% 
  summarise(mean(s.duration.ms))
orientation mean(s.duration.ms)
hetero 58.46571
homo 63.98286
homo %>%
  group_by(orientation) %>% 
  summarise(mean_by_orientation = mean(s.duration.ms))
orientation mean_by_orientation
hetero 58.46571
homo 63.98286

4.1. tidyr package

df.short <- data.frame(
                   consonant = c("stops", "fricatives", "affricates", "nasals"),
                   initial = c(123, 87, 73, 7),
                   intervocal = c(57, 77, 82, 78),
                   final = c(30, 69, 12, 104))
df.short
consonant initial intervocal final
stops 123 57 30
fricatives 87 77 69
affricates 73 82 12
nasals 7 78 104
consonant position number
stops initial 123
fricatives initial 87
affricates initial 73
nasals initial 7
stops intervocal 57
fricatives intervocal 77
affricates intervocal 82
nasals intervocal 78
stops final 30
fricatives final 69
affricates final 12
nasals final 104

4.2 Short format → Long format: gather (tidyr)

df.short <- data.frame(
                   consonant = c("stops", "fricatives", "affricates", "nasals"),
                   initial = c(123, 87, 73, 7),
                   intervocal = c(57, 77, 82, 78),
                   final = c(30, 69, 12, 104))
df.short
consonant initial intervocal final
stops 123 57 30
fricatives 87 77 69
affricates 73 82 12
nasals 7 78 104
# Collapse the columns initial..final into key/value pairs:
# `position` holds the former column name, `number` holds its value.
df.long <- gather(df.short, key = position, value = number, initial:final)
df.long
consonant position number
stops initial 123
fricatives initial 87
affricates initial 73
nasals initial 7
stops intervocal 57
fricatives intervocal 77
affricates intervocal 82
nasals intervocal 78
stops final 30
fricatives final 69
affricates final 12
nasals final 104

4.3 Long format → Short format: spread (tidyr)

# Spread the key/value pairs back out: one column per value of `position`,
# filled with the matching `number`. The result overwrites df.short.
df.short <- spread(df.long, key = position, value = number)
df.short
consonant final initial intervocal
affricates 12 73 82
fricatives 69 87 77
nasals 104 7 78
stops 30 123 57

5.1 Anscombe’s quartet

The following data sets were presented in Anscombe, F. J. (1973), “Graphs in Statistical Analysis”:

quartet <- read.csv("https://goo.gl/KuuzYy")
quartet
x y dataset
10 8.04 1
8 6.95 1
13 7.58 1
9 8.81 1
11 8.33 1
14 9.96 1
6 7.24 1
4 4.26 1
12 10.84 1
7 4.82 1
5 5.68 1
10 9.14 2
8 8.14 2
13 8.74 2
9 8.77 2
11 9.26 2
14 8.10 2
6 6.13 2
4 3.10 2
12 9.13 2
7 7.26 2
5 4.74 2
10 7.46 3
8 6.77 3
13 12.74 3
9 7.11 3
11 7.81 3
14 8.84 3
6 6.08 3
4 5.39 3
12 8.15 3
7 6.42 3
5 5.73 3
8 6.58 4
8 5.76 4
8 7.71 4
8 8.84 4
8 8.47 4
8 7.04 4
8 5.25 4
19 12.50 4
8 5.56 4
8 7.91 4
8 6.89 4
quartet %>% 
  group_by(dataset) %>% 
  summarise(mean_X = mean(x),
            mean_Y = mean(y),
            sd_X = sd(x),
            sd_Y = sd(y),
            cor = cor(x, y),
            n_obs = n()) %>% 
  select(-dataset) %>% 
  round(., 2)
mean_X mean_Y sd_X sd_Y cor n_obs
9 7.5 3.32 2.03 0.82 11
9 7.5 3.32 2.03 0.82 11
9 7.5 3.32 2.03 0.82 11
9 7.5 3.32 2.03 0.82 11

5.2 Datasaurus

The following data sets were presented in Matejka and Fitzmaurice (2017), “Same Stats, Different Graphs”:

datasaurus <- read_tsv("https://goo.gl/gtaunr")
head(datasaurus)
dataset x y
dino 55.3846 97.1795
dino 51.5385 96.0256
dino 46.1538 94.4872
dino 42.8205 91.4103
dino 40.7692 88.3333
dino 38.7179 84.8718

datasaurus %>% 
  group_by(dataset) %>% 
  summarise(mean_X = mean(x),
            mean_Y = mean(y),
            sd_X = sd(x),
            sd_Y = sd(y),
            cor = cor(x, y),
            n_obs = n()) %>% 
  select(-dataset) %>% 
  round(., 1)
mean_X mean_Y sd_X sd_Y cor n_obs
54.3 47.8 16.8 26.9 -0.1 142
54.3 47.8 16.8 26.9 -0.1 142
54.3 47.8 16.8 26.9 -0.1 142
54.3 47.8 16.8 26.9 -0.1 142
54.3 47.8 16.8 26.9 -0.1 142
54.3 47.8 16.8 26.9 -0.1 142
54.3 47.8 16.8 26.9 -0.1 142
54.3 47.8 16.8 26.9 -0.1 142
54.3 47.8 16.8 26.9 -0.1 142
54.3 47.8 16.8 26.9 -0.1 142
54.3 47.8 16.8 26.9 -0.1 142
54.3 47.8 16.8 26.9 -0.1 142
54.3 47.8 16.8 26.9 -0.1 142

6.1 Scatterplot

plot(homo$s.duration.ms, homo$vowel.duration.ms)

ggplot(data = homo, aes(s.duration.ms, vowel.duration.ms)) +
  geom_point()

homo %>%
  ggplot(aes(average.f0.Hz, age))+
  geom_smooth(method = "lm")+
  geom_point(aes(color = orientation))

6.1.1 Scatterplot: color

plot(homo$s.duration.ms, homo$vowel.duration.ms,
     col = c("red", "blue")[homo$orientation])

homo %>%
  ggplot(aes(s.duration.ms, vowel.duration.ms,
             color = orientation)) +
  geom_point()

6.1.2 Scatterplot: shape

plot(homo$s.duration.ms, homo$vowel.duration.ms,
     pch = c(16, 17)[homo$orientation])

plot(1:25, pch = 1:25)

homo %>%
  ggplot(aes(s.duration.ms, vowel.duration.ms,
             shape = orientation)) +
  geom_point(color = "green")

6.1.3 Scatterplot: size

plot(homo$s.duration.ms, homo$vowel.duration.ms,
     cex = homo$age/20)

plot(homo$s.duration.ms, homo$vowel.duration.ms,
     cex = homo$age)

homo %>%
  ggplot(aes(s.duration.ms, vowel.duration.ms,
             size = age)) +
  geom_point()

6.1.4 Scatterplot: text

# Base graphics: draw every point as a text symbol. Indexing the symbol vector
# with the orientation factor picks the first symbol for the first level
# ("hetero") and the second symbol for the second level ("homo").
plot(homo$s.duration.ms, homo$vowel.duration.ms,
     pch = c("⚤", "⚣")[homo$orientation])

# ggplot2: relabel the factor levels with the same symbols and use them as
# labels. The replacement vector must follow the level order (hetero, homo),
# i.e. the same order as in the base plot above; the reversed order would
# attach each symbol to the wrong group.
# Note that this overwrites the level names of homo$orientation for all code
# that runs afterwards.
levels(homo$orientation) <- c("⚤", "⚣")
homo %>%
  ggplot(aes(s.duration.ms, vowel.duration.ms, label = orientation, fill = orientation)) +
  geom_label()

6.1.5 Scatterplot: title

plot(homo$s.duration.ms, homo$vowel.duration.ms,
     main = "length of [s] vs. length of vowels")

homo %>%
  ggplot(aes(s.duration.ms, vowel.duration.ms)) +
  geom_point()+
  labs(title = "length of [s] vs. length of vowels",
       subtitle = "based on 14 speakers of Cantonese",
       caption = "data from [Chi kuk 2007]")

6.1.6 Scatterplot: axis

plot(homo$s.duration.ms, homo$vowel.duration.ms,
     xlab = "duration of [s] in ms", ylab = "vowel duration in ms")

homo %>%
  ggplot(aes(s.duration.ms, vowel.duration.ms)) +
  geom_point()+
  xlab("duration of [s] in ms")+
  ylab("vowel duration in ms")

6.1.7 Log scales

Let's use the frequency dictionary for Russian:

freq <- read.csv("https://goo.gl/TlX7xW", sep = "\t")
freq %>%
  arrange(desc(Freq.ipm.)) %>% 
  slice(1:200) %>% 
  ggplot(aes(Rank, Freq.ipm.)) +
  geom_point() +
  xlab("") +
  ylab("ipm")

# Base graphics: frequency (ipm) against row position, both axes logarithmic.
# The x positions are derived from the data with seq_along() instead of a
# hard-coded row count, so the call keeps working if the file changes size.
plot(seq_along(freq$Freq.ipm.), freq$Freq.ipm.,
  xlab = NA, ylab = "ipm",
  las = 1,
  log = "yx")

# ggplot2: the same data with a logarithmic y axis.
# NOTE(review): the base plot above logs both axes (log = "yx") while this
# version logs only y; add scale_x_log10() if the two plots should match.
freq %>%
  ggplot(aes(seq_along(Freq.ipm.), Freq.ipm.))+
  geom_point()+
  xlab("")+
  ylab("ipm")+
  scale_y_log10()

6.1.8 Scatterplot: rug

plot(homo$s.duration.ms, homo$vowel.duration.ms)
rug(homo$s.duration.ms)
rug(homo$vowel.duration.ms, side = 2)

homo %>%
  ggplot(aes(s.duration.ms, vowel.duration.ms, color = orientation)) +
  geom_point() +
  geom_rug()

homo %>%
  ggplot(aes(s.duration.ms, vowel.duration.ms, color = orientation)) +
  geom_point() +
  geom_rug()

6.1.9 Scatterplot: lines

plot(homo$s.duration.ms, homo$vowel.duration.ms)
abline(h = 120, v = 60)

homo %>%
  ggplot(aes(s.duration.ms, vowel.duration.ms)) +
  geom_point() +
  geom_hline(yintercept = mean(homo$vowel.duration.ms))+
  geom_vline(xintercept = 60)

plot(homo$s.duration.ms, homo$vowel.duration.ms)
abline(h = 120, lty = 2)
abline(v = 60, lwd = 42)

homo %>%
  ggplot(aes(s.duration.ms, vowel.duration.ms)) +
  geom_point() +
  geom_hline(yintercept = 120, linetype = 2)+
  geom_vline(xintercept = 60, size = 5)

plot(homo$s.duration.ms, homo$vowel.duration.ms)
abline(h = 120, lty = 4)
abline(v = 60, col = "blue")

homo %>%
  ggplot(aes(s.duration.ms, vowel.duration.ms)) +
  geom_point() +
  geom_hline(yintercept = 120, linetype = 4)+
  geom_vline(xintercept = 60, color = "blue")

6.1.10 Scatterplot: annotate

The function annotate adds geoms to a plot.

homo %>%
  ggplot(aes(s.duration.ms, vowel.duration.ms)) +
  geom_point()+
  annotate(geom = "rect", xmin = 77, xmax = 79,
             ymin = 117, ymax = 122, fill = "red", alpha = 0.2) + 
  annotate(geom = "text", x = 78, y = 125,
             label = "Who is that?\n Outlier?")

6.2.1 Barplots: basics

There are two possible situations:

head(homo[, c(1, 9)])
speaker orientation
A hetero
B hetero
C homo
D homo
E homo
F homo
head(homo[, c(1, 10)])
speaker age
A 30
B 19
C 29
D 36
E 27
F 33

Non-aggregated data

barplot(table(homo$orientation))

homo %>%
  ggplot(aes(orientation)) +
  geom_bar()

Aggregated data

barplot(homo$age, names.arg = homo$speaker)

homo %>%
  ggplot(aes(speaker, age)) +
  geom_bar(stat = "identity")

6.2.2 Barplots: color

barplot(homo$age, names.arg = homo$speaker,
  col = homo$orientation)

* dplyr, ggplot2

homo %>%
  ggplot(aes(speaker, age, fill = orientation)) +
  geom_bar(stat = "identity")

6.3.1 Boxplots: basics

boxplot(homo$s.duration.ms~homo$orientation)

* dplyr, ggplot2

homo %>%
  ggplot(aes(orientation, s.duration.ms)) +
  geom_boxplot()

6.3.2 Boxplots: points

# Boxplot of [s] duration per orientation group, with the individual
# observations drawn on top of it (add = TRUE) as open circles (pch = 1).
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
boxplot(homo$s.duration.ms~homo$orientation)
stripchart(homo$s.duration.ms ~ homo$orientation,
  pch = 1, vertical = TRUE, add = TRUE)

* dplyr, ggplot2

homo %>%
  ggplot(aes(orientation, s.duration.ms)) +
  geom_boxplot()+
  geom_point()

6.3.3 Boxplots: jitter

# Boxplot of [s] duration per orientation group, with the individual
# observations drawn on top of it (add = TRUE); method = "jitter" spreads the
# points so that they do not overlap.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
boxplot(homo$s.duration.ms~homo$orientation)
stripchart(homo$s.duration.ms~homo$orientation,
  pch = 1, vertical = TRUE, add = TRUE, method = "jitter")

* dplyr, ggplot2

homo %>%
  ggplot(aes(orientation, s.duration.ms)) +
  geom_boxplot() +
  geom_jitter(width = 0.5)

6.3.4 Violin plots: jitter

homo %>%
  ggplot(aes(orientation, s.duration.ms)) +
  geom_violin() +
  geom_jitter()

6. Preliminary summary: two variables

mtcars %>% 
  mutate(newvar = mpg > 22,
         newvr = mpg < 17) %>% 
  ggplot(aes(newvr, newvar, color = newvar))+
  geom_jitter(width = 0.2)

# Count the cars in every combination of the two logical flags
# (mpg > 22, mpg < 17) and draw one point per combination, sized by the count
# and labelled with it. The size legend is switched off.
# FALSE is spelled out because F is an ordinary variable that can be reassigned.
mtcars %>% 
  mutate(newvar = mpg > 22,
         newvr = mpg < 17) %>% 
  group_by(newvar, newvr) %>% 
  summarise(number = n()) %>% 
  ggplot(aes(newvr, newvar, label = number))+
  geom_point(aes(size = number, color = newvar))+
  geom_text()+
  scale_size(range = c(10, 30))+
  guides(size = FALSE)

6.6.1 Histogram: basics

hist(homo$s.duration.ms)

* dplyr, ggplot2

homo %>%
  ggplot(aes(s.duration.ms)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

How many histogram bins do we need?

hist(homo$s.duration.ms,
     breaks = nclass.FD(homo$s.duration.ms))

* dplyr, ggplot2

homo %>%
  ggplot(aes(s.duration.ms)) +
  geom_histogram(bins = nclass.FD(homo$s.duration.ms))

6.6.2 Histogram: color

hist(homo$s.duration.ms, col = "lightblue")

* dplyr, ggplot2

homo %>%
  ggplot(aes(s.duration.ms)) +
  geom_histogram(fill = "lightblue")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

6.7 Facets

Faceting (facetization) is the most powerful ggplot2 tool: it allows you to split up your data by one or more variables and plot the subsets of data together.

facet_wrap()

homo %>% 
  ggplot(aes(speaker, s.duration.ms))+
  geom_point() + 
  facet_wrap(~orientation)

You can see that all speakers appear on both graphs, but only some of them have a dot in each. This is because, by default, the scales are the same across facets.

homo %>% 
  ggplot(aes(speaker, s.duration.ms))+
  geom_point() + 
  facet_wrap(~orientation, scales = "free")

It is possible to make only one scale “free”:

homo %>% 
  ggplot(aes(speaker, s.duration.ms))+
  geom_point() + 
  facet_wrap(~orientation, scales = "free_x")

facet_grid()

It is possible to facet using more than one variable.

homo %>% 
  mutate(older_then_28 = ifelse(age > 28, "older", "younger")) %>% 
  ggplot(aes(speaker, s.duration.ms))+
  geom_point() + 
  facet_wrap(older_then_28~orientation, scales = "free_x")

homo %>% 
  mutate(older_then_28 = ifelse(age > 28, "older", "younger")) %>% 
  ggplot(aes(speaker, s.duration.ms))+
  geom_point() + 
  facet_grid(older_then_28~orientation, scales = "free_x")

There is a nice argument margins.

homo %>% 
  mutate(older_then_28 = ifelse(age > 28, "older", "younger")) %>% 
  ggplot(aes(speaker, s.duration.ms))+
  geom_point() + 
  facet_grid(older_then_28~orientation, scales = "free_x", margins = TRUE)

Sometimes it is nice to put your data in all facets:

# Show every speaker in grey in each facet, with the speakers that belong to
# the facet drawn on top in the default color.
homo %>% 
  ggplot(aes(speaker, s.duration.ms))+
  # Add an additional geom without the facetization variable!
  # The orientation column is dropped by name rather than by position, so the
  # layer keeps working if the columns of homo are reordered.
  geom_point(data = select(homo, -orientation), aes(speaker, s.duration.ms), color = "grey") + 
  geom_point() + 
  facet_wrap(~orientation)