Data manipulation and visualization

tidyverse: dplyr, tidyr, ggplot2

02.02.2017

1.1 Data Types

Vector

c(42, 99, 43)

## [1] 42 99 43

Matrix, Array…

matrix(1:12, nrow=3,ncol=4)

##      [,1] [,2] [,3] [,4]
## [1,]    1    4    7   10
## [2,]    2    5    8   11
## [3,]    3    6    9   12

List

list(
  n_lectures = 12,
  l_topics = c("begining", "data manipulation", "descriptive stats"),
  h_topics = c("data manipulations", "descriptive stats")
  )

## $n_lectures
## [1] 12
## 
## $l_topics
## [1] "begining"          "data manipulation" "descriptive stats"
## 
## $h_topics
## [1] "data manipulations" "descriptive stats"

Data Frame

data.frame(
  names = c("Olya", "Ilya", "Sasha", "George"),
  lecturer = c(TRUE, TRUE, FALSE, TRUE),
  lecturer_experience = c(19, 6, 0, 3)
  )

names	lecturer	lecturer_experience
Olya	TRUE	19
Ilya	TRUE	6
Sasha	FALSE	0
George	TRUE	3

See Data Type Conversion page

1.2 Data Frame exploration

R ships with several built-in data frames (e.g. mtcars, cars, iris). How many rows and columns do they have?

nrow(iris) # returns the number of rows

## [1] 150

ncol(mtcars) #  returns the number of columns

## [1] 11

head(cars) # returns the first 6 rows

speed	dist
4	2
4	10
7	4
7	22
8	16
9	10

head(cars, 4) # returns the first 4 rows

speed	dist
4	2
4	10
7	4
7	22

tail(cars) # returns the last 6 rows

	speed	dist
45	23	54
46	24	70
47	24	92
48	24	93
49	24	120
50	25	85

summary(cars)  # produce some stats

##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

str(cars)  # shows the structure: variables, their type

## 'data.frame':    50 obs. of  2 variables:
##  $ speed: num  4 4 7 7 8 9 10 10 10 11 ...
##  $ dist : num  2 10 4 22 16 10 18 26 34 17 ...

#View(cars)

1.3 Data Frame Indexing

mtcars$mpg # shows the mpg vector

##  [1] 21.0 21.0 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 17.8 16.4 17.3 15.2
## [15] 10.4 10.4 14.7 32.4 30.4 33.9 21.5 15.5 15.2 13.3 19.2 27.3 26.0 30.4
## [29] 15.8 19.7 15.0 21.4

mtcars[3,7] # shows the 3. row, 7. column

## [1] 18.61

mtcars[3,] # shows the 3. row

	mpg	cyl	disp	hp	drat	wt	qsec	vs	am	gear	carb
Datsun 710	22.8	4	108	93	3.85	2.32	18.61	1	1	4	1

mtcars[,7] # shows the 7. column

##  [1] 16.46 17.02 18.61 19.44 17.02 20.22 15.84 20.00 22.90 18.30 18.90
## [12] 17.40 17.60 18.00 17.98 17.82 17.42 19.47 18.52 19.90 20.01 16.87
## [23] 17.30 15.41 17.05 18.90 16.70 16.90 14.50 15.50 14.60 18.60

mtcars[mtcars$mpg < 20, ] # show all rows with the mpg value lower than 20

	mpg	cyl	disp	hp	drat	wt	qsec	vs	am	gear	carb
Hornet Sportabout	18.7	8	360.0	175	3.15	3.440	17.02	0	0	3	2
Valiant	18.1	6	225.0	105	2.76	3.460	20.22	1	0	3	1
Duster 360	14.3	8	360.0	245	3.21	3.570	15.84	0	0	3	4
Merc 280	19.2	6	167.6	123	3.92	3.440	18.30	1	0	4	4
Merc 280C	17.8	6	167.6	123	3.92	3.440	18.90	1	0	4	4
Merc 450SE	16.4	8	275.8	180	3.07	4.070	17.40	0	0	3	3
Merc 450SL	17.3	8	275.8	180	3.07	3.730	17.60	0	0	3	3
Merc 450SLC	15.2	8	275.8	180	3.07	3.780	18.00	0	0	3	3
Cadillac Fleetwood	10.4	8	472.0	205	2.93	5.250	17.98	0	0	3	4
Lincoln Continental	10.4	8	460.0	215	3.00	5.424	17.82	0	0	3	4
Chrysler Imperial	14.7	8	440.0	230	3.23	5.345	17.42	0	0	3	4
Dodge Challenger	15.5	8	318.0	150	2.76	3.520	16.87	0	0	3	2
AMC Javelin	15.2	8	304.0	150	3.15	3.435	17.30	0	0	3	2
Camaro Z28	13.3	8	350.0	245	3.73	3.840	15.41	0	0	3	4
Pontiac Firebird	19.2	8	400.0	175	3.08	3.845	17.05	0	0	3	2
Ford Pantera L	15.8	8	351.0	264	4.22	3.170	14.50	0	1	5	4
Ferrari Dino	19.7	6	145.0	175	3.62	2.770	15.50	0	1	5	6
Maserati Bora	15.0	8	301.0	335	3.54	3.570	14.60	0	1	5	8

# ?mtcars

2. Tidyverse

The tidyverse is a set of packages:

dplyr, for data manipulation
ggplot2, for data visualisation
tidyr, for data tidying
readr, for data import
purrr, for functional programming
tibble, for tibbles, a modern re-imagining of data frames

Install the tidyverse package using install.packages("tidyverse")

Load tidyverse package

library(tidyverse)

In this presentation the following versions of tidyverse, dplyr and ggplot2 are used:

packageVersion("tidyverse")

## [1] '1.1.1'

packageVersion("dplyr")

## [1] '0.7.2'

packageVersion("ggplot2")

## [1] '2.2.1'

3.1 Join dataframe by row or column

# Two overlapping row slices of mtcars, stacked on top of each other.
my_data_1 <- mtcars[5:20, ]
my_data_2 <- mtcars[17:29, ]
combine_rows <- rbind(my_data_1, my_data_2)  # dispatches to the data.frame method
# Row counts of both inputs and of the stacked result.
nrow(my_data_1)
nrow(my_data_2)
nrow(combine_rows)

## [1] 16

## [1] 13

## [1] 29

# Two overlapping column slices of mtcars, glued side by side.
my_data_3 <- mtcars[, 3:7]
my_data_4 <- mtcars[, 6:11]
combine_cols <- cbind(my_data_3, my_data_4)  # dispatches to the data.frame method
# Column counts of both inputs and of the combined result.
ncol(my_data_3)
ncol(my_data_4)
ncol(combine_cols)

## [1] 5

## [1] 6

## [1] 11

3.2 Joins (dplyr)

languages <- data.frame(
  languages = c("Selkup", "French", "Chukchi", "Kashubian"),
  countries = c("Russia", "France", "Russia", "Poland"),
  iso = c("sel", "fra", "ckt", "pol")
  )
languages

languages	countries	iso
Selkup	Russia	sel
French	France	fra
Chukchi	Russia	ckt
Kashubian	Poland	pol

country_population <- data.frame(
  countries = c("Russia", "Poland", "Finland"),
  population_mln = c(143, 38, 5))
country_population

countries	population_mln
Russia	143
Poland	38
Finland	5

inner_join(languages, country_population)

## Joining, by = "countries"

## Warning: Column `countries` joining factors with different levels, coercing
## to character vector

languages	countries	iso	population_mln
Selkup	Russia	sel	143
Chukchi	Russia	ckt	143
Kashubian	Poland	pol	38

left_join(languages, country_population)

## Joining, by = "countries"

## Warning: Column `countries` joining factors with different levels, coercing
## to character vector

languages	countries	iso	population_mln
Selkup	Russia	sel	143
French	France	fra	NA
Chukchi	Russia	ckt	143
Kashubian	Poland	pol	38

right_join(languages, country_population)

## Joining, by = "countries"

## Warning: Column `countries` joining factors with different levels, coercing
## to character vector

languages	countries	iso	population_mln
Selkup	Russia	sel	143
Chukchi	Russia	ckt	143
Kashubian	Poland	pol	38
NA	Finland	NA	5

anti_join(languages, country_population)

## Joining, by = "countries"

## Warning: Column `countries` joining factors with different levels, coercing
## to character vector

languages	countries	iso
French	France	fra

anti_join(country_population, languages)

## Joining, by = "countries"

## Warning: Column `countries` joining factors with different levels, coercing
## to character vector

countries	population_mln
Finland	5

full_join(country_population, languages)

## Joining, by = "countries"

## Warning: Column `countries` joining factors with different levels, coercing
## to character vector

countries	population_mln	languages	iso
Russia	143	Selkup	sel
Russia	143	Chukchi	ckt
Poland	38	Kashubian	pol
Finland	5	NA	NA
France	NA	French	fra

3.3 Data

The majority of examples in this presentation are based on Chi-kuk 2007. The experiment consisted of a perception and judgment test aimed at measuring the correlation between acoustic cues and perceived sexual orientation. Naïve Cantonese speakers were asked to listen to Cantonese speech samples collected in the experiment and judge whether the speakers were gay or heterosexual. There are 14 speakers and the following parameters:

[s] duration (s.duration.ms)
vowel duration (vowel.duration.ms)
fundamental frequencies mean (F0) (average.f0.Hz)
fundamental frequencies range (f0.range.Hz)
percentage of homosexual impression (perceived.as.homo)
percentage of heterosexual impression (perceived.as.hetero)
speakers orientation (orientation)
speakers age (age)

Download data

# Download the speaker data. Since R 4.0.0 read.csv() no longer converts
# strings to factors by default, but later examples index vectors by the
# factor codes of `orientation` (e.g. c("red", "blue")[homo$orientation]),
# so the conversion is requested explicitly.
homo <- read.csv("http://goo.gl/Zjr9aF", stringsAsFactors = TRUE)
homo

speaker	s.duration.ms	vowel.duration.ms	average.f0.Hz	f0.range.Hz	perceived.as.homo	perceived.as.hetero	perceived.as.homo.percent	orientation	age
A	61.40	112.60	119.51	52.5	7	18	0.28	hetero	30
B	63.90	126.49	100.29	114.0	20	5	0.80	hetero	19
C	55.08	126.81	114.90	103.2	9	16	0.36	homo	29
D	78.11	119.17	126.61	58.8	15	10	0.60	homo	36
E	64.71	93.68	130.76	37.4	10	15	0.40	homo	27
F	67.00	127.87	150.79	42.0	17	8	0.68	homo	33
G	65.39	147.52	128.96	118.2	20	5	0.80	hetero	28
H	62.46	120.13	105.26	55.7	21	4	0.84	hetero	22
I	60.45	140.44	109.86	96.4	20	5	0.80	homo	22
J	59.59	121.01	123.90	111.7	8	17	0.32	homo	40
K	62.94	137.37	119.48	87.6	21	4	0.84	homo	30
L	53.31	112.05	146.20	57.8	8	17	0.32	hetero	25
M	45.13	133.74	155.34	100.5	9	16	0.36	hetero	20
N	57.67	118.02	121.48	37.4	4	21	0.16	hetero	29

3.4 Data Frame → Tibble (dplyr)

Tibble is a useful modification of Data Frame.

library(tidyverse)
# Convert the data frame to a tibble. as_tibble() is the supported
# replacement for the deprecated tbl_df().
homo <- as_tibble(homo)
homo

speaker	s.duration.ms	vowel.duration.ms	average.f0.Hz	f0.range.Hz	perceived.as.homo	perceived.as.hetero	perceived.as.homo.percent	orientation	age
A	61.40	112.60	119.51	52.5	7	18	0.28	hetero	30
B	63.90	126.49	100.29	114.0	20	5	0.80	hetero	19
C	55.08	126.81	114.90	103.2	9	16	0.36	homo	29
D	78.11	119.17	126.61	58.8	15	10	0.60	homo	36
E	64.71	93.68	130.76	37.4	10	15	0.40	homo	27
F	67.00	127.87	150.79	42.0	17	8	0.68	homo	33
G	65.39	147.52	128.96	118.2	20	5	0.80	hetero	28
H	62.46	120.13	105.26	55.7	21	4	0.84	hetero	22
I	60.45	140.44	109.86	96.4	20	5	0.80	homo	22
J	59.59	121.01	123.90	111.7	8	17	0.32	homo	40
K	62.94	137.37	119.48	87.6	21	4	0.84	homo	30
L	53.31	112.05	146.20	57.8	8	17	0.32	hetero	25
M	45.13	133.74	155.34	100.5	9	16	0.36	hetero	20
N	57.67	118.02	121.48	37.4	4	21	0.16	hetero	29

3.5 Filter (dplyr)

How many speakers are older than 28?

base R

homo[homo$age > 28,]

speaker	s.duration.ms	vowel.duration.ms	average.f0.Hz	f0.range.Hz	perceived.as.homo	perceived.as.hetero	perceived.as.homo.percent	orientation	age
A	61.40	112.60	119.51	52.5	7	18	0.28	hetero	30
C	55.08	126.81	114.90	103.2	9	16	0.36	homo	29
D	78.11	119.17	126.61	58.8	15	10	0.60	homo	36
F	67.00	127.87	150.79	42.0	17	8	0.68	homo	33
J	59.59	121.01	123.90	111.7	8	17	0.32	homo	40
K	62.94	137.37	119.48	87.6	21	4	0.84	homo	30
N	57.67	118.02	121.48	37.4	4	21	0.16	hetero	29

dplyr

homo %>%
  filter(age > 28, s.duration.ms < 60)

speaker	s.duration.ms	vowel.duration.ms	average.f0.Hz	f0.range.Hz	perceived.as.homo	perceived.as.hetero	perceived.as.homo.percent	orientation	age
C	55.08	126.81	114.90	103.2	9	16	0.36	homo	29
J	59.59	121.01	123.90	111.7	8	17	0.32	homo	40
N	57.67	118.02	121.48	37.4	4	21	0.16	hetero	29

%>% is called the pipe. The pipe is a technique for passing the result of one function on to the next function.

sort(sqrt(abs(sin(1:22))), decreasing = TRUE)

##  [1] 0.9999951 0.9952926 0.9946649 0.9805088 0.9792468 0.9554817 0.9535709
##  [8] 0.9173173 0.9146888 0.8699440 0.8665952 0.8105471 0.8064043 0.7375779
## [15] 0.7325114 0.6482029 0.6419646 0.5365662 0.5285977 0.3871398 0.3756594
## [22] 0.0940814

1:22 %>% 
  sin() %>% 
  abs() %>% 
  sqrt() %>% 
  sort(., decreasing = TRUE) # dot here shows where should argument be

##  [1] 0.9999951 0.9952926 0.9946649 0.9805088 0.9792468 0.9554817 0.9535709
##  [8] 0.9173173 0.9146888 0.8699440 0.8665952 0.8105471 0.8064043 0.7375779
## [15] 0.7325114 0.6482029 0.6419646 0.5365662 0.5285977 0.3871398 0.3756594
## [22] 0.0940814

Pipes in the tidyverse come from the magrittr package. Sometimes the pipe works incorrectly with non-tidyverse functions.

3.6 Slice (dplyr)

base R

homo[3:7, ]

speaker	s.duration.ms	vowel.duration.ms	average.f0.Hz	f0.range.Hz	perceived.as.homo	perceived.as.hetero	perceived.as.homo.percent	orientation	age
C	55.08	126.81	114.90	103.2	9	16	0.36	homo	29
D	78.11	119.17	126.61	58.8	15	10	0.60	homo	36
E	64.71	93.68	130.76	37.4	10	15	0.40	homo	27
F	67.00	127.87	150.79	42.0	17	8	0.68	homo	33
G	65.39	147.52	128.96	118.2	20	5	0.80	hetero	28

dplyr

homo %>%
  slice(3:7)

speaker	s.duration.ms	vowel.duration.ms	average.f0.Hz	f0.range.Hz	perceived.as.homo	perceived.as.hetero	perceived.as.homo.percent	orientation	age
C	55.08	126.81	114.90	103.2	9	16	0.36	homo	29
D	78.11	119.17	126.61	58.8	15	10	0.60	homo	36
E	64.71	93.68	130.76	37.4	10	15	0.40	homo	27
F	67.00	127.87	150.79	42.0	17	8	0.68	homo	33
G	65.39	147.52	128.96	118.2	20	5	0.80	hetero	28

3.7 Select (dplyr)

base R

homo[, 8:10]

perceived.as.homo.percent	orientation	age
0.28	hetero	30
0.80	hetero	19
0.36	homo	29
0.60	homo	36
0.40	homo	27
0.68	homo	33
0.80	hetero	28
0.84	hetero	22
0.80	homo	22
0.32	homo	40
0.84	homo	30
0.32	hetero	25
0.36	hetero	20
0.16	hetero	29

dplyr

homo %>%
  select(8:10)

perceived.as.homo.percent	orientation	age
0.28	hetero	30
0.80	hetero	19
0.36	homo	29
0.60	homo	36
0.40	homo	27
0.68	homo	33
0.80	hetero	28
0.84	hetero	22
0.80	homo	22
0.32	homo	40
0.84	homo	30
0.32	hetero	25
0.36	hetero	20
0.16	hetero	29

dplyr only

homo %>%
  select(speaker:average.f0.Hz)

speaker	s.duration.ms	vowel.duration.ms	average.f0.Hz
A	61.40	112.60	119.51
B	63.90	126.49	100.29
C	55.08	126.81	114.90
D	78.11	119.17	126.61
E	64.71	93.68	130.76
F	67.00	127.87	150.79
G	65.39	147.52	128.96
H	62.46	120.13	105.26
I	60.45	140.44	109.86
J	59.59	121.01	123.90
K	62.94	137.37	119.48
L	53.31	112.05	146.20
M	45.13	133.74	155.34
N	57.67	118.02	121.48

It is possible to use select() function to remove columns:

homo %>%
  select(-c(speaker, perceived.as.hetero, perceived.as.homo, perceived.as.homo.percent))

s.duration.ms	vowel.duration.ms	average.f0.Hz	f0.range.Hz	orientation	age
61.40	112.60	119.51	52.5	hetero	30
63.90	126.49	100.29	114.0	hetero	19
55.08	126.81	114.90	103.2	homo	29
78.11	119.17	126.61	58.8	homo	36
64.71	93.68	130.76	37.4	homo	27
67.00	127.87	150.79	42.0	homo	33
65.39	147.52	128.96	118.2	hetero	28
62.46	120.13	105.26	55.7	hetero	22
60.45	140.44	109.86	96.4	homo	22
59.59	121.01	123.90	111.7	homo	40
62.94	137.37	119.48	87.6	homo	30
53.31	112.05	146.20	57.8	hetero	25
45.13	133.74	155.34	100.5	hetero	20
57.67	118.02	121.48	37.4	hetero	29

# When you want to remove one column you can write it without 
# c() function, e. g. -speaker

It is possible to reorder columns using select() function:

homo %>%
  select(speaker, age, s.duration.ms)

speaker	age	s.duration.ms
A	30	61.40
B	19	63.90
C	29	55.08
D	36	78.11
E	27	64.71
F	33	67.00
G	28	65.39
H	22	62.46
I	22	60.45
J	40	59.59
K	30	62.94
L	25	53.31
M	20	45.13
N	29	57.67

3.8 arrange (dplyr)

base R

homo[order(homo$orientation, homo$age), ]

speaker	s.duration.ms	vowel.duration.ms	average.f0.Hz	f0.range.Hz	perceived.as.homo	perceived.as.hetero	perceived.as.homo.percent	orientation	age
B	63.90	126.49	100.29	114.0	20	5	0.80	hetero	19
M	45.13	133.74	155.34	100.5	9	16	0.36	hetero	20
H	62.46	120.13	105.26	55.7	21	4	0.84	hetero	22
L	53.31	112.05	146.20	57.8	8	17	0.32	hetero	25
G	65.39	147.52	128.96	118.2	20	5	0.80	hetero	28
N	57.67	118.02	121.48	37.4	4	21	0.16	hetero	29
A	61.40	112.60	119.51	52.5	7	18	0.28	hetero	30
I	60.45	140.44	109.86	96.4	20	5	0.80	homo	22
E	64.71	93.68	130.76	37.4	10	15	0.40	homo	27
C	55.08	126.81	114.90	103.2	9	16	0.36	homo	29
K	62.94	137.37	119.48	87.6	21	4	0.84	homo	30
F	67.00	127.87	150.79	42.0	17	8	0.68	homo	33
D	78.11	119.17	126.61	58.8	15	10	0.60	homo	36
J	59.59	121.01	123.90	111.7	8	17	0.32	homo	40

dplyr

homo %>%
  arrange(orientation, desc(age))

speaker	s.duration.ms	vowel.duration.ms	average.f0.Hz	f0.range.Hz	perceived.as.homo	perceived.as.hetero	perceived.as.homo.percent	orientation	age
A	61.40	112.60	119.51	52.5	7	18	0.28	hetero	30
N	57.67	118.02	121.48	37.4	4	21	0.16	hetero	29
G	65.39	147.52	128.96	118.2	20	5	0.80	hetero	28
L	53.31	112.05	146.20	57.8	8	17	0.32	hetero	25
H	62.46	120.13	105.26	55.7	21	4	0.84	hetero	22
M	45.13	133.74	155.34	100.5	9	16	0.36	hetero	20
B	63.90	126.49	100.29	114.0	20	5	0.80	hetero	19
J	59.59	121.01	123.90	111.7	8	17	0.32	homo	40
D	78.11	119.17	126.61	58.8	15	10	0.60	homo	36
F	67.00	127.87	150.79	42.0	17	8	0.68	homo	33
K	62.94	137.37	119.48	87.6	21	4	0.84	homo	30
C	55.08	126.81	114.90	103.2	9	16	0.36	homo	29
E	64.71	93.68	130.76	37.4	10	15	0.40	homo	27
I	60.45	140.44	109.86	96.4	20	5	0.80	homo	22

3.9 distinct

base R

unique(homo$orientation)

## [1] hetero homo  
## Levels: hetero homo

dplyr

homo %>%
  distinct(orientation, age > 20)

orientation	age > 20
hetero	TRUE
hetero	FALSE
homo	TRUE

base R

unique(homo[c("orientation", "perceived.as.homo")])

orientation	perceived.as.homo
hetero	7
hetero	20
homo	9
homo	15
homo	10
homo	17
hetero	21
homo	20
homo	8
homo	21
hetero	8
hetero	9
hetero	4

dplyr

homo %>%
  distinct(orientation, perceived.as.homo)

perceived.as.homo	orientation
7	hetero
20	hetero
9	homo
15	homo
10	homo
17	homo
21	hetero
20	homo
8	homo
21	homo
8	hetero
9	hetero
4	hetero

3.10 mutate (dplyr)

base R

homo$f0.min <- homo$average.f0.Hz - homo$f0.range.Hz/2
homo$f0.min

##  [1]  93.26  43.29  63.30  97.21 112.06 129.79  69.86  77.41  61.66  68.05
## [11]  75.68 117.30 105.09 102.78

homo$f0.max <- homo$average.f0.Hz + homo$f0.range.Hz/2
homo$f0.max

##  [1] 145.76 157.29 166.50 156.01 149.46 171.79 188.06 133.11 158.06 179.75
## [11] 163.28 175.10 205.59 140.18

dplyr

# dplyr version of the f0 minimum/maximum computation. The new columns are
# named f0.mn / f0.mx so they can be compared with the f0.min / f0.max
# columns added by the base R code.
homo <- homo %>%
  mutate(
    f0.mn = average.f0.Hz - f0.range.Hz / 2,
    f0.mx = average.f0.Hz + f0.range.Hz / 2
  )
homo

speaker	s.duration.ms	vowel.duration.ms	average.f0.Hz	f0.range.Hz	perceived.as.homo	perceived.as.hetero	perceived.as.homo.percent	orientation	age	f0.min	f0.max	f0.mn	f0.mx
A	61.40	112.60	119.51	52.5	7	18	0.28	hetero	30	93.26	145.76	93.26	145.76
B	63.90	126.49	100.29	114.0	20	5	0.80	hetero	19	43.29	157.29	43.29	157.29
C	55.08	126.81	114.90	103.2	9	16	0.36	homo	29	63.30	166.50	63.30	166.50
D	78.11	119.17	126.61	58.8	15	10	0.60	homo	36	97.21	156.01	97.21	156.01
E	64.71	93.68	130.76	37.4	10	15	0.40	homo	27	112.06	149.46	112.06	149.46
F	67.00	127.87	150.79	42.0	17	8	0.68	homo	33	129.79	171.79	129.79	171.79
G	65.39	147.52	128.96	118.2	20	5	0.80	hetero	28	69.86	188.06	69.86	188.06
H	62.46	120.13	105.26	55.7	21	4	0.84	hetero	22	77.41	133.11	77.41	133.11
I	60.45	140.44	109.86	96.4	20	5	0.80	homo	22	61.66	158.06	61.66	158.06
J	59.59	121.01	123.90	111.7	8	17	0.32	homo	40	68.05	179.75	68.05	179.75
K	62.94	137.37	119.48	87.6	21	4	0.84	homo	30	75.68	163.28	75.68	163.28
L	53.31	112.05	146.20	57.8	8	17	0.32	hetero	25	117.30	175.10	117.30	175.10
M	45.13	133.74	155.34	100.5	9	16	0.36	hetero	20	105.09	205.59	105.09	205.59
N	57.67	118.02	121.48	37.4	4	21	0.16	hetero	29	102.78	140.18	102.78	140.18

3.11 group_by and summarise (dplyr)

homo %>%
  summarise(min(age), mean(s.duration.ms))

min(age)	mean(s.duration.ms)
19	61.22429

homo %>%
  group_by(orientation) %>% 
  summarise(my_mean = mean(s.duration.ms))

orientation	my_mean
hetero	58.46571
homo	63.98286

homo %>%
  group_by(orientation) %>% 
  summarise(mean(s.duration.ms))

orientation	mean(s.duration.ms)
hetero	58.46571
homo	63.98286

homo %>%
  group_by(orientation) %>% 
  summarise(mean_by_orientation = mean(s.duration.ms))

orientation	mean_by_orientation
hetero	58.46571
homo	63.98286

4.1. tidyr package

Short format

df.short <- data.frame(
                   consonant = c("stops", "fricatives", "affricates", "nasals"),
                   initial = c(123, 87, 73, 7),
                   intervocal = c(57, 77, 82, 78),
                   final = c(30, 69, 12, 104))
df.short

consonant	initial	intervocal	final
stops	123	57	30
fricatives	87	77	69
affricates	73	82	12
nasals	7	78	104

Long format

consonant	position	number
stops	initial	123
fricatives	initial	87
affricates	initial	73
nasals	initial	7
stops	intervocal	57
fricatives	intervocal	77
affricates	intervocal	82
nasals	intervocal	78
stops	final	30
fricatives	final	69
affricates	final	12
nasals	final	104

4.2 Short format → Long format: gather (tidyr)

df.short <- data.frame(
                   consonant = c("stops", "fricatives", "affricates", "nasals"),
                   initial = c(123, 87, 73, 7),
                   intervocal = c(57, 77, 82, 78),
                   final = c(30, 69, 12, 104))
df.short

consonant	initial	intervocal	final
stops	123	57	30
fricatives	87	77	69
affricates	73	82	12
nasals	7	78	104

# Wide -> long: the column names initial..final become values of `position`,
# and the cell values go into `number`.
df.long <- df.short %>%
  gather(key = position, value = number, initial:final)
df.long

consonant	position	number
stops	initial	123
fricatives	initial	87
affricates	initial	73
nasals	initial	7
stops	intervocal	57
fricatives	intervocal	77
affricates	intervocal	82
nasals	intervocal	78
stops	final	30
fricatives	final	69
affricates	final	12
nasals	final	104

4.3 Long format → Short format: spread (tidyr)

# Long -> wide: every distinct value of `position` becomes a column that is
# filled with the matching `number`.
df.short <- df.long %>%
  spread(key = position, value = number)
df.short

consonant	final	initial	intervocal
affricates	12	73	82
fricatives	69	87	77
nasals	104	7	78
stops	30	123	57

5.1 Anscombe’s quartet

The following data sets were presented in Anscombe, F. J. (1973), “Graphs in Statistical Analysis”:

quartet <- read.csv("https://goo.gl/KuuzYy")
quartet

x	y	dataset
10	8.04	1
8	6.95	1
13	7.58	1
9	8.81	1
11	8.33	1
14	9.96	1
6	7.24	1
4	4.26	1
12	10.84	1
7	4.82	1
5	5.68	1
10	9.14	2
8	8.14	2
13	8.74	2
9	8.77	2
11	9.26	2
14	8.10	2
6	6.13	2
4	3.10	2
12	9.13	2
7	7.26	2
5	4.74	2
10	7.46	3
8	6.77	3
13	12.74	3
9	7.11	3
11	7.81	3
14	8.84	3
6	6.08	3
4	5.39	3
12	8.15	3
7	6.42	3
5	5.73	3
8	6.58	4
8	5.76	4
8	7.71	4
8	8.84	4
8	8.47	4
8	7.04	4
8	5.25	4
19	12.50	4
8	5.56	4
8	7.91	4
8	6.89	4

quartet %>% 
  group_by(dataset) %>% 
  summarise(mean_X = mean(x),
            mean_Y = mean(y),
            sd_X = sd(x),
            sd_Y = sd(y),
            cor = cor(x, y),
            n_obs = n()) %>% 
  select(-dataset) %>% 
  round(., 2)

mean_X	mean_Y	sd_X	sd_Y	cor	n_obs
9	7.5	3.32	2.03	0.82	11
9	7.5	3.32	2.03	0.82	11
9	7.5	3.32	2.03	0.82	11
9	7.5	3.32	2.03	0.82	11

5.2 Datasaurus

The following data sets were presented in Matejka and Fitzmaurice (2017), “Same Stats, Different Graphs”:

datasaurus <- read_tsv("https://goo.gl/gtaunr")
head(datasaurus)

dataset	x	y
dino	55.3846	97.1795
dino	51.5385	96.0256
dino	46.1538	94.4872
dino	42.8205	91.4103
dino	40.7692	88.3333
dino	38.7179	84.8718

datasaurus %>% 
  group_by(dataset) %>% 
  summarise(mean_X = mean(x),
            mean_Y = mean(y),
            sd_X = sd(x),
            sd_Y = sd(y),
            cor = cor(x, y),
            n_obs = n()) %>% 
  select(-dataset) %>% 
  round(., 1)

mean_X	mean_Y	sd_X	sd_Y	cor	n_obs
54.3	47.8	16.8	26.9	-0.1	142
54.3	47.8	16.8	26.9	-0.1	142
54.3	47.8	16.8	26.9	-0.1	142
54.3	47.8	16.8	26.9	-0.1	142
54.3	47.8	16.8	26.9	-0.1	142
54.3	47.8	16.8	26.9	-0.1	142
54.3	47.8	16.8	26.9	-0.1	142
54.3	47.8	16.8	26.9	-0.1	142
54.3	47.8	16.8	26.9	-0.1	142
54.3	47.8	16.8	26.9	-0.1	142
54.3	47.8	16.8	26.9	-0.1	142
54.3	47.8	16.8	26.9	-0.1	142
54.3	47.8	16.8	26.9	-0.1	142

6.1 Scatterplot

base R

plot(homo$s.duration.ms, homo$vowel.duration.ms)

ggplot2

ggplot(data = homo, aes(s.duration.ms, vowel.duration.ms)) +
  geom_point()

dplyr, ggplot2

homo %>%
  ggplot(aes(average.f0.Hz, age))+
  geom_smooth(method = "lm")+
  geom_point(aes(color = orientation))

6.1.1 Scatterplot: color

base R

plot(homo$s.duration.ms, homo$vowel.duration.ms,
     col = c("red", "blue")[homo$orientation])

dplyr, ggplot2

homo %>%
  ggplot(aes(s.duration.ms, vowel.duration.ms,
             color = orientation)) +
  geom_point()

6.1.2 Scatterplot: shape

base R

plot(homo$s.duration.ms, homo$vowel.duration.ms,
     pch = c(16, 17)[homo$orientation])

plot(1:25, pch = 1:25)

dplyr, ggplot2

homo %>%
  ggplot(aes(s.duration.ms, vowel.duration.ms,
             shape = orientation)) +
  geom_point(color = "green")

6.1.3 Scatterplot: size

base R

plot(homo$s.duration.ms, homo$vowel.duration.ms,
     cex = homo$age/20)

:(

plot(homo$s.duration.ms, homo$vowel.duration.ms,
     cex = homo$age)

dplyr, ggplot2

homo %>%
  ggplot(aes(s.duration.ms, vowel.duration.ms,
             size = age)) +
  geom_point()

6.1.4 Scatterplot: text

base R

plot(homo$s.duration.ms, homo$vowel.duration.ms,
     pch = c("⚤", "⚣")[homo$orientation])

dplyr, ggplot2

# Draw each speaker as a labelled symbol. The symbol is built in a temporary
# column so the shared `homo` data keeps its original orientation labels, and
# the mapping (hetero -> "⚤", homo -> "⚣") matches the base R example.
homo %>%
  mutate(symbol = ifelse(orientation == "hetero", "⚤", "⚣")) %>%
  ggplot(aes(s.duration.ms, vowel.duration.ms,
             label = symbol, fill = orientation)) +
  geom_label()

6.1.5 Scatterplot: title

base R

plot(homo$s.duration.ms, homo$vowel.duration.ms,
     main = "length of [s] vs. length of vowels")

dplyr, ggplot2

homo %>%
  ggplot(aes(s.duration.ms, vowel.duration.ms)) +
  geom_point()+
  labs(title = "length of [s] vs. length of vowels",
       subtitle = "based on 14 speakers of Cantonese",
       caption = "data from [Chi kuk 2007]")

6.1.6 Scatterplot: axis

base R

plot(homo$s.duration.ms, homo$vowel.duration.ms,
     xlab = "duration of [s] in ms", ylab = "vowel duration in ms")

dplyr, ggplot2

homo %>%
  ggplot(aes(s.duration.ms, vowel.duration.ms)) +
  geom_point()+
  xlab("duration of [s] in ms")+
  ylab("vowel duration in ms")

6.1.7 Log scales

Let's use the frequency dictionary for Russian

freq <- read.csv("https://goo.gl/TlX7xW", sep = "\t")
freq %>%
  arrange(desc(Freq.ipm.)) %>% 
  slice(1:200) %>% 
  ggplot(aes(Rank, Freq.ipm.)) +
  geom_point() +
  xlab("") +
  ylab("ipm")

base R

# Word frequency against row position, both axes on a log scale.
# seq_along() derives the x positions from the data instead of hard-coding
# the number of rows in the dictionary.
plot(seq_along(freq$Freq.ipm.), freq$Freq.ipm.,
  xlab = NA, ylab = "ipm",
  las = 1,
  log = "yx")

dplyr, ggplot2

# ggplot2 counterpart of the base R log-log plot. seq_along() derives the x
# positions from the data instead of hard-coding the number of rows, and both
# axes are log-scaled to match log = "yx" in the base R version.
freq %>%
  ggplot(aes(seq_along(Freq.ipm.), Freq.ipm.)) +
  geom_point() +
  xlab("") +
  ylab("ipm") +
  scale_x_log10() +
  scale_y_log10()

6.1.8 Scatterplot: rug

R base

plot(homo$s.duration.ms, homo$vowel.duration.ms)
rug(homo$s.duration.ms)
rug(homo$vowel.duration.ms, side = 2)

dplyr, ggplot2

homo %>%
  ggplot(aes(s.duration.ms, vowel.duration.ms, color = orientation)) +
  geom_point() +
  geom_rug()

homo %>%
  ggplot(aes(s.duration.ms, vowel.duration.ms, color = orientation)) +
  geom_point() +
  geom_rug()

6.1.9 Scatterplot: lines

R base

plot(homo$s.duration.ms, homo$vowel.duration.ms)
abline(h = 120, v = 60)

dplyr, ggplot2

homo %>%
  ggplot(aes(s.duration.ms, vowel.duration.ms)) +
  geom_point() +
  geom_hline(yintercept = mean(homo$vowel.duration.ms))+
  geom_vline(xintercept = 60)

R base

plot(homo$s.duration.ms, homo$vowel.duration.ms)
abline(h = 120, lty = 2)
abline(v = 60, lwd = 42)

dplyr, ggplot2

homo %>%
  ggplot(aes(s.duration.ms, vowel.duration.ms)) +
  geom_point() +
  geom_hline(yintercept = 120, linetype = 2)+
  geom_vline(xintercept = 60, size = 5)

R base

plot(homo$s.duration.ms, homo$vowel.duration.ms)
abline(h = 120, lty = 4)
abline(v = 60, col = "blue")

dplyr, ggplot2

homo %>%
  ggplot(aes(s.duration.ms, vowel.duration.ms)) +
  geom_point() +
  geom_hline(yintercept = 120, linetype = 4)+
  geom_vline(xintercept = 60, color = "blue")

6.1.10 Scatterplot: annotate

The function annotate adds geoms to a plot.

homo %>%
  ggplot(aes(s.duration.ms, vowel.duration.ms)) +
  geom_point()+
  annotate(geom = "rect", xmin = 77, xmax = 79,
             ymin = 117, ymax = 122, fill = "red", alpha = 0.2) + 
  annotate(geom = "text", x = 78, y = 125,
             label = "Who is that?\n Outlier?")

6.2.1 Barplots: basics

There are two possible situations:

not aggregate data

head(homo[, c(1, 9)])

speaker	orientation
A	homo
B	homo
C	hetero
D	hetero
E	hetero
F	hetero

aggregate data

head(homo[, c(1, 10)])

speaker	age
A	30
B	19
C	29
D	36
E	27
F	33

Not aggregate data

base R

barplot(table(homo$orientation))

dplyr, ggplot2

homo %>%
  ggplot(aes(orientation)) +
  geom_bar()

Aggregate data

base R

barplot(homo$age, names.arg = homo$speaker)

dplyr, ggplot2

homo %>%
  ggplot(aes(speaker, age)) +
  geom_bar(stat = "identity")

6.2.2 Barplots: color

base R

barplot(homo$age, names.arg = homo$speaker,
  col = homo$orientation)

* dplyr, ggplot2

homo %>%
  ggplot(aes(speaker, age, fill = orientation)) +
  geom_bar(stat = "identity")

6.3.1 Boxplots: basics

base R

boxplot(homo$s.duration.ms~homo$orientation)

* dplyr, ggplot2

homo %>%
  ggplot(aes(orientation, s.duration.ms)) +
  geom_boxplot()

6.3.2 Boxplots: points

base R

# Boxplot per orientation with the raw observations drawn on top.
boxplot(homo$s.duration.ms ~ homo$orientation)
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
stripchart(homo$s.duration.ms ~ homo$orientation,
  pch = 1, vertical = TRUE, add = TRUE)

* dplyr, ggplot2

homo %>%
  ggplot(aes(orientation, s.duration.ms)) +
  geom_boxplot()+
  geom_point()

6.3.3 Boxplots: jitter

base R

# Boxplot per orientation with jittered raw observations drawn on top.
boxplot(homo$s.duration.ms ~ homo$orientation)
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
stripchart(homo$s.duration.ms ~ homo$orientation,
  pch = 1, vertical = TRUE, add = TRUE, method = "jitter")

* dplyr, ggplot2

homo %>%
  ggplot(aes(orientation, s.duration.ms)) +
  geom_boxplot() +
  geom_jitter(width = 0.5)

6.3.4 Violin plots

base R: there is a horrible package, vioplot

dplyr, ggplot2

homo %>%
  ggplot(aes(orientation, s.duration.ms)) +
  geom_violin() +
  geom_jitter()

6. Preliminary summary: two variables

scatterplot: two quantitative variables
barplot: a nominal variable and one number
boxplot: a nominal variable and a quantitative variable
jittered points or sized points: two nominal variables

mtcars %>% 
  mutate(newvar = mpg > 22,
         newvr = mpg < 17) %>% 
  ggplot(aes(newvr, newvar, color = newvar))+
  geom_jitter(width = 0.2)

# Count the cars in every newvar/newvr combination and draw each count as a
# sized, labelled point.
mtcars %>%
  mutate(newvar = mpg > 22,
         newvr = mpg < 17) %>%
  group_by(newvar, newvr) %>%
  summarise(number = n()) %>%
  ggplot(aes(newvr, newvar, label = number)) +
  geom_point(aes(size = number, color = newvar)) +
  geom_text() +
  scale_size(range = c(10, 30)) +
  # Hide the size legend. "none" is used instead of F, which is an ordinary
  # variable that can be reassigned.
  guides(size = "none")

6.6.1 Histogram: basics

base R

hist(homo$s.duration.ms)

* dplyr, ggplot2

homo %>%
  ggplot(aes(s.duration.ms)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

How many histogram bins do we need?

[Sturges 1926] nclass.Sturges(homo$s.duration.ms)
[Scott 1979] nclass.scott(homo$s.duration.ms)
[Freedman, Diaconis 1981] nclass.FD(homo$s.duration.ms)

base R

hist(homo$s.duration.ms,
     breaks = nclass.FD(homo$s.duration.ms))

* dplyr, ggplot2

homo %>%
  ggplot(aes(s.duration.ms)) +
  geom_histogram(bins = nclass.FD(homo$s.duration.ms))

6.6.2 Histogram: color

base R

hist(homo$s.duration.ms, col = "lightblue")

* dplyr, ggplot2

homo %>%
  ggplot(aes(s.duration.ms)) +
  geom_histogram(fill = "lightblue")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

6.7 Facets

Faceting is the most powerful ggplot2 tool: it allows you to split up your data by one or more variables and plot the subsets of data together.

`facet_wrap()`

homo %>% 
  ggplot(aes(speaker, s.duration.ms))+
  geom_point() + 
  facet_wrap(~orientation)

You can see that all speakers appear on both graphs, but only some of them have a point. This is because, by default, all facets share the same scales.

homo %>% 
  ggplot(aes(speaker, s.duration.ms))+
  geom_point() + 
  facet_wrap(~orientation, scales = "free")

It is possible to make only one scale “free”:

homo %>% 
  ggplot(aes(speaker, s.duration.ms))+
  geom_point() + 
  facet_wrap(~orientation, scales = "free_x")

`facet_grid()`

It is possible to facet using more than one variable.

homo %>% 
  mutate(older_then_28 = ifelse(age > 28, "older", "younger")) %>% 
  ggplot(aes(speaker, s.duration.ms))+
  geom_point() + 
  facet_wrap(older_then_28~orientation, scales = "free_x")

homo %>% 
  mutate(older_then_28 = ifelse(age > 28, "older", "younger")) %>% 
  ggplot(aes(speaker, s.duration.ms))+
  geom_point() + 
  facet_grid(older_then_28~orientation, scales = "free_x")

There is a nice argument margins.

homo %>% 
  mutate(older_then_28 = ifelse(age > 28, "older", "younger")) %>% 
  ggplot(aes(speaker, s.duration.ms))+
  geom_point() + 
  facet_grid(older_then_28~orientation, scales = "free_x", margins = TRUE)

Sometimes it is nice to put your data in all facets:

homo %>%
  ggplot(aes(speaker, s.duration.ms)) +
  # Background layer: the same data without the faceting variable, so these
  # grey points are drawn in every panel. The column is dropped by name rather
  # than by position, so the layer keeps working if the column order changes.
  geom_point(data = select(homo, -orientation), color = "grey") +
  geom_point() +
  facet_wrap(~orientation)