02.02.2017
c(42, 99, 43)## [1] 42 99 43
matrix(1:12, nrow=3,ncol=4)## [,1] [,2] [,3] [,4]
## [1,] 1 4 7 10
## [2,] 2 5 8 11
## [3,] 3 6 9 12
list(
n_lectures = 12,
l_topics = c("begining", "data manipulation", "descriptive stats"),
h_topics = c("data manipulations", "descriptive stats")
)## $n_lectures
## [1] 12
##
## $l_topics
## [1] "begining" "data manipulation" "descriptive stats"
##
## $h_topics
## [1] "data manipulations" "descriptive stats"
data.frame(
names = c("Olya", "Ilya", "Sasha", "George"),
lecturer = c(TRUE, TRUE, FALSE, TRUE),
lecturer_experience = c(19, 6, 0, 3)
)| names | lecturer | lecturer_experience |
|---|---|---|
| Olya | TRUE | 19 |
| Ilya | TRUE | 6 |
| Sasha | FALSE | 0 |
| George | TRUE | 3 |
R ships with several built-in data frames (e.g. mtcars, cars, iris). How many rows and columns do they have?
nrow(iris) # returns the number of rows## [1] 150
ncol(mtcars) # returns the number of columns## [1] 11
head(cars) # returns the first 6 rows| speed | dist |
|---|---|
| 4 | 2 |
| 4 | 10 |
| 7 | 4 |
| 7 | 22 |
| 8 | 16 |
| 9 | 10 |
head(cars, 4) # returns the first 4 rows| speed | dist |
|---|---|
| 4 | 2 |
| 4 | 10 |
| 7 | 4 |
| 7 | 22 |
tail(cars) # returns the last 6 rows| speed | dist | |
|---|---|---|
| 45 | 23 | 54 |
| 46 | 24 | 70 |
| 47 | 24 | 92 |
| 48 | 24 | 93 |
| 49 | 24 | 120 |
| 50 | 25 | 85 |
summary(cars) # produce some stats## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
str(cars) # shows the structure: variables, their type## 'data.frame': 50 obs. of 2 variables:
## $ speed: num 4 4 7 7 8 9 10 10 10 11 ...
## $ dist : num 2 10 4 22 16 10 18 26 34 17 ...
#View(cars)mtcars$mpg # shows the mpg vector## [1] 21.0 21.0 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 17.8 16.4 17.3 15.2
## [15] 10.4 10.4 14.7 32.4 30.4 33.9 21.5 15.5 15.2 13.3 19.2 27.3 26.0 30.4
## [29] 15.8 19.7 15.0 21.4
mtcars[3,7] # shows the 3. row, 7. column## [1] 18.61
mtcars[3,] # shows the 3. row| mpg | cyl | disp | hp | drat | wt | qsec | vs | am | gear | carb | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Datsun 710 | 22.8 | 4 | 108 | 93 | 3.85 | 2.32 | 18.61 | 1 | 1 | 4 | 1 |
mtcars[,7] # shows the 7. column## [1] 16.46 17.02 18.61 19.44 17.02 20.22 15.84 20.00 22.90 18.30 18.90
## [12] 17.40 17.60 18.00 17.98 17.82 17.42 19.47 18.52 19.90 20.01 16.87
## [23] 17.30 15.41 17.05 18.90 16.70 16.90 14.50 15.50 14.60 18.60
mtcars[mtcars$mpg < 20, ] # show all rows with the mpg value lower then 20| mpg | cyl | disp | hp | drat | wt | qsec | vs | am | gear | carb | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Hornet Sportabout | 18.7 | 8 | 360.0 | 175 | 3.15 | 3.440 | 17.02 | 0 | 0 | 3 | 2 |
| Valiant | 18.1 | 6 | 225.0 | 105 | 2.76 | 3.460 | 20.22 | 1 | 0 | 3 | 1 |
| Duster 360 | 14.3 | 8 | 360.0 | 245 | 3.21 | 3.570 | 15.84 | 0 | 0 | 3 | 4 |
| Merc 280 | 19.2 | 6 | 167.6 | 123 | 3.92 | 3.440 | 18.30 | 1 | 0 | 4 | 4 |
| Merc 280C | 17.8 | 6 | 167.6 | 123 | 3.92 | 3.440 | 18.90 | 1 | 0 | 4 | 4 |
| Merc 450SE | 16.4 | 8 | 275.8 | 180 | 3.07 | 4.070 | 17.40 | 0 | 0 | 3 | 3 |
| Merc 450SL | 17.3 | 8 | 275.8 | 180 | 3.07 | 3.730 | 17.60 | 0 | 0 | 3 | 3 |
| Merc 450SLC | 15.2 | 8 | 275.8 | 180 | 3.07 | 3.780 | 18.00 | 0 | 0 | 3 | 3 |
| Cadillac Fleetwood | 10.4 | 8 | 472.0 | 205 | 2.93 | 5.250 | 17.98 | 0 | 0 | 3 | 4 |
| Lincoln Continental | 10.4 | 8 | 460.0 | 215 | 3.00 | 5.424 | 17.82 | 0 | 0 | 3 | 4 |
| Chrysler Imperial | 14.7 | 8 | 440.0 | 230 | 3.23 | 5.345 | 17.42 | 0 | 0 | 3 | 4 |
| Dodge Challenger | 15.5 | 8 | 318.0 | 150 | 2.76 | 3.520 | 16.87 | 0 | 0 | 3 | 2 |
| AMC Javelin | 15.2 | 8 | 304.0 | 150 | 3.15 | 3.435 | 17.30 | 0 | 0 | 3 | 2 |
| Camaro Z28 | 13.3 | 8 | 350.0 | 245 | 3.73 | 3.840 | 15.41 | 0 | 0 | 3 | 4 |
| Pontiac Firebird | 19.2 | 8 | 400.0 | 175 | 3.08 | 3.845 | 17.05 | 0 | 0 | 3 | 2 |
| Ford Pantera L | 15.8 | 8 | 351.0 | 264 | 4.22 | 3.170 | 14.50 | 0 | 1 | 5 | 4 |
| Ferrari Dino | 19.7 | 6 | 145.0 | 175 | 3.62 | 2.770 | 15.50 | 0 | 1 | 5 | 6 |
| Maserati Bora | 15.0 | 8 | 301.0 | 335 | 3.54 | 3.570 | 14.60 | 0 | 1 | 5 | 8 |
# ?mtcars
The tidyverse is a set of packages:
Install the tidyverse package using install.packages("tidyverse")
Load the tidyverse package:
library(tidyverse)
In this presentation the following versions of tidyverse, dplyr and ggplot2 are used:
packageVersion("tidyverse")## [1] '1.1.1'
packageVersion("dplyr")## [1] '0.7.2'
packageVersion("ggplot2")## [1] '2.2.1'
my_data_1 <- mtcars[5:20,] # select a subset of mtcars
my_data_2 <- mtcars[17:29,] # select a subset of mtcars
combine_rows <- rbind.data.frame(my_data_1, my_data_2)
nrow(my_data_1); nrow(my_data_2); nrow(combine_rows)## [1] 16
## [1] 13
## [1] 29
my_data_3 <- mtcars[,3:7] # select a subset of mtcars
my_data_4 <- mtcars[,6:11] # select a subset of mtcars
combine_cols <- cbind.data.frame(my_data_3, my_data_4)
ncol(my_data_3); ncol(my_data_4); ncol(combine_cols)## [1] 5
## [1] 6
## [1] 11
languages <- data.frame(
languages = c("Selkup", "French", "Chukchi", "Kashubian"),
countries = c("Russia", "France", "Russia", "Poland"),
iso = c("sel", "fra", "ckt", "pol")
)
languages| languages | countries | iso |
|---|---|---|
| Selkup | Russia | sel |
| French | France | fra |
| Chukchi | Russia | ckt |
| Kashubian | Poland | pol |
country_population <- data.frame(
countries = c("Russia", "Poland", "Finland"),
population_mln = c(143, 38, 5))
country_population| countries | population_mln |
|---|---|
| Russia | 143 |
| Poland | 38 |
| Finland | 5 |
inner_join(languages, country_population)## Joining, by = "countries"
## Warning: Column `countries` joining factors with different levels, coercing
## to character vector
| languages | countries | iso | population_mln |
|---|---|---|---|
| Selkup | Russia | sel | 143 |
| Chukchi | Russia | ckt | 143 |
| Kashubian | Poland | pol | 38 |
left_join(languages, country_population)## Joining, by = "countries"
## Warning: Column `countries` joining factors with different levels, coercing
## to character vector
| languages | countries | iso | population_mln |
|---|---|---|---|
| Selkup | Russia | sel | 143 |
| French | France | fra | NA |
| Chukchi | Russia | ckt | 143 |
| Kashubian | Poland | pol | 38 |
right_join(languages, country_population)## Joining, by = "countries"
## Warning: Column `countries` joining factors with different levels, coercing
## to character vector
| languages | countries | iso | population_mln |
|---|---|---|---|
| Selkup | Russia | sel | 143 |
| Chukchi | Russia | ckt | 143 |
| Kashubian | Poland | pol | 38 |
| NA | Finland | NA | 5 |
anti_join(languages, country_population)## Joining, by = "countries"
## Warning: Column `countries` joining factors with different levels, coercing
## to character vector
| languages | countries | iso |
|---|---|---|
| French | France | fra |
anti_join(country_population, languages)## Joining, by = "countries"
## Warning: Column `countries` joining factors with different levels, coercing
## to character vector
| countries | population_mln |
|---|---|
| Finland | 5 |
full_join(country_population, languages)## Joining, by = "countries"
## Warning: Column `countries` joining factors with different levels, coercing
## to character vector
| countries | population_mln | languages | iso |
|---|---|---|---|
| Russia | 143 | Selkup | sel |
| Russia | 143 | Chukchi | ckt |
| Poland | 38 | Kashubian | pol |
| Finland | 5 | NA | NA |
| France | NA | French | fra |
The majority of examples in this presentation are based on [Chi-kuk 2007]. The experiment consisted of a perception and judgment test aimed at measuring the correlation between acoustic cues and perceived sexual orientation. Naïve Cantonese speakers were asked to listen to the Cantonese speech samples collected in the experiment and to judge whether the speakers were gay or heterosexual. There are 14 speakers and the following parameters:
Download data
homo <- read.csv("http://goo.gl/Zjr9aF")
homo| speaker | s.duration.ms | vowel.duration.ms | average.f0.Hz | f0.range.Hz | perceived.as.homo | perceived.as.hetero | perceived.as.homo.percent | orientation | age |
|---|---|---|---|---|---|---|---|---|---|
| A | 61.40 | 112.60 | 119.51 | 52.5 | 7 | 18 | 0.28 | hetero | 30 |
| B | 63.90 | 126.49 | 100.29 | 114.0 | 20 | 5 | 0.80 | hetero | 19 |
| C | 55.08 | 126.81 | 114.90 | 103.2 | 9 | 16 | 0.36 | homo | 29 |
| D | 78.11 | 119.17 | 126.61 | 58.8 | 15 | 10 | 0.60 | homo | 36 |
| E | 64.71 | 93.68 | 130.76 | 37.4 | 10 | 15 | 0.40 | homo | 27 |
| F | 67.00 | 127.87 | 150.79 | 42.0 | 17 | 8 | 0.68 | homo | 33 |
| G | 65.39 | 147.52 | 128.96 | 118.2 | 20 | 5 | 0.80 | hetero | 28 |
| H | 62.46 | 120.13 | 105.26 | 55.7 | 21 | 4 | 0.84 | hetero | 22 |
| I | 60.45 | 140.44 | 109.86 | 96.4 | 20 | 5 | 0.80 | homo | 22 |
| J | 59.59 | 121.01 | 123.90 | 111.7 | 8 | 17 | 0.32 | homo | 40 |
| K | 62.94 | 137.37 | 119.48 | 87.6 | 21 | 4 | 0.84 | homo | 30 |
| L | 53.31 | 112.05 | 146.20 | 57.8 | 8 | 17 | 0.32 | hetero | 25 |
| M | 45.13 | 133.74 | 155.34 | 100.5 | 9 | 16 | 0.36 | hetero | 20 |
| N | 57.67 | 118.02 | 121.48 | 37.4 | 4 | 21 | 0.16 | hetero | 29 |
A tibble is a useful modification of the data frame.
library(tidyverse)
homo <- tbl_df(homo)
homo| speaker | s.duration.ms | vowel.duration.ms | average.f0.Hz | f0.range.Hz | perceived.as.homo | perceived.as.hetero | perceived.as.homo.percent | orientation | age |
|---|---|---|---|---|---|---|---|---|---|
| A | 61.40 | 112.60 | 119.51 | 52.5 | 7 | 18 | 0.28 | hetero | 30 |
| B | 63.90 | 126.49 | 100.29 | 114.0 | 20 | 5 | 0.80 | hetero | 19 |
| C | 55.08 | 126.81 | 114.90 | 103.2 | 9 | 16 | 0.36 | homo | 29 |
| D | 78.11 | 119.17 | 126.61 | 58.8 | 15 | 10 | 0.60 | homo | 36 |
| E | 64.71 | 93.68 | 130.76 | 37.4 | 10 | 15 | 0.40 | homo | 27 |
| F | 67.00 | 127.87 | 150.79 | 42.0 | 17 | 8 | 0.68 | homo | 33 |
| G | 65.39 | 147.52 | 128.96 | 118.2 | 20 | 5 | 0.80 | hetero | 28 |
| H | 62.46 | 120.13 | 105.26 | 55.7 | 21 | 4 | 0.84 | hetero | 22 |
| I | 60.45 | 140.44 | 109.86 | 96.4 | 20 | 5 | 0.80 | homo | 22 |
| J | 59.59 | 121.01 | 123.90 | 111.7 | 8 | 17 | 0.32 | homo | 40 |
| K | 62.94 | 137.37 | 119.48 | 87.6 | 21 | 4 | 0.84 | homo | 30 |
| L | 53.31 | 112.05 | 146.20 | 57.8 | 8 | 17 | 0.32 | hetero | 25 |
| M | 45.13 | 133.74 | 155.34 | 100.5 | 9 | 16 | 0.36 | hetero | 20 |
| N | 57.67 | 118.02 | 121.48 | 37.4 | 4 | 21 | 0.16 | hetero | 29 |
How many speakers are older than 28?
homo[homo$age > 28,]| speaker | s.duration.ms | vowel.duration.ms | average.f0.Hz | f0.range.Hz | perceived.as.homo | perceived.as.hetero | perceived.as.homo.percent | orientation | age |
|---|---|---|---|---|---|---|---|---|---|
| A | 61.40 | 112.60 | 119.51 | 52.5 | 7 | 18 | 0.28 | hetero | 30 |
| C | 55.08 | 126.81 | 114.90 | 103.2 | 9 | 16 | 0.36 | homo | 29 |
| D | 78.11 | 119.17 | 126.61 | 58.8 | 15 | 10 | 0.60 | homo | 36 |
| F | 67.00 | 127.87 | 150.79 | 42.0 | 17 | 8 | 0.68 | homo | 33 |
| J | 59.59 | 121.01 | 123.90 | 111.7 | 8 | 17 | 0.32 | homo | 40 |
| K | 62.94 | 137.37 | 119.48 | 87.6 | 21 | 4 | 0.84 | homo | 30 |
| N | 57.67 | 118.02 | 121.48 | 37.4 | 4 | 21 | 0.16 | hetero | 29 |
homo %>%
filter(age > 28, s.duration.ms < 60)| speaker | s.duration.ms | vowel.duration.ms | average.f0.Hz | f0.range.Hz | perceived.as.homo | perceived.as.hetero | perceived.as.homo.percent | orientation | age |
|---|---|---|---|---|---|---|---|---|---|
| C | 55.08 | 126.81 | 114.90 | 103.2 | 9 | 16 | 0.36 | homo | 29 |
| J | 59.59 | 121.01 | 123.90 | 111.7 | 8 | 17 | 0.32 | homo | 40 |
| N | 57.67 | 118.02 | 121.48 | 37.4 | 4 | 21 | 0.16 | hetero | 29 |
%>% is called the pipe. A pipe is a technique for passing the result of one function on to another.
sort(sqrt(abs(sin(1:22))), decreasing = TRUE)## [1] 0.9999951 0.9952926 0.9946649 0.9805088 0.9792468 0.9554817 0.9535709
## [8] 0.9173173 0.9146888 0.8699440 0.8665952 0.8105471 0.8064043 0.7375779
## [15] 0.7325114 0.6482029 0.6419646 0.5365662 0.5285977 0.3871398 0.3756594
## [22] 0.0940814
1:22 %>%
sin() %>%
abs() %>%
sqrt() %>%
sort(., decreasing = TRUE) # dot here shows where should argument be## [1] 0.9999951 0.9952926 0.9946649 0.9805088 0.9792468 0.9554817 0.9535709
## [8] 0.9173173 0.9146888 0.8699440 0.8665952 0.8105471 0.8064043 0.7375779
## [15] 0.7325114 0.6482029 0.6419646 0.5365662 0.5285977 0.3871398 0.3756594
## [22] 0.0940814
Pipes in the tidyverse come from the magrittr package. Sometimes they work incorrectly with non-tidyverse functions.
homo[3:7, ]| speaker | s.duration.ms | vowel.duration.ms | average.f0.Hz | f0.range.Hz | perceived.as.homo | perceived.as.hetero | perceived.as.homo.percent | orientation | age |
|---|---|---|---|---|---|---|---|---|---|
| C | 55.08 | 126.81 | 114.90 | 103.2 | 9 | 16 | 0.36 | homo | 29 |
| D | 78.11 | 119.17 | 126.61 | 58.8 | 15 | 10 | 0.60 | homo | 36 |
| E | 64.71 | 93.68 | 130.76 | 37.4 | 10 | 15 | 0.40 | homo | 27 |
| F | 67.00 | 127.87 | 150.79 | 42.0 | 17 | 8 | 0.68 | homo | 33 |
| G | 65.39 | 147.52 | 128.96 | 118.2 | 20 | 5 | 0.80 | hetero | 28 |
homo %>%
slice(3:7)| speaker | s.duration.ms | vowel.duration.ms | average.f0.Hz | f0.range.Hz | perceived.as.homo | perceived.as.hetero | perceived.as.homo.percent | orientation | age |
|---|---|---|---|---|---|---|---|---|---|
| C | 55.08 | 126.81 | 114.90 | 103.2 | 9 | 16 | 0.36 | homo | 29 |
| D | 78.11 | 119.17 | 126.61 | 58.8 | 15 | 10 | 0.60 | homo | 36 |
| E | 64.71 | 93.68 | 130.76 | 37.4 | 10 | 15 | 0.40 | homo | 27 |
| F | 67.00 | 127.87 | 150.79 | 42.0 | 17 | 8 | 0.68 | homo | 33 |
| G | 65.39 | 147.52 | 128.96 | 118.2 | 20 | 5 | 0.80 | hetero | 28 |
homo[, 8:10]| perceived.as.homo.percent | orientation | age |
|---|---|---|
| 0.28 | hetero | 30 |
| 0.80 | hetero | 19 |
| 0.36 | homo | 29 |
| 0.60 | homo | 36 |
| 0.40 | homo | 27 |
| 0.68 | homo | 33 |
| 0.80 | hetero | 28 |
| 0.84 | hetero | 22 |
| 0.80 | homo | 22 |
| 0.32 | homo | 40 |
| 0.84 | homo | 30 |
| 0.32 | hetero | 25 |
| 0.36 | hetero | 20 |
| 0.16 | hetero | 29 |
homo %>%
select(8:10)| perceived.as.homo.percent | orientation | age |
|---|---|---|
| 0.28 | hetero | 30 |
| 0.80 | hetero | 19 |
| 0.36 | homo | 29 |
| 0.60 | homo | 36 |
| 0.40 | homo | 27 |
| 0.68 | homo | 33 |
| 0.80 | hetero | 28 |
| 0.84 | hetero | 22 |
| 0.80 | homo | 22 |
| 0.32 | homo | 40 |
| 0.84 | homo | 30 |
| 0.32 | hetero | 25 |
| 0.36 | hetero | 20 |
| 0.16 | hetero | 29 |
homo %>%
select(speaker:average.f0.Hz)| speaker | s.duration.ms | vowel.duration.ms | average.f0.Hz |
|---|---|---|---|
| A | 61.40 | 112.60 | 119.51 |
| B | 63.90 | 126.49 | 100.29 |
| C | 55.08 | 126.81 | 114.90 |
| D | 78.11 | 119.17 | 126.61 |
| E | 64.71 | 93.68 | 130.76 |
| F | 67.00 | 127.87 | 150.79 |
| G | 65.39 | 147.52 | 128.96 |
| H | 62.46 | 120.13 | 105.26 |
| I | 60.45 | 140.44 | 109.86 |
| J | 59.59 | 121.01 | 123.90 |
| K | 62.94 | 137.37 | 119.48 |
| L | 53.31 | 112.05 | 146.20 |
| M | 45.13 | 133.74 | 155.34 |
| N | 57.67 | 118.02 | 121.48 |
It is possible to use select() function to remove columns:
homo %>%
select(-c(speaker, perceived.as.hetero, perceived.as.homo, perceived.as.homo.percent))| s.duration.ms | vowel.duration.ms | average.f0.Hz | f0.range.Hz | orientation | age |
|---|---|---|---|---|---|
| 61.40 | 112.60 | 119.51 | 52.5 | hetero | 30 |
| 63.90 | 126.49 | 100.29 | 114.0 | hetero | 19 |
| 55.08 | 126.81 | 114.90 | 103.2 | homo | 29 |
| 78.11 | 119.17 | 126.61 | 58.8 | homo | 36 |
| 64.71 | 93.68 | 130.76 | 37.4 | homo | 27 |
| 67.00 | 127.87 | 150.79 | 42.0 | homo | 33 |
| 65.39 | 147.52 | 128.96 | 118.2 | hetero | 28 |
| 62.46 | 120.13 | 105.26 | 55.7 | hetero | 22 |
| 60.45 | 140.44 | 109.86 | 96.4 | homo | 22 |
| 59.59 | 121.01 | 123.90 | 111.7 | homo | 40 |
| 62.94 | 137.37 | 119.48 | 87.6 | homo | 30 |
| 53.31 | 112.05 | 146.20 | 57.8 | hetero | 25 |
| 45.13 | 133.74 | 155.34 | 100.5 | hetero | 20 |
| 57.67 | 118.02 | 121.48 | 37.4 | hetero | 29 |
# When you want to remove one column you can write it without
# c() function, e. g. -speaker
It is possible to reorder columns using the select() function:
homo %>%
select(speaker, age, s.duration.ms)| speaker | age | s.duration.ms |
|---|---|---|
| A | 30 | 61.40 |
| B | 19 | 63.90 |
| C | 29 | 55.08 |
| D | 36 | 78.11 |
| E | 27 | 64.71 |
| F | 33 | 67.00 |
| G | 28 | 65.39 |
| H | 22 | 62.46 |
| I | 22 | 60.45 |
| J | 40 | 59.59 |
| K | 30 | 62.94 |
| L | 25 | 53.31 |
| M | 20 | 45.13 |
| N | 29 | 57.67 |
homo[order(homo$orientation, homo$age), ]| speaker | s.duration.ms | vowel.duration.ms | average.f0.Hz | f0.range.Hz | perceived.as.homo | perceived.as.hetero | perceived.as.homo.percent | orientation | age |
|---|---|---|---|---|---|---|---|---|---|
| B | 63.90 | 126.49 | 100.29 | 114.0 | 20 | 5 | 0.80 | hetero | 19 |
| M | 45.13 | 133.74 | 155.34 | 100.5 | 9 | 16 | 0.36 | hetero | 20 |
| H | 62.46 | 120.13 | 105.26 | 55.7 | 21 | 4 | 0.84 | hetero | 22 |
| L | 53.31 | 112.05 | 146.20 | 57.8 | 8 | 17 | 0.32 | hetero | 25 |
| G | 65.39 | 147.52 | 128.96 | 118.2 | 20 | 5 | 0.80 | hetero | 28 |
| N | 57.67 | 118.02 | 121.48 | 37.4 | 4 | 21 | 0.16 | hetero | 29 |
| A | 61.40 | 112.60 | 119.51 | 52.5 | 7 | 18 | 0.28 | hetero | 30 |
| I | 60.45 | 140.44 | 109.86 | 96.4 | 20 | 5 | 0.80 | homo | 22 |
| E | 64.71 | 93.68 | 130.76 | 37.4 | 10 | 15 | 0.40 | homo | 27 |
| C | 55.08 | 126.81 | 114.90 | 103.2 | 9 | 16 | 0.36 | homo | 29 |
| K | 62.94 | 137.37 | 119.48 | 87.6 | 21 | 4 | 0.84 | homo | 30 |
| F | 67.00 | 127.87 | 150.79 | 42.0 | 17 | 8 | 0.68 | homo | 33 |
| D | 78.11 | 119.17 | 126.61 | 58.8 | 15 | 10 | 0.60 | homo | 36 |
| J | 59.59 | 121.01 | 123.90 | 111.7 | 8 | 17 | 0.32 | homo | 40 |
homo %>%
arrange(orientation, desc(age))| speaker | s.duration.ms | vowel.duration.ms | average.f0.Hz | f0.range.Hz | perceived.as.homo | perceived.as.hetero | perceived.as.homo.percent | orientation | age |
|---|---|---|---|---|---|---|---|---|---|
| A | 61.40 | 112.60 | 119.51 | 52.5 | 7 | 18 | 0.28 | hetero | 30 |
| N | 57.67 | 118.02 | 121.48 | 37.4 | 4 | 21 | 0.16 | hetero | 29 |
| G | 65.39 | 147.52 | 128.96 | 118.2 | 20 | 5 | 0.80 | hetero | 28 |
| L | 53.31 | 112.05 | 146.20 | 57.8 | 8 | 17 | 0.32 | hetero | 25 |
| H | 62.46 | 120.13 | 105.26 | 55.7 | 21 | 4 | 0.84 | hetero | 22 |
| M | 45.13 | 133.74 | 155.34 | 100.5 | 9 | 16 | 0.36 | hetero | 20 |
| B | 63.90 | 126.49 | 100.29 | 114.0 | 20 | 5 | 0.80 | hetero | 19 |
| J | 59.59 | 121.01 | 123.90 | 111.7 | 8 | 17 | 0.32 | homo | 40 |
| D | 78.11 | 119.17 | 126.61 | 58.8 | 15 | 10 | 0.60 | homo | 36 |
| F | 67.00 | 127.87 | 150.79 | 42.0 | 17 | 8 | 0.68 | homo | 33 |
| K | 62.94 | 137.37 | 119.48 | 87.6 | 21 | 4 | 0.84 | homo | 30 |
| C | 55.08 | 126.81 | 114.90 | 103.2 | 9 | 16 | 0.36 | homo | 29 |
| E | 64.71 | 93.68 | 130.76 | 37.4 | 10 | 15 | 0.40 | homo | 27 |
| I | 60.45 | 140.44 | 109.86 | 96.4 | 20 | 5 | 0.80 | homo | 22 |
unique(homo$orientation)## [1] hetero homo
## Levels: hetero homo
homo %>%
distinct(orientation, age > 20)| orientation | age > 20 |
|---|---|
| hetero | TRUE |
| hetero | FALSE |
| homo | TRUE |
unique(homo[c("orientation", "perceived.as.homo")])| orientation | perceived.as.homo |
|---|---|
| hetero | 7 |
| hetero | 20 |
| homo | 9 |
| homo | 15 |
| homo | 10 |
| homo | 17 |
| hetero | 21 |
| homo | 20 |
| homo | 8 |
| homo | 21 |
| hetero | 8 |
| hetero | 9 |
| hetero | 4 |
homo %>%
distinct(orientation, perceived.as.homo)| perceived.as.homo | orientation |
|---|---|
| 7 | hetero |
| 20 | hetero |
| 9 | homo |
| 15 | homo |
| 10 | homo |
| 17 | homo |
| 21 | hetero |
| 20 | homo |
| 8 | homo |
| 21 | homo |
| 8 | hetero |
| 9 | hetero |
| 4 | hetero |
homo$f0.min <- homo$average.f0.Hz - homo$f0.range.Hz/2
homo$f0.min## [1] 93.26 43.29 63.30 97.21 112.06 129.79 69.86 77.41 61.66 68.05
## [11] 75.68 117.30 105.09 102.78
homo$f0.max <- homo$average.f0.Hz + homo$f0.range.Hz/2
homo$f0.max## [1] 145.76 157.29 166.50 156.01 149.46 171.79 188.06 133.11 158.06 179.75
## [11] 163.28 175.10 205.59 140.18
homo %>%
mutate( f0.mn = average.f0.Hz - f0.range.Hz/2,
f0.mx = (average.f0.Hz + f0.range.Hz/2)) ->
homo
homo | speaker | s.duration.ms | vowel.duration.ms | average.f0.Hz | f0.range.Hz | perceived.as.homo | perceived.as.hetero | perceived.as.homo.percent | orientation | age | f0.min | f0.max | f0.mn | f0.mx |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| A | 61.40 | 112.60 | 119.51 | 52.5 | 7 | 18 | 0.28 | hetero | 30 | 93.26 | 145.76 | 93.26 | 145.76 |
| B | 63.90 | 126.49 | 100.29 | 114.0 | 20 | 5 | 0.80 | hetero | 19 | 43.29 | 157.29 | 43.29 | 157.29 |
| C | 55.08 | 126.81 | 114.90 | 103.2 | 9 | 16 | 0.36 | homo | 29 | 63.30 | 166.50 | 63.30 | 166.50 |
| D | 78.11 | 119.17 | 126.61 | 58.8 | 15 | 10 | 0.60 | homo | 36 | 97.21 | 156.01 | 97.21 | 156.01 |
| E | 64.71 | 93.68 | 130.76 | 37.4 | 10 | 15 | 0.40 | homo | 27 | 112.06 | 149.46 | 112.06 | 149.46 |
| F | 67.00 | 127.87 | 150.79 | 42.0 | 17 | 8 | 0.68 | homo | 33 | 129.79 | 171.79 | 129.79 | 171.79 |
| G | 65.39 | 147.52 | 128.96 | 118.2 | 20 | 5 | 0.80 | hetero | 28 | 69.86 | 188.06 | 69.86 | 188.06 |
| H | 62.46 | 120.13 | 105.26 | 55.7 | 21 | 4 | 0.84 | hetero | 22 | 77.41 | 133.11 | 77.41 | 133.11 |
| I | 60.45 | 140.44 | 109.86 | 96.4 | 20 | 5 | 0.80 | homo | 22 | 61.66 | 158.06 | 61.66 | 158.06 |
| J | 59.59 | 121.01 | 123.90 | 111.7 | 8 | 17 | 0.32 | homo | 40 | 68.05 | 179.75 | 68.05 | 179.75 |
| K | 62.94 | 137.37 | 119.48 | 87.6 | 21 | 4 | 0.84 | homo | 30 | 75.68 | 163.28 | 75.68 | 163.28 |
| L | 53.31 | 112.05 | 146.20 | 57.8 | 8 | 17 | 0.32 | hetero | 25 | 117.30 | 175.10 | 117.30 | 175.10 |
| M | 45.13 | 133.74 | 155.34 | 100.5 | 9 | 16 | 0.36 | hetero | 20 | 105.09 | 205.59 | 105.09 | 205.59 |
| N | 57.67 | 118.02 | 121.48 | 37.4 | 4 | 21 | 0.16 | hetero | 29 | 102.78 | 140.18 | 102.78 | 140.18 |
homo %>%
summarise(min(age), mean(s.duration.ms))| min(age) | mean(s.duration.ms) |
|---|---|
| 19 | 61.22429 |
homo %>%
group_by(orientation) %>%
summarise(my_mean = mean(s.duration.ms))| orientation | my_mean |
|---|---|
| hetero | 58.46571 |
| homo | 63.98286 |
homo %>%
group_by(orientation) %>%
summarise(mean(s.duration.ms))| orientation | mean(s.duration.ms) |
|---|---|
| hetero | 58.46571 |
| homo | 63.98286 |
homo %>%
group_by(orientation) %>%
summarise(mean_by_orientation = mean(s.duration.ms))| orientation | mean_by_orientation |
|---|---|
| hetero | 58.46571 |
| homo | 63.98286 |
df.short <- data.frame(
consonant = c("stops", "fricatives", "affricates", "nasals"),
initial = c(123, 87, 73, 7),
intervocal = c(57, 77, 82, 78),
final = c(30, 69, 12, 104))
df.short| consonant | initial | intervocal | final |
|---|---|---|---|
| stops | 123 | 57 | 30 |
| fricatives | 87 | 77 | 69 |
| affricates | 73 | 82 | 12 |
| nasals | 7 | 78 | 104 |
| consonant | position | number |
|---|---|---|
| stops | initial | 123 |
| fricatives | initial | 87 |
| affricates | initial | 73 |
| nasals | initial | 7 |
| stops | intervocal | 57 |
| fricatives | intervocal | 77 |
| affricates | intervocal | 82 |
| nasals | intervocal | 78 |
| stops | final | 30 |
| fricatives | final | 69 |
| affricates | final | 12 |
| nasals | final | 104 |
df.short <- data.frame(
consonant = c("stops", "fricatives", "affricates", "nasals"),
initial = c(123, 87, 73, 7),
intervocal = c(57, 77, 82, 78),
final = c(30, 69, 12, 104))
df.short| consonant | initial | intervocal | final |
|---|---|---|---|
| stops | 123 | 57 | 30 |
| fricatives | 87 | 77 | 69 |
| affricates | 73 | 82 | 12 |
| nasals | 7 | 78 | 104 |
df.short %>%
gather(position, number, initial:final) ->
df.long
df.long| consonant | position | number |
|---|---|---|
| stops | initial | 123 |
| fricatives | initial | 87 |
| affricates | initial | 73 |
| nasals | initial | 7 |
| stops | intervocal | 57 |
| fricatives | intervocal | 77 |
| affricates | intervocal | 82 |
| nasals | intervocal | 78 |
| stops | final | 30 |
| fricatives | final | 69 |
| affricates | final | 12 |
| nasals | final | 104 |
df.long %>%
spread(position, number) ->
df.short
df.short| consonant | final | initial | intervocal |
|---|---|---|---|
| affricates | 12 | 73 | 82 |
| fricatives | 69 | 87 | 77 |
| nasals | 104 | 7 | 78 |
| stops | 30 | 123 | 57 |
Anscombe, F. J. (1973), “Graphs in Statistical Analysis”, presented the following sets of data:
quartet <- read.csv("https://goo.gl/KuuzYy")
quartet| x | y | dataset |
|---|---|---|
| 10 | 8.04 | 1 |
| 8 | 6.95 | 1 |
| 13 | 7.58 | 1 |
| 9 | 8.81 | 1 |
| 11 | 8.33 | 1 |
| 14 | 9.96 | 1 |
| 6 | 7.24 | 1 |
| 4 | 4.26 | 1 |
| 12 | 10.84 | 1 |
| 7 | 4.82 | 1 |
| 5 | 5.68 | 1 |
| 10 | 9.14 | 2 |
| 8 | 8.14 | 2 |
| 13 | 8.74 | 2 |
| 9 | 8.77 | 2 |
| 11 | 9.26 | 2 |
| 14 | 8.10 | 2 |
| 6 | 6.13 | 2 |
| 4 | 3.10 | 2 |
| 12 | 9.13 | 2 |
| 7 | 7.26 | 2 |
| 5 | 4.74 | 2 |
| 10 | 7.46 | 3 |
| 8 | 6.77 | 3 |
| 13 | 12.74 | 3 |
| 9 | 7.11 | 3 |
| 11 | 7.81 | 3 |
| 14 | 8.84 | 3 |
| 6 | 6.08 | 3 |
| 4 | 5.39 | 3 |
| 12 | 8.15 | 3 |
| 7 | 6.42 | 3 |
| 5 | 5.73 | 3 |
| 8 | 6.58 | 4 |
| 8 | 5.76 | 4 |
| 8 | 7.71 | 4 |
| 8 | 8.84 | 4 |
| 8 | 8.47 | 4 |
| 8 | 7.04 | 4 |
| 8 | 5.25 | 4 |
| 19 | 12.50 | 4 |
| 8 | 5.56 | 4 |
| 8 | 7.91 | 4 |
| 8 | 6.89 | 4 |
quartet %>%
group_by(dataset) %>%
summarise(mean_X = mean(x),
mean_Y = mean(y),
sd_X = sd(x),
sd_Y = sd(y),
cor = cor(x, y),
n_obs = n()) %>%
select(-dataset) %>%
round(., 2)| mean_X | mean_Y | sd_X | sd_Y | cor | n_obs |
|---|---|---|---|---|---|
| 9 | 7.5 | 3.32 | 2.03 | 0.82 | 11 |
| 9 | 7.5 | 3.32 | 2.03 | 0.82 | 11 |
| 9 | 7.5 | 3.32 | 2.03 | 0.82 | 11 |
| 9 | 7.5 | 3.32 | 2.03 | 0.82 | 11 |

Matejka and Fitzmaurice (2017), “Same Stats, Different Graphs”, presented the following sets of data:
datasaurus <- read_tsv("https://goo.gl/gtaunr")
head(datasaurus)| dataset | x | y |
|---|---|---|
| dino | 55.3846 | 97.1795 |
| dino | 51.5385 | 96.0256 |
| dino | 46.1538 | 94.4872 |
| dino | 42.8205 | 91.4103 |
| dino | 40.7692 | 88.3333 |
| dino | 38.7179 | 84.8718 |

datasaurus %>%
group_by(dataset) %>%
summarise(mean_X = mean(x),
mean_Y = mean(y),
sd_X = sd(x),
sd_Y = sd(y),
cor = cor(x, y),
n_obs = n()) %>%
select(-dataset) %>%
round(., 1)| mean_X | mean_Y | sd_X | sd_Y | cor | n_obs |
|---|---|---|---|---|---|
| 54.3 | 47.8 | 16.8 | 26.9 | -0.1 | 142 |
| 54.3 | 47.8 | 16.8 | 26.9 | -0.1 | 142 |
| 54.3 | 47.8 | 16.8 | 26.9 | -0.1 | 142 |
| 54.3 | 47.8 | 16.8 | 26.9 | -0.1 | 142 |
| 54.3 | 47.8 | 16.8 | 26.9 | -0.1 | 142 |
| 54.3 | 47.8 | 16.8 | 26.9 | -0.1 | 142 |
| 54.3 | 47.8 | 16.8 | 26.9 | -0.1 | 142 |
| 54.3 | 47.8 | 16.8 | 26.9 | -0.1 | 142 |
| 54.3 | 47.8 | 16.8 | 26.9 | -0.1 | 142 |
| 54.3 | 47.8 | 16.8 | 26.9 | -0.1 | 142 |
| 54.3 | 47.8 | 16.8 | 26.9 | -0.1 | 142 |
| 54.3 | 47.8 | 16.8 | 26.9 | -0.1 | 142 |
| 54.3 | 47.8 | 16.8 | 26.9 | -0.1 | 142 |
plot(homo$s.duration.ms, homo$vowel.duration.ms)
ggplot(data = homo, aes(s.duration.ms, vowel.duration.ms)) +
geom_point()
homo %>%
ggplot(aes(average.f0.Hz, age))+
geom_smooth(method = "lm")+
geom_point(aes(color = orientation))
plot(homo$s.duration.ms, homo$vowel.duration.ms,
col = c("red", "blue")[homo$orientation])
homo %>%
ggplot(aes(s.duration.ms, vowel.duration.ms,
color = orientation)) +
geom_point()
plot(homo$s.duration.ms, homo$vowel.duration.ms,
pch = c(16, 17)[homo$orientation])
plot(1:25, pch = 1:25)
homo %>%
ggplot(aes(s.duration.ms, vowel.duration.ms,
shape = orientation)) +
geom_point(color = "green")
plot(homo$s.duration.ms, homo$vowel.duration.ms,
cex = homo$age/20)
plot(homo$s.duration.ms, homo$vowel.duration.ms,
cex = homo$age)
homo %>%
ggplot(aes(s.duration.ms, vowel.duration.ms,
size = age)) +
geom_point()
plot(homo$s.duration.ms, homo$vowel.duration.ms,
pch = c("⚤", "⚣")[homo$orientation])
# Relabel the orientation factor with symbols so they can be drawn as labels.
# The factor levels are ordered alphabetically ("hetero", "homo"), so the
# hetero symbol has to come first -- the same order that the base R call
# uses in pch = c("⚤", "⚣")[homo$orientation].
# NOTE: this permanently changes the levels of homo$orientation, so every
# later plot shows the symbols instead of the words.
levels(homo$orientation) <- c("⚤", "⚣")
homo %>%
  ggplot(aes(s.duration.ms, vowel.duration.ms,
             label = orientation, fill = orientation)) +
  geom_label()
plot(homo$s.duration.ms, homo$vowel.duration.ms,
main = "length of [s] vs. length of vowels")
homo %>%
ggplot(aes(s.duration.ms, vowel.duration.ms)) +
geom_point()+
labs(title = "length of [s] vs. length of vowels",
subtitle = "based on 14 speakers of Cantonese",
caption = "data from [Chi kuk 2007]")
plot(homo$s.duration.ms, homo$vowel.duration.ms,
xlab = "duration of [s] in ms", ylab = "vowel duration in ms")
homo %>%
ggplot(aes(s.duration.ms, vowel.duration.ms)) +
geom_point()+
xlab("duration of [s] in ms")+
ylab("vowel duration in ms")
Let's use the frequency dictionary of Russian:
freq <- read.csv("https://goo.gl/TlX7xW", sep = "\t")
freq %>%
arrange(desc(Freq.ipm.)) %>%
slice(1:200) %>%
ggplot(aes(Rank, Freq.ipm.)) +
geom_point() +
xlab("") +
ylab("ipm")
# Plot ipm frequency against the row index of the dictionary.
# The index is derived from the data with seq_along() instead of the
# hard-coded 1:52138, so the code keeps working if the downloaded file
# has a different number of rows.

# Base R: both axes on a log scale (log = "yx").
plot(seq_along(freq$Freq.ipm.), freq$Freq.ipm.,
     xlab = NA, ylab = "ipm",
     las = 1,
     log = "yx")

# ggplot2: only the y axis is log-scaled here.
freq %>%
  ggplot(aes(seq_along(Freq.ipm.), Freq.ipm.)) +
  geom_point() +
  xlab("") +
  ylab("ipm") +
  scale_y_log10()
plot(homo$s.duration.ms, homo$vowel.duration.ms)
rug(homo$s.duration.ms)
rug(homo$vowel.duration.ms, side = 2)
homo %>%
ggplot(aes(s.duration.ms, vowel.duration.ms, color = orientation)) +
geom_point() +
geom_rug()
homo %>%
ggplot(aes(s.duration.ms, vowel.duration.ms, color = orientation)) +
geom_point() +
geom_rug()
plot(homo$s.duration.ms, homo$vowel.duration.ms)
abline(h = 120, v = 60)
homo %>%
ggplot(aes(s.duration.ms, vowel.duration.ms)) +
geom_point() +
geom_hline(yintercept = mean(homo$vowel.duration.ms))+
geom_vline(xintercept = 60)
plot(homo$s.duration.ms, homo$vowel.duration.ms)
abline(h = 120, lty = 2)
abline(v = 60, lwd = 42)
homo %>%
ggplot(aes(s.duration.ms, vowel.duration.ms)) +
geom_point() +
geom_hline(yintercept = 120, linetype = 2)+
geom_vline(xintercept = 60, size = 5)
plot(homo$s.duration.ms, homo$vowel.duration.ms)
abline(h = 120, lty = 4)
abline(v = 60, col = "blue")
homo %>%
ggplot(aes(s.duration.ms, vowel.duration.ms)) +
geom_point() +
geom_hline(yintercept = 120, linetype = 4)+
geom_vline(xintercept = 60, color = "blue")
The function annotate adds geoms to a plot.
homo %>%
ggplot(aes(s.duration.ms, vowel.duration.ms)) +
geom_point()+
annotate(geom = "rect", xmin = 77, xmax = 79,
ymin = 117, ymax = 122, fill = "red", alpha = 0.2) +
annotate(geom = "text", x = 78, y = 125,
label = "Who is that?\n Outlier?")
There are two possible situations:
head(homo[, c(1, 9)])| speaker | orientation |
|---|---|
| A | homo |
| B | homo |
| C | hetero |
| D | hetero |
| E | hetero |
| F | hetero |
head(homo[, c(1, 10)])| speaker | age |
|---|---|
| A | 30 |
| B | 19 |
| C | 29 |
| D | 36 |
| E | 27 |
| F | 33 |
barplot(table(homo$orientation))
homo %>%
ggplot(aes(orientation)) +
geom_bar()
barplot(homo$age, names.arg = homo$speaker)
homo %>%
ggplot(aes(speaker, age)) +
geom_bar(stat = "identity")
barplot(homo$age, names.arg = homo$speaker,
col = homo$orientation)
* dplyr, ggplot2
homo %>%
ggplot(aes(speaker, age, fill = orientation)) +
geom_bar(stat = "identity")
boxplot(homo$s.duration.ms~homo$orientation)
* dplyr, ggplot2
homo %>%
ggplot(aes(orientation, s.duration.ms)) +
geom_boxplot()
# Boxplot of [s] duration by orientation with the raw observations
# overlaid on top (add = TRUE draws into the existing plot).
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
boxplot(homo$s.duration.ms ~ homo$orientation)
stripchart(homo$s.duration.ms ~ homo$orientation,
           pch = 1, vertical = TRUE, add = TRUE)
* dplyr, ggplot2
homo %>%
ggplot(aes(orientation, s.duration.ms)) +
geom_boxplot()+
geom_point()
# Same boxplot, but the overlaid observations are jittered so that
# overlapping points stay visible.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
boxplot(homo$s.duration.ms ~ homo$orientation)
stripchart(homo$s.duration.ms ~ homo$orientation,
           pch = 1, vertical = TRUE, add = TRUE, method = "jitter")
* dplyr, ggplot2
homo %>%
ggplot(aes(orientation, s.duration.ms)) +
geom_boxplot() +
geom_jitter(width = 0.5)
homo %>%
ggplot(aes(orientation, s.duration.ms)) +
geom_violin() +
geom_jitter()
mtcars %>%
mutate(newvar = mpg > 22,
newvr = mpg < 17) %>%
ggplot(aes(newvr, newvar, color = newvar))+
geom_jitter(width = 0.2)
# Count the cars in every combination of two logical flags
# (mpg > 22 and mpg < 17) and draw one labelled bubble per combination,
# sized by the count.
mtcars %>%
  mutate(newvar = mpg > 22,
         newvr = mpg < 17) %>%
  group_by(newvar, newvr) %>%
  summarise(number = n()) %>%
  ggplot(aes(newvr, newvar, label = number)) +
  geom_point(aes(size = number, color = newvar)) +
  geom_text() +
  scale_size(range = c(10, 30)) +
  # hide the size legend; "none" replaces the reassignable shorthand F
  guides(size = "none")
hist(homo$s.duration.ms)
* dplyr, ggplot2
homo %>%
ggplot(aes(s.duration.ms)) +
geom_histogram()## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
How many histogram bins do we need?
The Freedman–Diaconis rule [Freedman, Diaconis 1981] is implemented in nclass.FD(), e.g. nclass.FD(homo$s.duration.ms)
base R
hist(homo$s.duration.ms,
breaks = nclass.FD(homo$s.duration.ms))
* dplyr, ggplot2
homo %>%
ggplot(aes(s.duration.ms)) +
geom_histogram(bins = nclass.FD(homo$s.duration.ms))
hist(homo$s.duration.ms, col = "lightblue")
* dplyr, ggplot2
homo %>%
ggplot(aes(s.duration.ms)) +
geom_histogram(fill = "lightblue")## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Facetization is the most powerful ggplot2 tool: it allows you to split up your data by one or more variables and plot the subsets of data together.
facet_wrap()
homo %>%
ggplot(aes(speaker, s.duration.ms))+
geom_point() +
facet_wrap(~orientation)
You can see that all speakers appear on both graphs, but only certain speakers have a point. This is because, by default, the scales of all facets are the same.
homo %>%
ggplot(aes(speaker, s.duration.ms))+
geom_point() +
facet_wrap(~orientation, scales = "free")
It is possible to make only one scale “free”:
homo %>%
ggplot(aes(speaker, s.duration.ms))+
geom_point() +
facet_wrap(~orientation, scales = "free_x")
facet_grid()
It is possible to facet using more than one variable.
homo %>%
mutate(older_then_28 = ifelse(age > 28, "older", "younger")) %>%
ggplot(aes(speaker, s.duration.ms))+
geom_point() +
facet_wrap(older_then_28~orientation, scales = "free_x")
homo %>%
mutate(older_then_28 = ifelse(age > 28, "older", "younger")) %>%
ggplot(aes(speaker, s.duration.ms))+
geom_point() +
facet_grid(older_then_28~orientation, scales = "free_x")
There is a nice argument margins.
homo %>%
mutate(older_then_28 = ifelse(age > 28, "older", "younger")) %>%
ggplot(aes(speaker, s.duration.ms))+
geom_point() +
facet_grid(older_then_28~orientation, scales = "free_x", margins = TRUE)
Sometimes it is nice to put your data in all facets:
# Show every observation in grey in all facets, with the facet's own
# observations drawn on top in the default colour.
homo %>%
  ggplot(aes(speaker, s.duration.ms)) +
  # Add an additional geom without the facetization variable!
  # The variable is dropped by name rather than by position (homo[,-9]),
  # so this keeps working if the column order changes.
  geom_point(data = select(homo, -orientation),
             aes(speaker, s.duration.ms), color = "grey") +
  geom_point() +
  facet_wrap(~orientation)