Логистическая (logit, logistic) и мультиномиальная (multinomial) регрессия применяются в случаях, когда зависимая переменная является категориальной:
library(tidyverse)
library(pscl)
library(nnet)
В датасет собрано 19 языков, со следующими переменными:
ej_n_cons <- read.csv("https://goo.gl/DsRMve")
ej_n_cons %>%
ggplot(aes(ejectives, consonants, fill = ejectives, label = language))+
geom_boxplot(show.legend = FALSE)+
geom_jitter() +
theme_bw() ->
ej_n_cons_plot
plotly::ggplotly(ej_n_cons_plot, tooltip = c("label"))
Данные взяты из исследования [Endresen, Janda 2015], посвященное исследованию маргинальных глаголов изменения состояния в русском языке. Испытуемые (70 школьников, 51 взрослый) оценивали по шкале Ликерта (1…5) приемлемость глаголов с приставками о- и у-:
marginal_verbs <- read_csv("https://goo.gl/6Phok3")
## Parsed with column specification:
## cols(
## Gender = col_character(),
## Age = col_double(),
## AgeGroup = col_character(),
## Education = col_character(),
## City = col_character(),
## SubjectCode = col_character(),
## Score = col_character(),
## GivenScore = col_double(),
## Stimulus = col_character(),
## Prefix = col_character(),
## WordType = col_character(),
## CorpusFrequency = col_double()
## )
head(marginal_verbs)
Gender <chr> | Age <dbl> | AgeGroup <chr> | Education <chr> | City <chr> | SubjectCode <chr> | Score <chr> | GivenScore <dbl> | Stimulus <chr> | Prefix <chr> | |
---|---|---|---|---|---|---|---|---|---|---|
female | 16 | child | school | Izhevsk | AA | A | 5 | utochnit | u | |
female | 16 | child | school | Izhevsk | AA | A | 5 | ozhestochit | o | |
female | 16 | child | school | Izhevsk | AA | E | 1 | ovneshnit | o | |
female | 16 | child | school | Izhevsk | AA | E | 1 | oblusit | o | |
female | 16 | child | school | Izhevsk | AA | A | 5 | osvezhit | o | |
female | 16 | child | school | Izhevsk | AA | A | 5 | oschastlivit | o |
Переменные в датасете:
В этом датасете представлены три нанайских гласных i, ɪ и e, произнесенные нанайским носителем мужского пола из селения Джуен. Каждая строчка — отдельное произнесение. Переменные:
nanai <- read_csv("https://goo.gl/9uGBoQ")
## Parsed with column specification:
## cols(
## sound = col_character(),
## f1 = col_double(),
## f2 = col_double()
## )
nanai %>%
ggplot(aes(f2, f1, label = sound, color = sound))+
geom_text()+
geom_rug()+
scale_y_reverse()+
scale_x_reverse()+
stat_ellipse()+
theme_bw()+
theme(legend.position = "none")+
labs(title = "Нанайские гласные в произнесении мужчины из селения Джуен")
Мы хотим чего-то такого: y⏟[−∞,+∞]=β0+β1⋅x1+β2⋅x2+⋯+βk⋅xk+εi⏟[−∞,+∞] Вероятность — (в классической статистике) отношение количества успехов к общему числу событий: p=# успехов# неудач+# успехов,область значений: [0,1] Шансы — отношение количества успехов к количеству неудач: odds=p1−p=p(успеха)p(неудачи),область значений: [0,+∞] Натуральный логарифм шансов: log(odds),область значений: [−∞,+∞]
Но, что нам говорит логарифм шансов? Как нам его интерпретировать?
data_frame(n = 10,
success = 1:9,
failure = n - success,
prob.1 = success/(success+failure),
odds = success/failure,
log_odds = log(odds),
prob.2 = exp(log_odds)/(1+exp(log_odds)))
## Warning: `data_frame()` is deprecated, use `tibble()`.
## This warning is displayed once per session.
n <dbl> | success <int> | failure <dbl> | prob.1 <dbl> | odds <dbl> | log_odds <dbl> | prob.2 <dbl> |
---|---|---|---|---|---|---|
10 | 1 | 9 | 0.1 | 0.1111111 | -2.1972246 | 0.1 |
10 | 2 | 8 | 0.2 | 0.2500000 | -1.3862944 | 0.2 |
10 | 3 | 7 | 0.3 | 0.4285714 | -0.8472979 | 0.3 |
10 | 4 | 6 | 0.4 | 0.6666667 | -0.4054651 | 0.4 |
10 | 5 | 5 | 0.5 | 1.0000000 | 0.0000000 | 0.5 |
10 | 6 | 4 | 0.6 | 1.5000000 | 0.4054651 | 0.6 |
10 | 7 | 3 | 0.7 | 2.3333333 | 0.8472979 | 0.7 |
10 | 8 | 2 | 0.8 | 4.0000000 | 1.3862944 | 0.8 |
10 | 9 | 1 | 0.9 | 9.0000000 | 2.1972246 | 0.9 |
Как связаны вероятность и логарифм шансов: log(odds)=log(p1−p) p=exp(log(odds))1+exp(log(odds))
Как связаны вероятность и логарифм шансов:
data_frame(p = seq(0, 1, 0.001),
log_odds = log(p/(1-p))) %>%
ggplot(aes(log_odds, p))+
geom_line()+
labs(x = latex2exp::TeX("$log\\left(\\frac{p}{1-p}\\right)$"))+
theme_bw()
lm_0 <- lm(as.integer(ejectives)~1, data = ej_n_cons)
lm_1 <- lm(as.integer(ejectives)~consonants, data = ej_n_cons)
lm_0
##
## Call:
## lm(formula = as.integer(ejectives) ~ 1, data = ej_n_cons)
##
## Coefficients:
## (Intercept)
## 1.316
lm_1
##
## Call:
## lm(formula = as.integer(ejectives) ~ consonants, data = ej_n_cons)
##
## Coefficients:
## (Intercept) consonants
## 0.4611 0.0353
Первая модель: ejectives=1.316×consonants Вторая модель: ejectives=0.4611+0.0353×consonants
ej_n_cons %>%
ggplot(aes(consonants, as.integer(ejectives)))+
geom_point()+
geom_smooth(method = "lm")+
theme_bw()+
labs(y = "ejectives (yes = 2, no = 1)")
Будьте осторожны, glm
не работает с тибблом.
logit_0 <- glm(ejectives~1, family = "binomial", data = ej_n_cons)
summary(logit_0)
##
## Call:
## glm(formula = ejectives ~ 1, family = "binomial", data = ej_n_cons)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.8712 -0.8712 -0.8712 1.5183 1.5183
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.7732 0.4935 -1.567 0.117
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 23.699 on 18 degrees of freedom
## Residual deviance: 23.699 on 18 degrees of freedom
## AIC: 25.699
##
## Number of Fisher Scoring iterations: 4
logit_0$coefficients
## (Intercept)
## -0.7731899
table(ej_n_cons$ejectives)
##
## no yes
## 13 6
log(6/13) # β0
## [1] -0.7731899
6/(13+6) # p
## [1] 0.3157895
exp(log(6/13))/(1+exp(log(6/13))) # p
## [1] 0.3157895
logit_1 <- glm(ejectives~consonants, family = "binomial", data = ej_n_cons)
summary(logit_1)
##
## Call:
## glm(formula = ejectives ~ consonants, family = "binomial", data = ej_n_cons)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.08779 -0.49331 -0.20265 0.02254 2.45384
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -12.1123 6.1266 -1.977 0.0480 *
## consonants 0.4576 0.2436 1.878 0.0603 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 23.699 on 18 degrees of freedom
## Residual deviance: 12.192 on 17 degrees of freedom
## AIC: 16.192
##
## Number of Fisher Scoring iterations: 6
logit_1$coefficients
## (Intercept) consonants
## -12.1123347 0.4576095
ej_n_cons %>%
mutate(ejectives = as.integer(ejectives)-1) %>%
ggplot(aes(consonants, ejectives)) +
geom_point()+
theme_bw()+
geom_smooth(method = "glm",
method.args = list(family = "binomial"),
se = FALSE)
Какова вероятность, что в языке с 29 согласными есть абруптивные?
logit_1$coefficients
## (Intercept) consonants
## -12.1123347 0.4576095
log(p1−p)i=β0+β1×consinantsi+ϵi log(p1−p)=−12.1123347+0.4576095×29=1.158341 p=e1.1583411+e1.158341=0.7610311
# log(odds)
predict(logit_1, newdata = data.frame(consonants = 29))
## 1
## 1.158341
# p
predict(logit_1, newdata = data.frame(consonants = 29), type = "response")
## 1
## 0.7610312
logit_2 <- glm(ejectives~area, family = "binomial", data = ej_n_cons)
summary(logit_2)
##
## Call:
## glm(formula = ejectives ~ area, family = "binomial", data = ej_n_cons)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.66511 -0.55525 -0.00013 0.75853 1.97277
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.319e-17 1.000e+00 0.000 1.000
## areaAustralia -1.857e+01 6.523e+03 -0.003 0.998
## areaEurasia -1.792e+00 1.472e+00 -1.217 0.224
## areaNorth America 1.099e+00 1.528e+00 0.719 0.472
## areaPapua -1.857e+01 6.523e+03 -0.003 0.998
## areaSouth America -1.857e+01 4.612e+03 -0.004 0.997
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 23.699 on 18 degrees of freedom
## Residual deviance: 15.785 on 13 degrees of freedom
## AIC: 27.785
##
## Number of Fisher Scoring iterations: 17
logit_2$coefficients
## (Intercept) areaAustralia areaEurasia areaNorth America
## 1.318587e-17 -1.856607e+01 -1.791759e+00 1.098612e+00
## areaPapua areaSouth America
## -1.856607e+01 -1.856607e+01
table(ej_n_cons$ejectives, ej_n_cons$area)
##
## Africa Australia Eurasia North America Papua South America
## no 2 1 6 1 1 2
## yes 2 0 1 3 0 0
log(1/6) # Eurasia
## [1] -1.791759
log(3/1) # North America
## [1] 1.098612
logit_3 <- glm(ejectives~consonants+area, family = "binomial", data = ej_n_cons)
summary(logit_3)
##
## Call:
## glm(formula = ejectives ~ consonants + area, family = "binomial",
## data = ej_n_cons)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.54011 -0.18623 -0.00012 0.00023 1.53307
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -21.1760 15.1089 -1.402 0.161
## consonants 0.8137 0.5653 1.439 0.150
## areaAustralia -16.2910 10754.0138 -0.002 0.999
## areaEurasia -1.2069 3.9399 -0.306 0.759
## areaNorth America 4.0966 4.8563 0.844 0.399
## areaPapua -4.8995 10754.0184 0.000 1.000
## areaSouth America -17.1162 7065.6839 -0.002 0.998
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 23.6989 on 18 degrees of freedom
## Residual deviance: 6.7901 on 12 degrees of freedom
## AIC: 20.79
##
## Number of Fisher Scoring iterations: 18
AIC(logit_0)
## [1] 25.69888
AIC(logit_1)
## [1] 16.19167
AIC(logit_2)
## [1] 27.78549
AIC(logit_3)
## [1] 20.79005
Для того, чтобы интерпретировать коэффициенты нужно проделать трансформацю:
(exp(logit_1$coefficients)-1)*100
## (Intercept) consonants
## -99.99945 58.02918
Перед нами процентное изменние шансов при увеличении независимой переменной на 1.
Было предложено много аналогов R2, например, McFadden’s R squared:
pscl::pR2(logit_1)
## llh llhNull G2 McFadden r2ML r2CU
## -6.0958355 -11.8494421 11.5072132 0.4855593 0.4542765 0.6373812
marginal_verbs$Score <- factor(marginal_verbs$Score)
levels(marginal_verbs$Score)
## [1] "A" "B" "C" "D" "E"
ordinal <- MASS::polr(Score~Prefix+WordType+CorpusFrequency, data = marginal_verbs)
summary(ordinal)
##
## Re-fitting to get Hessian
## Call:
## MASS::polr(formula = Score ~ Prefix + WordType + CorpusFrequency,
## data = marginal_verbs)
##
## Coefficients:
## Value Std. Error t value
## Prefixu 0.136619 5.286e-02 2.584
## WordTypenonce 1.340603 5.693e-02 23.549
## WordTypestandard -4.655327 1.251e-01 -37.211
## CorpusFrequency -0.001015 7.879e-05 -12.876
##
## Intercepts:
## Value Std. Error t value
## A|B -2.6275 0.0753 -34.8784
## B|C -1.4531 0.0552 -26.3246
## C|D -0.2340 0.0479 -4.8853
## D|E 0.7324 0.0492 14.8986
##
## Residual Deviance: 13138.47
## AIC: 13154.47
ordinal$coefficients
## Prefixu WordTypenonce WordTypestandard CorpusFrequency
## 0.136619412 1.340602696 -4.655327418 -0.001014583
Как и раньше, можно преобразовать коэффициенты:
(exp(ordinal$coefficients)-1)*100
## Prefixu WordTypenonce WordTypestandard CorpusFrequency
## 14.6391763 282.1345921 -99.0489201 -0.1014068
log(p(A)p(B|C|D|E))=−2.6275+0.136619412×Prefixu+1.340602696×WordTypenonce−
−4.655327418×WordTypestandard−0.001014583×CorpusFrequency log(p(A|B)p(C|D|E))=−1.4531+0.136619412×Prefixu+1.340602696×WordTypenonce −4.655327418×WordTypestandard−0.001014583×CorpusFrequency log(p(A|B|C)p(D|E))=−0.2340+0.136619412×Prefixu+1.340602696×WordTypenonce −4.655327418×WordTypestandard−0.001014583×CorpusFrequency log(p(A|B|C|D)p(E))=0.7324+0.136619412×Prefixu+1.340602696×WordTypenonce −4.655327418×WordTypestandard−0.001014583×CorpusFrequency
head(predict(ordinal))
## [1] A A E E A A
## Levels: A B C D E
head(predict(ordinal, type = "probs"))
## A B C D E
## 1 0.99178000 0.005665533 0.001798242 0.0004683707 0.0002878525
## 2 0.93841926 0.041706786 0.013917668 0.0036817261 0.0022745617
## 3 0.06764594 0.122509282 0.252611927 0.2334448870 0.3237879649
## 4 0.01855865 0.039108932 0.113894002 0.1808984667 0.6475399508
## 5 0.90986002 0.060436871 0.020738032 0.0055351143 0.0034299624
## 6 0.91496678 0.057117954 0.019500629 0.0051963745 0.0032182669
marginal_verbs <- cbind(marginal_verbs, predict(ordinal, type = "probs"))
marginal_verbs %>%
gather(score, predictions, A:E) %>%
ggplot(aes(x = score, y = predictions, fill = score)) +
geom_col(position = "dodge")+
facet_grid(Prefix~WordType)+
theme_bw()
mult <- nnet::multinom(sound~f1+f2, data = nanai)
## # weights: 12 (6 variable)
## initial value 462.515774
## iter 10 value 51.522626
## iter 20 value 46.817442
## iter 30 value 44.829080
## iter 40 value 44.807654
## iter 40 value 44.807654
## final value 44.807654
## converged
mult
## Call:
## nnet::multinom(formula = sound ~ f1 + f2, data = nanai)
##
## Coefficients:
## (Intercept) f1 f2
## i -22.85202 -0.04263175 0.02315226
## ɪ -41.46147 0.02360077 0.01937067
##
## Residual Deviance: 89.61531
## AIC: 101.6153
log(p(e)p(ɪ))=40.88718−0.02298543×f1−0.019176619×f2 log(p(i)p(ɪ))=18.15078−0.06593461×f1+0.003949284×f2
nanai %>%
mutate(prediction = predict(mult),
correctness = sound == prediction) %>%
ggplot(aes(f1, f2, label = sound, color = correctness))+
geom_text(aes(size = !correctness), show.legend = FALSE)+
scale_y_reverse()+
scale_x_reverse()+
theme_bw()+
labs(title = "Нанайские гласные в произнесении мужчины из селения Джуен",
subtitle = "мультиномиальная регрессия")
## Warning: Using size for a discrete variable is not advised.