library(tidyverse)
library(splines)
library(mgcv)
theme_set(theme_minimal()+theme(legend.position = "bottom"))
Давайте на этой паре будем работать с таким абстрактным полиномом:
set.seed(42)
poly <- tibble(x = seq(0, 0.5, length.out = 500),
y = 0.2*x^9*(10*(1-x))^6+10*(10*x)^2*(1-x)^10+rnorm(500, sd = 1.5))
poly %>%
ggplot(aes(x, y))+
geom_point()
\(y_i = \sum_{j=0}^k \beta_j \times x^j_i + \epsilon_i = \beta_0 \times x^0_0 + \beta_1 \times x^1_i + ... \beta_k \times x^k_i + \epsilon_i\)
poly_fit1 <- lm(y ~ poly(x, 2, raw = TRUE), data = poly)
poly_fit2 <- lm(y ~ x+I(x^2), data = poly)
summary(poly_fit1)
##
## Call:
## lm(formula = y ~ poly(x, 2, raw = TRUE), data = poly)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.4650 -1.2176 -0.0068 1.2366 5.2724
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.5279 0.2325 6.572 1.26e-10 ***
## poly(x, 2, raw = TRUE)1 11.7709 2.1479 5.480 6.77e-08 ***
## poly(x, 2, raw = TRUE)2 -12.9025 4.1589 -3.102 0.00203 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.74 on 497 degrees of freedom
## Multiple R-squared: 0.1777, Adjusted R-squared: 0.1744
## F-statistic: 53.7 on 2 and 497 DF, p-value: < 2.2e-16
summary(poly_fit2)
##
## Call:
## lm(formula = y ~ x + I(x^2), data = poly)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.4650 -1.2176 -0.0068 1.2366 5.2724
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.5279 0.2325 6.572 1.26e-10 ***
## x 11.7709 2.1479 5.480 6.77e-08 ***
## I(x^2) -12.9025 4.1589 -3.102 0.00203 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.74 on 497 degrees of freedom
## Multiple R-squared: 0.1777, Adjusted R-squared: 0.1744
## F-statistic: 53.7 on 2 and 497 DF, p-value: < 2.2e-16
tibble(x = poly$x,
fit = poly_fit2$fitted.values) %>%
ggplot(aes(x, fit))+
geom_point(data = poly, aes(x, y))+
geom_line(color = "blue", size = 2)+
labs(caption = poly_fit2$call)
poly_fit3 <- lm(y ~ poly(x, 3, raw = TRUE), data = poly)
summary(poly_fit3)
##
## Call:
## lm(formula = y ~ poly(x, 3, raw = TRUE), data = poly)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.8629 -0.9488 -0.0517 0.9239 4.7424
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.8437 0.2636 -3.201 0.00146 **
## poly(x, 3, raw = TRUE)1 68.9759 4.5695 15.095 < 2e-16 ***
## poly(x, 3, raw = TRUE)2 -299.2139 21.2489 -14.081 < 2e-16 ***
## poly(x, 3, raw = TRUE)3 381.7486 27.9340 13.666 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.484 on 496 degrees of freedom
## Multiple R-squared: 0.4026, Adjusted R-squared: 0.399
## F-statistic: 111.4 on 3 and 496 DF, p-value: < 2.2e-16
tibble(x = poly$x,
fit = poly_fit3$fitted.values) %>%
ggplot(aes(x, fit))+
geom_point(data = poly, aes(x, y))+
geom_line(color = "blue", size = 2)+
labs(caption = poly_fit3$call)
Загрузите данные по продаже шампуня (Makridakis, Wheelwright and Hyndman 1998), используя функцию
read_csv()
. Постройте регрессию, моделирующую связь между переменными полиномом второй степени. В ответе превидите предсказание модели для 42 дня (с точностью до двух знаков после запятой).
Загрузите данные по продаже авакадо (см. исходное описание). Визуализируйте полученные данные (x =
Date
, y =AveragePrice
, color =type
), сделайте фасетизацию по переменнойregion
, а при помощи функцииgeom_smooth(method = "lm", formula = ...
нанесите на график полиномы 12-ой степени. В каком регионе, как кажется, наименьший разрыв в средней цене обычных и органических авокадо.
Альтернативный подход: сделать кусочную функцию. В результате данные делятся на узлы (knots), и в каждом узле получившаяся функция, а также ее первая и вторая производная должны быть плавными.
library(splines)
poly_fit3 <- lm(y ~ bs(x), data = poly)
summary(poly_fit3)
##
## Call:
## lm(formula = y ~ bs(x), data = poly)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.8629 -0.9488 -0.0517 0.9239 4.7424
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.8437 0.2636 -3.201 0.00146 **
## bs(x)1 11.4960 0.7616 15.095 < 2e-16 ***
## bs(x)2 -1.9425 0.4835 -4.018 6.78e-05 ***
## bs(x)3 7.4030 0.4161 17.792 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.484 on 496 degrees of freedom
## Multiple R-squared: 0.4026, Adjusted R-squared: 0.399
## F-statistic: 111.4 on 3 and 496 DF, p-value: < 2.2e-16
tibble(x = poly$x,
fit = poly_fit3$fitted.values) %>%
ggplot(aes(x, fit))+
geom_line(color = "blue", size = 2)+
geom_point(data = poly, aes(x, y))+
labs(caption = poly_fit3$call) ->
p1
as_tibble(cbind(bs(poly$x), x = poly$x)) %>%
gather(spline, value, -x) %>%
ggplot(aes(x, value, color = spline))+
geom_line() ->
s1
gridExtra::grid.arrange(p1, s1)
poly_fit4 <- lm(y ~ bs(x, knots = 5), data = poly)
summary(poly_fit4)
##
## Call:
## lm(formula = y ~ bs(x, knots = 5), data = poly)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.8629 -0.9488 -0.0517 0.9239 4.7424
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.8437 0.2636 -3.201 0.00146 **
## bs(x, knots = 5)1 11.4960 0.7616 15.095 < 2e-16 ***
## bs(x, knots = 5)2 -1.9425 0.4835 -4.018 6.78e-05 ***
## bs(x, knots = 5)3 7.4030 0.4161 17.792 < 2e-16 ***
## bs(x, knots = 5)4 NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.484 on 496 degrees of freedom
## Multiple R-squared: 0.4026, Adjusted R-squared: 0.399
## F-statistic: 111.4 on 3 and 496 DF, p-value: < 2.2e-16
tibble(x = poly$x,
fit = poly_fit4$fitted.values) %>%
ggplot(aes(x, fit))+
geom_line(color = "blue", size = 2)+
geom_point(dat = poly, aes(x,y))+
labs(caption = poly_fit4$call) ->
p2
as_tibble(cbind(bs(poly$x, knots = 5), x = poly$x)) %>%
gather(spline, value, -x) %>%
ggplot(aes(x, value, color = spline))+
geom_line() ->
s2
gridExtra::grid.arrange(p2, s2)
poly_fit5 <- lm(y ~ bs(x, degree = 5), data = poly)
summary(poly_fit5)
##
## Call:
## lm(formula = y ~ bs(x, degree = 5), data = poly)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.4925 -0.9958 0.0199 0.9923 4.4759
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.03152 0.38659 0.082 0.9350
## bs(x, degree = 5)1 1.72723 1.56785 1.102 0.2711
## bs(x, degree = 5)2 13.75698 1.99809 6.885 1.76e-11 ***
## bs(x, degree = 5)3 -5.86063 2.39313 -2.449 0.0147 *
## bs(x, degree = 5)4 4.75438 1.19050 3.994 7.50e-05 ***
## bs(x, degree = 5)5 5.80448 0.58906 9.854 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.466 on 494 degrees of freedom
## Multiple R-squared: 0.4197, Adjusted R-squared: 0.4138
## F-statistic: 71.45 on 5 and 494 DF, p-value: < 2.2e-16
tibble(x = poly$x,
fit = poly_fit5$fitted.values) %>%
ggplot(aes(x, fit))+
geom_line(color = "blue", size = 2)+
geom_point(dat = poly, aes(x,y))+
labs(caption = poly_fit5$call) ->
p3
as_tibble(cbind(bs(poly$x, degree = 5), x = poly$x)) %>%
gather(spline, value, -x) %>%
ggplot(aes(x, value, color = spline))+
geom_line() ->
s3
gridExtra::grid.arrange(p3, s3)
Загрузите данные по продаже шампуня (Makridakis, Wheelwright and Hyndman 1998), используя функцию
read_csv()
. Постройте регрессию, моделирующую связь между переменными сплайнами второй степени. В ответе превидите предсказание модели для 42 дня (с точностью до двух знаков после запятой).
Согласно теореме Колмогорова – Арнольда, каждая непрерывная функция может быть представлена в виде композиции непрерывных функций одной переменной. Композиция — применение одной функции к результатам другой: \((G\circ F)(x) = G(F(x))\). Подробнее см. лекции Виктора Клепцына. Проблема осталась только одна: эта формула не дает никакого инсайта по поводу того, как эту функциб получить. На практике фитят теми же кубическими сплайнами (но не обязательно), используя Backfitting algorithm.
library(mgcv)
poly_fit4 <- gam(y ~ s(x), data = poly)
summary(poly_fit4)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## y ~ s(x)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.39434 0.06571 51.66 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(x) 5.044 6.145 55.26 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.411 Deviance explained = 41.7%
## GCV = 2.185 Scale est. = 2.1586 n = 500
edf – effective degrees of freedom (edf = 1 — прямая, edf = 2 — полином второй степени и т. д.) Высокий edf не значит статистическую значимость: высокий edf может не иметь статистической значимости и наоборот.
tibble(x = poly$x,
fit = poly_fit4$fitted.values) %>%
ggplot(aes(x, fit))+
geom_line(color = "blue", size = 2)+
geom_point(data = poly, aes(x, y))+
labs(caption = poly_fit4$call)
В пакете
mgcv
достаточно много усилий положено для визуализации, так что имеет смысл посмотреть?plot.gam
. Когда вы рисуете модель используйте аргументplot(your_model, all.terms = TRUE)
В GAM два параметра сглаживания:
tibble(x = shampoo$day,
fit = gam(sales_of_shampoo ~ s(day, sp = 50), data = shampoo)$fitted.values) %>%
ggplot(aes(x, fit))+
geom_line(color = "blue", size = 2)+
geom_point(data = shampoo, aes(day, sales_of_shampoo))+
labs(caption = poly_fit4$call)
tibble(x = shampoo$day,
fit = gam(sales_of_shampoo ~ s(day, sp = 1e-5), data = shampoo)$fitted.values) %>%
ggplot(aes(x, fit))+
geom_line(color = "blue", size = 2)+
geom_point(data = shampoo, aes(day, sales_of_shampoo))+
labs(caption = poly_fit4$call)
tibble(x = shampoo$day,
fit = gam(sales_of_shampoo ~ s(day, sp = 0.5), data = shampoo)$fitted.values) %>%
ggplot(aes(x, fit))+
geom_line(color = "blue", size = 2)+
geom_point(data = shampoo, aes(day, sales_of_shampoo))+
labs(caption = poly_fit4$call)
method = "REML"
:tibble(x = shampoo$day,
fit = gam(sales_of_shampoo ~ s(day), data = shampoo, method = "REML")$fitted.values) %>%
ggplot(aes(x, fit))+
geom_line(color = "blue", size = 2)+
geom_point(data = shampoo, aes(day, sales_of_shampoo))+
labs(caption = poly_fit4$call)
Не может превышать количества наблюдений.
tibble(x = shampoo$day,
fit = gam(sales_of_shampoo ~ s(day, k = 30), data = shampoo)$fitted.values) %>%
ggplot(aes(x, fit))+
geom_line(color = "blue", size = 2)+
geom_point(data = shampoo, aes(day, sales_of_shampoo))+
labs(caption = poly_fit4$call)
tibble(x = shampoo$day,
fit = gam(sales_of_shampoo ~ s(day, sp = 1), data = shampoo)$fitted.values) %>%
ggplot(aes(x, fit))+
geom_line(color = "blue", size = 2)+
geom_point(data = shampoo, aes(day, sales_of_shampoo))+
labs(caption = poly_fit4$call)
Если вы хотите использовать категориальные предикторы переведите их в факторы!
avocado %>%
filter(region == "Sacramento") %>%
mutate(a_type = factor(a_type)) ->
avocado_sacramento
## модель со "случайным свободным членом"
avocado_gam_1 <- gam(AveragePrice ~ s(Date2)+a_type, data = avocado_sacramento, method = "REML")
summary(avocado_gam_1)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## AveragePrice ~ s(Date2) + a_type
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.27396 0.01470 86.68 <2e-16 ***
## a_typeorganic 0.69521 0.02078 33.45 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Date2) 8.317 8.871 42.71 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.816 Deviance explained = 82.1%
## -REML = -56.228 Scale est. = 0.036505 n = 338
tibble(x = avocado_sacramento$Date2,
a_type = avocado_sacramento$a_type,
fit = avocado_gam_1$fitted.values) %>%
ggplot(aes(x, fit, color = a_type))+
geom_line()+
geom_point(data = avocado_sacramento, aes(Date2, AveragePrice, color = a_type))+
labs(caption = avocado_gam_1$call)
## модель со "случайным интерсептом"
avocado_gam_2 <- gam(AveragePrice ~ s(Date2, by = a_type), data = avocado_sacramento, method = "REML")
summary(avocado_gam_2)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## AveragePrice ~ s(Date2, by = a_type)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.62157 0.02141 75.73 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Date2):a_typeconventional 4.364 5.368 3.747 0.00231 **
## s(Date2):a_typeorganic 5.172 6.287 11.793 2.03e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.218 Deviance explained = 24%
## -REML = 180.56 Scale est. = 0.15496 n = 338
tibble(x = avocado_sacramento$Date2,
a_type = avocado_sacramento$a_type,
fit = avocado_gam_2$fitted.values) %>%
ggplot(aes(x, fit, color = a_type))+
geom_line()+
geom_point(data = avocado_sacramento, aes(Date2, AveragePrice, color = a_type))+
labs(caption = avocado_gam_2$call)
## модель со "случайным интерсептом и свободным членом"
avocado_gam_3 <- gam(AveragePrice ~ s(Date2, by = a_type)+a_type, data = avocado_sacramento, method = "REML")
summary(avocado_gam_3)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## AveragePrice ~ s(Date2, by = a_type) + a_type
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.27396 0.01217 104.7 <2e-16 ***
## a_typeorganic 0.69521 0.01721 40.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Date2):a_typeconventional 8.235 8.840 21.98 <2e-16 ***
## s(Date2):a_typeorganic 8.181 8.818 58.18 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.874 Deviance explained = 88%
## -REML = -103.28 Scale est. = 0.025028 n = 338
tibble(x = avocado_sacramento$Date2,
a_type = avocado_sacramento$a_type,
fit = avocado_gam_3$fitted.values) %>%
ggplot(aes(x, fit, color = a_type))+
geom_line()+
geom_point(data = avocado_sacramento, aes(Date2, AveragePrice, color = a_type))+
labs(caption = avocado_gam_3$call)
gam.check()
gam.check(avocado_gam_3)
##
## Method: REML Optimizer: outer newton
## full convergence after 6 iterations.
## Gradient range [-1.923671e-07,5.659194e-08]
## (score -103.2803 & scale 0.02502825).
## Hessian positive definite, eigenvalue range [2.624738,167.158].
## Model rank = 20 / 20
##
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
##
## k' edf k-index p-value
## s(Date2):a_typeconventional 9.00 8.24 0.63 <2e-16 ***
## s(Date2):a_typeorganic 9.00 8.18 0.63 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
concurvity(avocado_gam_3)
## para s(Date2):a_typeconventional s(Date2):a_typeorganic
## worst 0.5 3.478637e-26 3.144981e-26
## observed 0.5 2.941044e-28 6.937989e-28
## estimate 0.5 5.170849e-29 5.214372e-29
Нужно смотреть на значение в строчке worst
, если оно высокое, значит мы наблюдаем concurvity.
concurvity(avocado_gam_3, full = FALSE)
## $worst
## para s(Date2):a_typeconventional
## para 1.000000e+00 1.628855e-26
## s(Date2):a_typeconventional 1.593285e-26 1.000000e+00
## s(Date2):a_typeorganic 1.579041e-26 8.138178e-31
## s(Date2):a_typeorganic
## para 1.574120e-26
## s(Date2):a_typeconventional 2.549073e-31
## s(Date2):a_typeorganic 1.000000e+00
##
## $observed
## para s(Date2):a_typeconventional
## para 1.000000e+00 1.673823e-28
## s(Date2):a_typeconventional 1.593285e-26 1.000000e+00
## s(Date2):a_typeorganic 1.579041e-26 1.004808e-31
## s(Date2):a_typeorganic
## para 3.497205e-28
## s(Date2):a_typeconventional 8.383371e-34
## s(Date2):a_typeorganic 1.000000e+00
##
## $estimate
## para s(Date2):a_typeconventional
## para 1.000000e+00 2.607191e-29
## s(Date2):a_typeconventional 1.593285e-26 1.000000e+00
## s(Date2):a_typeorganic 1.579041e-26 5.699764e-32
## s(Date2):a_typeorganic
## para 2.607286e-29
## s(Date2):a_typeconventional 1.687953e-33
## s(Date2):a_typeorganic 1.000000e+00
te()
, ti()
Что почитать, посмотреть?