library(tidyverse)
library(splines)
library(mgcv)

theme_set(theme_minimal()+theme(legend.position = "bottom"))

1 Полиномы, сплайны, GAM

1.1 Введение

Давайте на этой паре будем работать с таким абстрактным полиномом:

set.seed(42)
poly <- tibble(x = seq(0, 0.5, length.out = 500),
               y = 0.2*x^9*(10*(1-x))^6+10*(10*x)^2*(1-x)^10+rnorm(500, sd = 1.5))

poly %>% 
  ggplot(aes(x, y))+
  geom_point()

1.2 Полиномы

\(y_i = \sum_{j=0}^k \beta_j \times x^j_i + \epsilon_i = \beta_0 \times x^0_0 + \beta_1 \times x^1_i + ... \beta_k \times x^k_i + \epsilon_i\)

poly_fit1 <- lm(y ~ poly(x, 2, raw = TRUE), data = poly)
poly_fit2 <- lm(y ~ x+I(x^2), data = poly)

summary(poly_fit1)

## 
## Call:
## lm(formula = y ~ poly(x, 2, raw = TRUE), data = poly)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.4650 -1.2176 -0.0068  1.2366  5.2724 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               1.5279     0.2325   6.572 1.26e-10 ***
## poly(x, 2, raw = TRUE)1  11.7709     2.1479   5.480 6.77e-08 ***
## poly(x, 2, raw = TRUE)2 -12.9025     4.1589  -3.102  0.00203 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.74 on 497 degrees of freedom
## Multiple R-squared:  0.1777, Adjusted R-squared:  0.1744 
## F-statistic:  53.7 on 2 and 497 DF,  p-value: < 2.2e-16

summary(poly_fit2)

## 
## Call:
## lm(formula = y ~ x + I(x^2), data = poly)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.4650 -1.2176 -0.0068  1.2366  5.2724 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.5279     0.2325   6.572 1.26e-10 ***
## x            11.7709     2.1479   5.480 6.77e-08 ***
## I(x^2)      -12.9025     4.1589  -3.102  0.00203 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.74 on 497 degrees of freedom
## Multiple R-squared:  0.1777, Adjusted R-squared:  0.1744 
## F-statistic:  53.7 on 2 and 497 DF,  p-value: < 2.2e-16

tibble(x = poly$x,
       fit = poly_fit2$fitted.values) %>% 
  ggplot(aes(x, fit))+
  geom_point(data = poly, aes(x, y))+
  geom_line(color = "blue", size = 2)+
  labs(caption = poly_fit2$call)

poly_fit3 <- lm(y ~ poly(x, 3, raw = TRUE), data = poly)

summary(poly_fit3)

## 
## Call:
## lm(formula = y ~ poly(x, 3, raw = TRUE), data = poly)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.8629 -0.9488 -0.0517  0.9239  4.7424 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -0.8437     0.2636  -3.201  0.00146 ** 
## poly(x, 3, raw = TRUE)1   68.9759     4.5695  15.095  < 2e-16 ***
## poly(x, 3, raw = TRUE)2 -299.2139    21.2489 -14.081  < 2e-16 ***
## poly(x, 3, raw = TRUE)3  381.7486    27.9340  13.666  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.484 on 496 degrees of freedom
## Multiple R-squared:  0.4026, Adjusted R-squared:  0.399 
## F-statistic: 111.4 on 3 and 496 DF,  p-value: < 2.2e-16

tibble(x = poly$x,
       fit = poly_fit3$fitted.values) %>% 
  ggplot(aes(x, fit))+
  geom_point(data = poly, aes(x, y))+
  geom_line(color = "blue", size = 2)+
  labs(caption = poly_fit3$call)

Загрузите данные по продаже шампуня (Makridakis, Wheelwright and Hyndman 1998), используя функцию read_csv(). Постройте регрессию, моделирующую связь между переменными полиномом второй степени. В ответе превидите предсказание модели для 42 дня (с точностью до двух знаков после запятой).

Загрузите данные по продаже авакадо (см. исходное описание). Визуализируйте полученные данные (x = Date, y = AveragePrice, color = type), сделайте фасетизацию по переменной region, а при помощи функции geom_smooth(method = "lm", formula = ... нанесите на график полиномы 12-ой степени. В каком регионе, как кажется, наименьший разрыв в средней цене обычных и органических авокадо.

1.3 Сплайны

Альтернативный подход: сделать кусочную функцию. В результате данные делятся на узлы (knots), и в каждом узле получившаяся функция, а также ее первая и вторая производная должны быть плавными.

library(splines)
poly_fit3 <- lm(y ~ bs(x), data = poly)
summary(poly_fit3)

## 
## Call:
## lm(formula = y ~ bs(x), data = poly)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.8629 -0.9488 -0.0517  0.9239  4.7424 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -0.8437     0.2636  -3.201  0.00146 ** 
## bs(x)1       11.4960     0.7616  15.095  < 2e-16 ***
## bs(x)2       -1.9425     0.4835  -4.018 6.78e-05 ***
## bs(x)3        7.4030     0.4161  17.792  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.484 on 496 degrees of freedom
## Multiple R-squared:  0.4026, Adjusted R-squared:  0.399 
## F-statistic: 111.4 on 3 and 496 DF,  p-value: < 2.2e-16

tibble(x = poly$x,
       fit = poly_fit3$fitted.values) %>% 
  ggplot(aes(x, fit))+
  geom_line(color = "blue", size = 2)+
  geom_point(data = poly, aes(x, y))+
  labs(caption = poly_fit3$call) ->
  p1

as_tibble(cbind(bs(poly$x), x = poly$x)) %>% 
  gather(spline, value, -x) %>% 
  ggplot(aes(x, value, color = spline))+
  geom_line() ->
  s1

gridExtra::grid.arrange(p1, s1)

poly_fit4 <- lm(y ~ bs(x, knots = 5), data = poly)
summary(poly_fit4)

## 
## Call:
## lm(formula = y ~ bs(x, knots = 5), data = poly)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.8629 -0.9488 -0.0517  0.9239  4.7424 
## 
## Coefficients: (1 not defined because of singularities)
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -0.8437     0.2636  -3.201  0.00146 ** 
## bs(x, knots = 5)1  11.4960     0.7616  15.095  < 2e-16 ***
## bs(x, knots = 5)2  -1.9425     0.4835  -4.018 6.78e-05 ***
## bs(x, knots = 5)3   7.4030     0.4161  17.792  < 2e-16 ***
## bs(x, knots = 5)4       NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.484 on 496 degrees of freedom
## Multiple R-squared:  0.4026, Adjusted R-squared:  0.399 
## F-statistic: 111.4 on 3 and 496 DF,  p-value: < 2.2e-16

tibble(x = poly$x,
       fit = poly_fit4$fitted.values) %>% 
  ggplot(aes(x, fit))+
  geom_line(color = "blue", size = 2)+
  geom_point(dat = poly, aes(x,y))+
  labs(caption = poly_fit4$call) ->
  p2

as_tibble(cbind(bs(poly$x, knots = 5), x = poly$x)) %>% 
  gather(spline, value, -x) %>% 
  ggplot(aes(x, value, color = spline))+
  geom_line() ->
  s2

gridExtra::grid.arrange(p2, s2)

poly_fit5 <- lm(y ~ bs(x, degree = 5), data = poly)
summary(poly_fit5)

## 
## Call:
## lm(formula = y ~ bs(x, degree = 5), data = poly)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.4925 -0.9958  0.0199  0.9923  4.4759 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         0.03152    0.38659   0.082   0.9350    
## bs(x, degree = 5)1  1.72723    1.56785   1.102   0.2711    
## bs(x, degree = 5)2 13.75698    1.99809   6.885 1.76e-11 ***
## bs(x, degree = 5)3 -5.86063    2.39313  -2.449   0.0147 *  
## bs(x, degree = 5)4  4.75438    1.19050   3.994 7.50e-05 ***
## bs(x, degree = 5)5  5.80448    0.58906   9.854  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.466 on 494 degrees of freedom
## Multiple R-squared:  0.4197, Adjusted R-squared:  0.4138 
## F-statistic: 71.45 on 5 and 494 DF,  p-value: < 2.2e-16

tibble(x = poly$x,
       fit = poly_fit5$fitted.values) %>% 
  ggplot(aes(x, fit))+
  geom_line(color = "blue", size = 2)+
  geom_point(dat = poly, aes(x,y))+
  labs(caption = poly_fit5$call) ->
  p3

as_tibble(cbind(bs(poly$x, degree = 5), x = poly$x)) %>% 
  gather(spline, value, -x) %>% 
  ggplot(aes(x, value, color = spline))+
  geom_line() ->
  s3

gridExtra::grid.arrange(p3, s3)

Загрузите данные по продаже шампуня (Makridakis, Wheelwright and Hyndman 1998), используя функцию read_csv(). Постройте регрессию, моделирующую связь между переменными сплайнами второй степени. В ответе превидите предсказание модели для 42 дня (с точностью до двух знаков после запятой).

1.4 GAM

1.4.1 Что такое GAM

Согласно теореме Колмогорова – Арнольда, каждая непрерывная функция может быть представлена в виде композиции непрерывных функций одной переменной. Композиция — применение одной функции к результатам другой: \((G\circ F)(x) = G(F(x))\). Подробнее см. лекции Виктора Клепцына. Проблема осталась только одна: эта формула не дает никакого инсайта по поводу того, как эту функциб получить. На практике фитят теми же кубическими сплайнами (но не обязательно), используя Backfitting algorithm.

1.4.2 Ваш первый GAM

library(mgcv)
poly_fit4 <- gam(y ~ s(x), data = poly)
summary(poly_fit4)

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## y ~ s(x)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.39434    0.06571   51.66   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##        edf Ref.df     F p-value    
## s(x) 5.044  6.145 55.26  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.411   Deviance explained = 41.7%
## GCV =  2.185  Scale est. = 2.1586    n = 500

edf – effective degrees of freedom (edf = 1 — прямая, edf = 2 — полином второй степени и т. д.) Высокий edf не значит статистическую значимость: высокий edf может не иметь статистической значимости и наоборот.

tibble(x = poly$x,
       fit = poly_fit4$fitted.values) %>% 
  ggplot(aes(x, fit))+
  geom_line(color = "blue", size = 2)+
  geom_point(data = poly, aes(x, y))+
  labs(caption = poly_fit4$call)

В пакете mgcv достаточно много усилий положено для визуализации, так что имеет смысл посмотреть ?plot.gam. Когда вы рисуете модель используйте аргумент plot(your_model, all.terms = TRUE)

1.4.3 Параметры сглаживания

В GAM два параметра сглаживания:

коэффициент сглаживания
количество функций, которыми сглаживается

1.4.3.1 Настройка коэффициента сглаживания:

слишком большой

tibble(x = shampoo$day,
       fit = gam(sales_of_shampoo ~ s(day, sp = 50), data = shampoo)$fitted.values) %>% 
  ggplot(aes(x, fit))+
  geom_line(color = "blue", size = 2)+
  geom_point(data = shampoo, aes(day, sales_of_shampoo))+
  labs(caption = poly_fit4$call)

слишком маленький

tibble(x = shampoo$day,
       fit = gam(sales_of_shampoo ~ s(day, sp = 1e-5), data = shampoo)$fitted.values) %>% 
  ggplot(aes(x, fit))+
  geom_line(color = "blue", size = 2)+
  geom_point(data = shampoo, aes(day, sales_of_shampoo))+
  labs(caption = poly_fit4$call)

в самый раз… наверное…

tibble(x = shampoo$day,
       fit = gam(sales_of_shampoo ~ s(day, sp = 0.5), data = shampoo)$fitted.values) %>% 
  ggplot(aes(x, fit))+
  geom_line(color = "blue", size = 2)+
  geom_point(data = shampoo, aes(day, sales_of_shampoo))+
  labs(caption = poly_fit4$call)

а вот что выдает хваленный method = "REML":

tibble(x = shampoo$day,
       fit = gam(sales_of_shampoo ~ s(day), data = shampoo, method = "REML")$fitted.values) %>% 
  ggplot(aes(x, fit))+
  geom_line(color = "blue", size = 2)+
  geom_point(data = shampoo, aes(day, sales_of_shampoo))+
  labs(caption = poly_fit4$call)

1.4.3.2 Настройка количества базовых функций:

Не может превышать количества наблюдений.

слишком много

tibble(x = shampoo$day,
       fit = gam(sales_of_shampoo ~ s(day, k = 30), data = shampoo)$fitted.values) %>% 
  ggplot(aes(x, fit))+
  geom_line(color = "blue", size = 2)+
  geom_point(data = shampoo, aes(day, sales_of_shampoo))+
  labs(caption = poly_fit4$call)

слишком мало

tibble(x = shampoo$day,
       fit = gam(sales_of_shampoo ~ s(day, sp = 1), data = shampoo)$fitted.values) %>% 
  ggplot(aes(x, fit))+
  geom_line(color = "blue", size = 2)+
  geom_point(data = shampoo, aes(day, sales_of_shampoo))+
  labs(caption = poly_fit4$call)

1.4.4 Больше предикторов

Если вы хотите использовать категориальные предикторы переведите их в факторы!

avocado %>% 
  filter(region == "Sacramento") %>% 
  mutate(a_type = factor(a_type)) ->
  avocado_sacramento

## модель со "случайным свободным членом"
avocado_gam_1 <- gam(AveragePrice ~ s(Date2)+a_type, data = avocado_sacramento, method = "REML")
summary(avocado_gam_1)

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## AveragePrice ~ s(Date2) + a_type
## 
## Parametric coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.27396    0.01470   86.68   <2e-16 ***
## a_typeorganic  0.69521    0.02078   33.45   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##            edf Ref.df     F p-value    
## s(Date2) 8.317  8.871 42.71  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.816   Deviance explained = 82.1%
## -REML = -56.228  Scale est. = 0.036505  n = 338

tibble(x = avocado_sacramento$Date2,
       a_type = avocado_sacramento$a_type,
       fit = avocado_gam_1$fitted.values) %>% 
  ggplot(aes(x, fit, color = a_type))+
  geom_line()+
  geom_point(data = avocado_sacramento, aes(Date2, AveragePrice, color = a_type))+
  labs(caption = avocado_gam_1$call)

## модель со "случайным интерсептом"
avocado_gam_2 <- gam(AveragePrice ~ s(Date2, by = a_type), data = avocado_sacramento, method = "REML")
summary(avocado_gam_2)

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## AveragePrice ~ s(Date2, by = a_type)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.62157    0.02141   75.73   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                               edf Ref.df      F  p-value    
## s(Date2):a_typeconventional 4.364  5.368  3.747  0.00231 ** 
## s(Date2):a_typeorganic      5.172  6.287 11.793 2.03e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.218   Deviance explained =   24%
## -REML = 180.56  Scale est. = 0.15496   n = 338

tibble(x = avocado_sacramento$Date2,
       a_type = avocado_sacramento$a_type,
       fit = avocado_gam_2$fitted.values) %>% 
  ggplot(aes(x, fit, color = a_type))+
  geom_line()+
  geom_point(data = avocado_sacramento, aes(Date2, AveragePrice, color = a_type))+
  labs(caption = avocado_gam_2$call)

## модель со "случайным интерсептом и свободным членом"
avocado_gam_3 <- gam(AveragePrice ~ s(Date2, by = a_type)+a_type, data = avocado_sacramento, method = "REML")
summary(avocado_gam_3)

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## AveragePrice ~ s(Date2, by = a_type) + a_type
## 
## Parametric coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.27396    0.01217   104.7   <2e-16 ***
## a_typeorganic  0.69521    0.01721    40.4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                               edf Ref.df     F p-value    
## s(Date2):a_typeconventional 8.235  8.840 21.98  <2e-16 ***
## s(Date2):a_typeorganic      8.181  8.818 58.18  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.874   Deviance explained =   88%
## -REML = -103.28  Scale est. = 0.025028  n = 338

tibble(x = avocado_sacramento$Date2,
       a_type = avocado_sacramento$a_type,
       fit = avocado_gam_3$fitted.values) %>% 
  ggplot(aes(x, fit, color = a_type))+
  geom_line()+
  geom_point(data = avocado_sacramento, aes(Date2, AveragePrice, color = a_type))+
  labs(caption = avocado_gam_3$call)

1.4.5 `gam.check()`

gam.check(avocado_gam_3)

## 
## Method: REML   Optimizer: outer newton
## full convergence after 6 iterations.
## Gradient range [-1.923671e-07,5.659194e-08]
## (score -103.2803 & scale 0.02502825).
## Hessian positive definite, eigenvalue range [2.624738,167.158].
## Model rank =  20 / 20 
## 
## Basis dimension (k) checking results. Low p-value (k-index<1) may
## indicate that k is too low, especially if edf is close to k'.
## 
##                               k'  edf k-index p-value    
## s(Date2):a_typeconventional 9.00 8.24    0.63  <2e-16 ***
## s(Date2):a_typeorganic      9.00 8.18    0.63  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

1.4.6 Проверка на concurvity

concurvity(avocado_gam_3)

##          para s(Date2):a_typeconventional s(Date2):a_typeorganic
## worst     0.5                3.478637e-26           3.144981e-26
## observed  0.5                2.941044e-28           6.937989e-28
## estimate  0.5                5.170849e-29           5.214372e-29

Нужно смотреть на значение в строчке worst, если оно высокое, значит мы наблюдаем concurvity.

concurvity(avocado_gam_3, full = FALSE)

## $worst
##                                     para s(Date2):a_typeconventional
## para                        1.000000e+00                1.628855e-26
## s(Date2):a_typeconventional 1.593285e-26                1.000000e+00
## s(Date2):a_typeorganic      1.579041e-26                8.138178e-31
##                             s(Date2):a_typeorganic
## para                                  1.574120e-26
## s(Date2):a_typeconventional           2.549073e-31
## s(Date2):a_typeorganic                1.000000e+00
## 
## $observed
##                                     para s(Date2):a_typeconventional
## para                        1.000000e+00                1.673823e-28
## s(Date2):a_typeconventional 1.593285e-26                1.000000e+00
## s(Date2):a_typeorganic      1.579041e-26                1.004808e-31
##                             s(Date2):a_typeorganic
## para                                  3.497205e-28
## s(Date2):a_typeconventional           8.383371e-34
## s(Date2):a_typeorganic                1.000000e+00
## 
## $estimate
##                                     para s(Date2):a_typeconventional
## para                        1.000000e+00                2.607191e-29
## s(Date2):a_typeconventional 1.593285e-26                1.000000e+00
## s(Date2):a_typeorganic      1.579041e-26                5.699764e-32
##                             s(Date2):a_typeorganic
## para                                  2.607286e-29
## s(Date2):a_typeconventional           1.687953e-33
## s(Date2):a_typeorganic                1.000000e+00

1.5 Что дальше?

Logistic GAM
tensor smooths: te(), ti()

Что почитать, посмотреть?

Wood S. N. (2017) Generalized Additive Models: An Introduction with R
Ross N. - Nonlinear Modeling in R with GAMs версия онлайн, интерактивный сайт