Time Series Analysis Method Summary (2)


Yeongeun Jeon , Jung In Seo
10-02-2021

Loading the Data

pacman::p_load( "dplyr", "xts", 
                "forecast", "dlm", "bsts",
                "caret", "caretEnsemble",
                "ggplot2")

library(doParallel)
library(parallel)

# cl <- makePSOCKcluster(detectCores())
# clusterEvalQ(cl, library(foreach))
# registerDoParallel(cores=cl)

registerDoParallel(cores=detectCores())
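
# Optional teardown: registerDoParallel() above sets up an implicit cluster, which
# can be released once all model fitting is finished (kept commented out here).
# doParallel::stopImplicitCluster()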


# Load the data ---------------------------------------------------------------

# In Mac
# guess_encoding("Amtrak.csv")
# Amtrak.data <- read.csv("Amtrak.csv", fileEncoding="EUC-KR")

Amtrak.data <- read.csv("C:/Users/User/Desktop/Amtrak.csv")
Amtrak.data$Month <- as.Date(Amtrak.data$Month, format = "%d/%m/%Y")               # For ggplot
ridership.ts <- ts(Amtrak.data$Ridership, start=c(1991,1), end=c(2004,3), freq=12)

Data Splitting

train.ts     <- window(ridership.ts,start=c(1991,1), end=c(2001,3))   # Training Data
test.ts      <- window(ridership.ts,start=c(2001,4))                  # Test Data
n.test       <- length(test.ts)

Creating Predictor Variables

# 1. Fourier Term
FT      <- fourier(train.ts, K=2)                    # K : number of sine/cosine pairs; if the series has more than one seasonal period, one K is needed per period
FT.Test <- fourier(train.ts, K=2, h=n.test)
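
# Quick check of the Fourier design matrix: with K = 2 and monthly data it has
# two sine/cosine pairs, i.e. columns S1-12, C1-12, S2-12, C2-12.
head(FT, 3)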

# 2. Month
xts(ridership.ts, order.by = as.Date(ridership.ts))
               [,1]
1991-01-01 1708.917
1991-02-01 1620.586
1991-03-01 1972.715
1991-04-01 1811.665
1991-05-01 1974.964
1991-06-01 1862.356
1991-07-01 1939.860
1991-08-01 2013.264
1991-09-01 1595.657
1991-10-01 1724.924
1991-11-01 1675.667
1991-12-01 1813.863
1992-01-01 1614.827
1992-02-01 1557.088
1992-03-01 1891.223
1992-04-01 1955.981
1992-05-01 1884.714
1992-06-01 1623.042
1992-07-01 1903.309
1992-08-01 1996.712
1992-09-01 1703.897
1992-10-01 1810.000
1992-11-01 1861.601
1992-12-01 1875.122
1993-01-01 1705.259
1993-02-01 1618.535
1993-03-01 1836.709
1993-04-01 1957.043
1993-05-01 1917.185
1993-06-01 1882.398
1993-07-01 1933.009
1993-08-01 1996.167
1993-09-01 1672.841
1993-10-01 1752.827
1993-11-01 1720.377
1993-12-01 1734.292
1994-01-01 1563.365
1994-02-01 1573.959
1994-03-01 1902.639
1994-04-01 1833.888
1994-05-01 1831.049
1994-06-01 1775.755
1994-07-01 1867.508
1994-08-01 1906.608
1994-09-01 1685.632
1994-10-01 1778.546
1994-11-01 1775.995
1994-12-01 1783.350
1995-01-01 1548.415
1995-02-01 1496.925
1995-03-01 1798.316
1995-04-01 1732.895
1995-05-01 1772.345
1995-06-01 1761.207
1995-07-01 1791.655
1995-08-01 1874.820
1995-09-01 1571.309
1995-10-01 1646.948
1995-11-01 1672.631
1995-12-01 1656.845
1996-01-01 1381.758
1996-02-01 1360.852
1996-03-01 1558.575
1996-04-01 1608.420
1996-05-01 1696.696
1996-06-01 1693.183
1996-07-01 1835.516
1996-08-01 1942.573
1996-09-01 1551.401
1996-10-01 1686.508
1996-11-01 1576.204
1996-12-01 1700.433
1997-01-01 1396.588
1997-02-01 1371.690
1997-03-01 1707.522
1997-04-01 1654.604
1997-05-01 1762.903
1997-06-01 1775.800
1997-07-01 1934.219
1997-08-01 2008.055
1997-09-01 1615.924
1997-10-01 1773.910
1997-11-01 1732.368
1997-12-01 1796.626
1998-01-01 1570.330
1998-02-01 1412.691
1998-03-01 1754.641
1998-04-01 1824.932
1998-05-01 1843.289
1998-06-01 1825.964
1998-07-01 1968.172
1998-08-01 1921.645
1998-09-01 1669.597
1998-10-01 1791.474
1998-11-01 1816.714
1998-12-01 1846.754
1999-01-01 1599.427
1999-02-01 1548.804
1999-03-01 1832.333
1999-04-01 1839.720
1999-05-01 1846.498
1999-06-01 1864.852
1999-07-01 1965.743
1999-08-01 1949.002
1999-09-01 1607.373
1999-10-01 1803.664
1999-11-01 1850.309
1999-12-01 1836.435
2000-01-01 1541.660
2000-02-01 1616.928
2000-03-01 1919.538
2000-04-01 1971.493
2000-05-01 1992.301
2000-06-01 2009.763
2000-07-01 2053.996
2000-08-01 2097.471
2000-09-01 1823.706
2000-10-01 1976.997
2000-11-01 1981.408
2000-12-01 2000.153
2001-01-01 1683.148
2001-02-01 1663.404
2001-03-01 2007.928
2001-04-01 2023.792
2001-05-01 2047.008
2001-06-01 2072.913
2001-07-01 2126.717
2001-08-01 2202.638
2001-09-01 1707.693
2001-10-01 1950.716
2001-11-01 1973.614
2001-12-01 1984.729
2002-01-01 1759.629
2002-02-01 1770.595
2002-03-01 2019.912
2002-04-01 2048.398
2002-05-01 2068.763
2002-06-01 1994.267
2002-07-01 2075.258
2002-08-01 2026.560
2002-09-01 1734.155
2002-10-01 1916.771
2002-11-01 1858.345
2002-12-01 1996.352
2003-01-01 1778.033
2003-02-01 1749.489
2003-03-01 2066.466
2003-04-01 2098.899
2003-05-01 2104.911
2003-06-01 2129.671
2003-07-01 2223.349
2003-08-01 2174.360
2003-09-01 1931.406
2003-10-01 2121.470
2003-11-01 2076.054
2003-12-01 2140.677
2004-01-01 1831.508
2004-02-01 1838.006
2004-03-01 2132.446
Month  <- as.Date(ridership.ts) %>%                  # Extract dates
  lubridate::month()                                 # Extract the month

## Combine with the Fourier terms
Train.X <- cbind("Month"= Month[1:length(train.ts)], FT) 
Test.X  <- cbind("Month"= Month[-(1:length(train.ts))], FT.Test) 
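
# Sanity check (optional): the training and test predictor matrices should share
# the same columns (Month plus the four Fourier terms).
dim(Train.X); colnames(Test.X)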

Traditional Time Series Models


1. Regression with ARIMA error


Model Fitting

# Fit Trend + Seasonality Model
train.lm.trend.season <- tslm(train.ts ~ trend + I(trend^2) + season)  
summary(train.lm.trend.season)

Call:
tslm(formula = train.ts ~ trend + I(trend^2) + season)

Residuals:
     Min       1Q   Median       3Q      Max 
-213.775  -39.363    9.711   42.422  152.187 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  1.697e+03  2.768e+01  61.318  < 2e-16 ***
trend       -7.156e+00  7.293e-01  -9.812  < 2e-16 ***
I(trend^2)   6.074e-02  5.698e-03  10.660  < 2e-16 ***
season2     -4.325e+01  3.024e+01  -1.430  0.15556    
season3      2.600e+02  3.024e+01   8.598 6.60e-14 ***
season4      2.606e+02  3.102e+01   8.401 1.83e-13 ***
season5      2.938e+02  3.102e+01   9.471 6.89e-16 ***
season6      2.490e+02  3.102e+01   8.026 1.26e-12 ***
season7      3.606e+02  3.102e+01  11.626  < 2e-16 ***
season8      4.117e+02  3.102e+01  13.270  < 2e-16 ***
season9      9.032e+01  3.102e+01   2.911  0.00437 ** 
season10     2.146e+02  3.102e+01   6.917 3.29e-10 ***
season11     2.057e+02  3.103e+01   6.629 1.34e-09 ***
season12     2.429e+02  3.103e+01   7.829 3.44e-12 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 70.92 on 109 degrees of freedom
Multiple R-squared:  0.8246,    Adjusted R-squared:  0.8037 
F-statistic: 39.42 on 13 and 109 DF,  p-value: < 2.2e-16
# Forecast
train.lm.trend.season.pred <- forecast(train.lm.trend.season, h=n.test, level=0) 
train.lm.trend.season.pred
         Point Forecast     Lo 0     Hi 0
Apr 2001       2004.271 2004.271 2004.271
May 2001       2045.419 2045.419 2045.419
Jun 2001       2008.675 2008.675 2008.675
Jul 2001       2128.560 2128.560 2128.560
Aug 2001       2187.911 2187.911 2187.911
Sep 2001       1875.032 1875.032 1875.032
Oct 2001       2007.896 2007.896 2007.896
Nov 2001       2007.662 2007.662 2007.662
Dec 2001       2053.740 2053.740 2053.740
Jan 2002       1819.752 1819.752 1819.752
Feb 2002       1785.569 1785.569 1785.569
Mar 2002       2098.014 2098.014 2098.014
Apr 2002       2107.922 2107.922 2107.922
May 2002       2150.528 2150.528 2150.528
Jun 2002       2115.242 2115.242 2115.242
Jul 2002       2236.585 2236.585 2236.585
Aug 2002       2297.394 2297.394 2297.394
Sep 2002       1985.972 1985.972 1985.972
Oct 2002       2120.294 2120.294 2120.294
Nov 2002       2121.518 2121.518 2121.518
Dec 2002       2169.054 2169.054 2169.054
Jan 2003       1936.523 1936.523 1936.523
Feb 2003       1903.798 1903.798 1903.798
Mar 2003       2217.701 2217.701 2217.701
Apr 2003       2229.067 2229.067 2229.067
May 2003       2273.131 2273.131 2273.131
Jun 2003       2239.303 2239.303 2239.303
Jul 2003       2362.104 2362.104 2362.104
Aug 2003       2424.371 2424.371 2424.371
Sep 2003       2114.406 2114.406 2114.406
Oct 2003       2250.186 2250.186 2250.186
Nov 2003       2252.868 2252.868 2252.868
Dec 2003       2301.862 2301.862 2301.862
Jan 2004       2070.789 2070.789 2070.789
Feb 2004       2039.522 2039.522 2039.522
Mar 2004       2354.883 2354.883 2354.883
checkresiduals(train.lm.trend.season) 


    Breusch-Godfrey test for serial correlation of order up to 24

data:  Residuals from Linear regression model
LM test = 57.142, df = 24, p-value = 0.0001599
# Fit ARIMA Model to training residuals
train.res.arima <- auto.arima(train.lm.trend.season$residuals)
summary(train.res.arima)
Series: train.lm.trend.season$residuals 
ARIMA(1,0,1) with zero mean 

Coefficients:
         ar1      ma1
      0.7416  -0.2286
s.e.  0.0972   0.1448

sigma^2 estimated as 2822:  log likelihood=-662.39
AIC=1330.77   AICc=1330.97   BIC=1339.21

Training set error measures:
                      ME     RMSE      MAE      MPE     MAPE
Training set -0.01267266 52.68668 40.29069 131.2931 202.8981
                  MASE        ACF1
Training set 0.5787867 0.005531221
train.res.arima.pred <- forecast(train.res.arima, h=n.test, level=0)
train.res.arima.pred$mean
              Jan          Feb          Mar          Apr          May
2001                                        3.090696e+00 2.292185e+00
2002 2.097959e-01 1.555931e-01 1.153941e-01 8.558095e-02 6.347029e-02
2003 5.809218e-03 4.308351e-03 3.195248e-03 2.369725e-03 1.757484e-03
2004 1.608565e-04 1.192977e-04 8.847596e-05                          
              Jun          Jul          Aug          Sep          Oct
2001 1.699977e+00 1.260772e+00 9.350394e-01 6.934630e-01 5.143002e-01
2002 4.707214e-02 3.491060e-02 2.589111e-02 1.920190e-02 1.424090e-02
2003 1.303421e-03 9.666697e-04 7.169213e-04 5.316978e-04 3.943286e-04
2004                                                                 
              Nov          Dec
2001 3.814258e-01 2.828808e-01
2002 1.056163e-02 7.832931e-03
2003 2.924501e-04 2.168928e-04
2004                          
checkresiduals(train.res.arima$residuals, lag.max = 12)   # Uncorrelated.


Forecast

# Final Prediction
res.arima <- as.data.frame(train.res.arima.pred)[,1]
fore      <- as.data.frame(train.lm.trend.season.pred)[,1]
Improved_forecast <- apply(cbind(res.arima, fore), 1, sum)          # Add the forecasted residuals to the regression forecasts
Improved_forecast
 [1] 2007.362 2047.712 2010.375 2129.821 2188.846 1875.725 2008.410
 [8] 2008.043 2054.023 1819.961 1785.724 2098.129 2108.008 2150.592
[15] 2115.289 2236.620 2297.420 1985.991 2120.308 2121.528 2169.061
[22] 1936.529 1903.802 2217.704 2229.070 2273.133 2239.304 2362.105
[29] 2424.371 2114.407 2250.187 2252.868 2301.862 2070.790 2039.522
[36] 2354.883

Accuracy

Improved_forecast <- ts(Improved_forecast, start=c(2001,4), end=c(2004,3), freq=12)  # Convert as time-series

# Accuracy
acc.Regression <- accuracy(Improved_forecast, test.ts)
acc.Regression
                ME     RMSE     MAE       MPE     MAPE      ACF1
Test set -126.4976 153.2637 131.651 -6.447937 6.695432 0.7054902
         Theil's U
Test set 0.8962032

Prediction Plot

# Prediction Plot
ggplot(tail(Amtrak.data, n.test), aes(x = Month)) +
  geom_line(aes(y = Ridership, colour = "Observation", linetype = "Observation")) + 
  geom_line(aes(y = Improved_forecast, colour = "Prediction", linetype = "Prediction")) +
  scale_color_manual(name = "", values = c("Observation" = "black", "Prediction" = "blue")) +
  scale_linetype_manual("", values = c("Observation"= 1, "Prediction" = 2)) +
  theme_bw()


2. Dynamic Linear Model


Model Fitting

# Fit Dynamic linear model (DLM) 
DLM.model <- function(p){
  
  mod <- dlmModPoly(2) +  # Local linear trend
    dlmModSeas(12)
  
  V(mod)            <- exp(p[1])
  diag(W(mod))[1:3] <- exp(p[2:4])
  
  return(mod)  
}

mle1 <- dlmMLE(train.ts, parm = c(0.1,0.1,1,1), build = DLM.model )        # Estimate the parameters (error variances) via MLE
ifelse(mle1$convergence==0, print("converge"), print("did not converge") ) # Check Convergence
[1] "converge"
[1] "converge"
DLM.model.fit <- DLM.model(mle1$par)  # Fitting the DLM 
V(DLM.model.fit)
        [,1]
[1,] 1413.91
W(DLM.model.fit)
          [,1]      [,2]     [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
 [1,] 1143.988 0.0000000  0.00000    0    0    0    0    0    0     0
 [2,]    0.000 0.2040261  0.00000    0    0    0    0    0    0     0
 [3,]    0.000 0.0000000 14.47867    0    0    0    0    0    0     0
 [4,]    0.000 0.0000000  0.00000    0    0    0    0    0    0     0
 [5,]    0.000 0.0000000  0.00000    0    0    0    0    0    0     0
 [6,]    0.000 0.0000000  0.00000    0    0    0    0    0    0     0
 [7,]    0.000 0.0000000  0.00000    0    0    0    0    0    0     0
 [8,]    0.000 0.0000000  0.00000    0    0    0    0    0    0     0
 [9,]    0.000 0.0000000  0.00000    0    0    0    0    0    0     0
[10,]    0.000 0.0000000  0.00000    0    0    0    0    0    0     0
[11,]    0.000 0.0000000  0.00000    0    0    0    0    0    0     0
[12,]    0.000 0.0000000  0.00000    0    0    0    0    0    0     0
[13,]    0.000 0.0000000  0.00000    0    0    0    0    0    0     0
      [,11] [,12] [,13]
 [1,]     0     0     0
 [2,]     0     0     0
 [3,]     0     0     0
 [4,]     0     0     0
 [5,]     0     0     0
 [6,]     0     0     0
 [7,]     0     0     0
 [8,]     0     0     0
 [9,]     0     0     0
[10,]     0     0     0
[11,]     0     0     0
[12,]     0     0     0
[13,]     0     0     0

State Estimation

# Estimation for Kalman filtering
filtering <- dlmFilter(train.ts, DLM.model.fit)

# Plot estimation for filtering model
plot(dropFirst(filtering$f), col = "blue", lwd = 2, lty = 2, ylab = "Ridership")
lines(train.ts ,lty = 1, lwd = 2, col = "black") 
legend("bottomleft", legend = c("Data", "Fitted Data"), col = c("black", "blue"), lty = 1:2, lwd = 2)
# Plot for estimated states
par(mfrow=c(1,3))
plot(dropFirst(filtering$m[,1]), ylab = "Level")
plot(dropFirst(filtering$m[,2]), ylab = "Slope")
plot(dropFirst(filtering$m[,3]), ylab = "Seasonality")


Checking Normality

# Residual(Check independence and normality)
par(mfrow=c(1,1))
plot(residuals(filtering, sd = FALSE), ylab="Residual")
abline(h=0)
tsdiag(filtering, main = "Diagnostics for Regression Model")
# Check normality for error
qqnorm(residuals(filtering, sd = FALSE))
qqline(residuals(filtering, sd = FALSE))


Forecast

# Forecast
forecast_DLM <- dlmForecast(filtering, nAhead = n.test)  # Forecast(filtering model)
forecast_DLM$f
          Jan      Feb      Mar      Apr      May      Jun      Jul
2001                            1996.881 2024.378 1992.103 2100.250
2002 1743.912 1710.032 2013.061 2016.583 2044.080 2011.805 2119.952
2003 1763.614 1729.734 2032.762 2036.285 2063.782 2031.507 2139.654
2004 1783.316 1749.436 2052.464                                    
          Aug      Sep      Oct      Nov      Dec
2001 2148.355 1830.904 1959.455 1953.408 1993.073
2002 2168.057 1850.606 1979.157 1973.110 2012.775
2003 2187.759 1870.308 1998.859 1992.812 2032.477
2004                                             
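
dlmForecast also returns the forecast variances (component Q), so approximate 95% prediction intervals can be sketched alongside the point forecasts; DLM.lower and DLM.upper below are new names introduced only for this illustration.

# Approximate 95% prediction intervals from the DLM forecast variances
se        <- sqrt(unlist(forecast_DLM$Q))
DLM.lower <- forecast_DLM$f - qnorm(0.975) * se
DLM.upper <- forecast_DLM$f + qnorm(0.975) * se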

Accuracy

# Accuracy
acc.DLM <- accuracy(forecast_DLM$f, test.ts)
acc.DLM
               ME     RMSE      MAE       MPE     MAPE      ACF1
Test set 18.02393 67.51251 55.10431 0.8020044 2.784397 0.5948375
         Theil's U
Test set 0.3893913

Prediction Plot

# Prediction Plot
ggplot(tail(Amtrak.data, n.test), aes(x = Month)) +
  geom_line(aes(y = Ridership, colour = "Observation", linetype = "Observation")) + 
  geom_line(aes(y = forecast_DLM$f, colour = "Prediction", linetype = "Prediction")) +
  scale_color_manual(name = "", values = c("Observation" = "black", "Prediction" = "blue")) +
  scale_linetype_manual("", values = c("Observation"= 1, "Prediction" = 2)) +
  theme_bw()


3. Bootstrap and Bagging


Model Fitting

set.seed(100)
bagged <- train.ts %>%
  baggedModel(bld.mbb.bootstrap(train.ts, 100), fn = auto.arima)   # Generate 100 bootstrap sample sets and fit auto.arima to each one
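
# (Optional) A look at what the bagging averages over: a few moving-block-bootstrap
# replicates of the training series plotted against the original (fpp2-style sketch;
# boot.samples is a new object introduced only for this illustration).
boot.samples <- bld.mbb.bootstrap(train.ts, 10) %>%
  as.data.frame() %>%
  ts(start = c(1991, 1), frequency = 12)
autoplot(train.ts) +
  autolayer(boot.samples, colour = TRUE) +
  autolayer(train.ts, colour = FALSE) +
  ylab("Ridership") +
  guides(colour = "none") +
  theme_bw()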

Forecast

# Forecast
forecast_bagged <- bagged %>% 
  forecast(h = n.test)  # auto.arima tends to be more accurate than ets here / Forecast each ARIMA model fitted to a bootstrap sample set, then average the forecasts
forecast_bagged$mean
          Jan      Feb      Mar      Apr      May      Jun      Jul
2001                            2007.388 2043.424 2002.164 2114.229
2002 1782.007 1738.522 2047.303 2048.708 2083.499 2041.520 2153.640
2003 1823.123 1779.574 2090.513 2092.330 2127.087 2085.660 2198.148
2004 1868.835 1825.044 2135.085                                    
          Aug      Sep      Oct      Nov      Dec
2001 2162.860 1847.796 1980.844 1973.089 2012.932
2002 2203.439 1888.618 2021.920 2014.610 2055.572
2003 2248.026 1933.322 2065.527 2059.721 2100.603
2004                                             

Accuracy

# Accuracy
acc.Bagging <- accuracy(forecast_bagged$mean, test.ts)
acc.Bagging
                ME   RMSE      MAE       MPE     MAPE     ACF1
Test set -25.03095 64.975 45.95837 -1.365154 2.373141 0.507273
         Theil's U
Test set 0.3768781

Prediction Plot

# Prediction Plot
ggplot(tail(Amtrak.data, n.test), aes(x = Month)) +
  geom_line(aes(y = Ridership, colour = "Observation", linetype = "Observation")) + 
  geom_line(aes(y = forecast_bagged$mean, colour = "Prediction", linetype = "Prediction")) +
  scale_color_manual(name = "", values = c("Observation" = "black", "Prediction" = "blue")) +
  scale_linetype_manual("", values = c("Observation"= 1, "Prediction" = 2)) +
  theme_bw()


4. Dynamic Harmonic Regression


Model Fitting

DHR.fit <- auto.arima(train.ts, 
                      xreg = as.matrix(Train.X),      # Include Fourier Terms 
                      seasonal = FALSE)
DHR.fit
Series: train.ts 
Regression with ARIMA(2,1,2) errors 

Coefficients:
          ar1      ar2     ma1      ma2    Month     S1-12      C1-12
      -1.1998  -0.4967  0.2032  -0.4963  43.3345  109.4259  -150.3196
s.e.   0.1349   0.1294  0.1091   0.1353   9.2838   35.5583    12.9651
        S2-12     C2-12
      40.0290  -18.4092
s.e.  18.2969   12.3742

sigma^2 estimated as 9891:  log likelihood=-730.38
AIC=1480.76   AICc=1482.74   BIC=1508.8
checkresiduals(DHR.fit)


    Ljung-Box test

data:  Residuals from Regression with ARIMA(2,1,2) errors
Q* = 95.067, df = 15, p-value = 1.115e-13

Model df: 9.   Total lags used: 24

Forecast

# Forecast
DHR.forecast <- forecast(DHR.fit, xreg = as.matrix(Test.X))
DHR.forecast$mean
          Jan      Feb      Mar      Apr      May      Jun      Jul
2001                            1957.800 1980.577 2090.017 2020.626
2002 1651.827 1801.555 1912.521 1972.079 2010.833 2046.623 2057.661
2003 1647.319 1804.050 1911.767 1971.744 2011.609 2045.859 2058.193
2004 1647.258 1804.075 1911.768                                    
          Aug      Sep      Oct      Nov      Dec
2001 2048.011 1943.821 1891.313 1907.054 1999.249
2002 2025.131 1952.878 1891.812 1901.957 2005.117
2003 2024.872 1952.924 1891.885 1901.846 2005.213
2004                                             

Accuracy

# Accuracy
acc.DHR <- accuracy(DHR.forecast$mean, test.ts)
acc.DHR
               ME     RMSE      MAE      MPE     MAPE       ACF1
Test set 58.24237 118.5707 97.03389 2.697112 4.885403 0.02801658
         Theil's U
Test set 0.6815818

Prediction Plot

# Prediction Plot
ggplot(tail(Amtrak.data, n.test), aes(x = Month)) +
  geom_line(aes(y = Ridership, colour = "Observation", linetype = "Observation")) + 
  geom_line(aes(y = DHR.forecast$mean, colour = "Prediction", linetype = "Prediction")) +
  scale_color_manual(name = "", values = c("Observation" = "black", "Prediction" = "blue")) +
  scale_linetype_manual("", values = c("Observation"= 1, "Prediction" = 2)) +
  theme_bw()


5. STLM


Model Fitting

# Ref. https://github.com/robjhyndman/forecast/blob/master/R/mstl.R

STLM.fit <- train.ts %>%
  stlm(method = "arima", xreg = Train.X)            # Decompose the series and fit a DHR model (Fourier terms as predictors) to the trend + remainder components
STLM.fit$model
Series: x 
Regression with ARIMA(0,1,1) errors 

Coefficients:
          ma1   Month    S1-12   C1-12    S2-12    C2-12
      -0.4497  0.3528   6.5749  0.1468  -0.8428  -0.1513
s.e.   0.0958  2.2538  11.8652  8.6959   7.0494   6.1778

sigma^2 estimated as 2881:  log likelihood=-656.06
AIC=1326.11   AICc=1327.1   BIC=1345.74

Forecast

# Forecast
STLM.forecast <- forecast(STLM.fit, h = n.test,     # Forecast the trend + remainder, then add the seasonal forecast from the seasonal naive method (last observed value of the same season)
                          newxreg = Test.X)    
STLM.forecast$mean
          Jan      Feb      Mar      Apr      May      Jun      Jul
2001                            1999.378 2030.771 2016.893 2115.862
2002 1722.372 1688.243 1990.391 1999.378 2030.771 2016.893 2115.862
2003 1722.372 1688.243 1990.391 1999.378 2030.771 2016.893 2115.862
2004 1722.372 1688.243 1990.391                                    
          Aug      Sep      Oct      Nov      Dec
2001 2149.749 1831.360 1966.957 1959.655 1990.394
2002 2149.749 1831.360 1966.957 1959.655 1990.394
2003 2149.749 1831.360 1966.957 1959.655 1990.394
2004                                             

Accuracy

# Accuracy
acc.STLM <- accuracy(STLM.forecast$mean, test.ts)
acc.STLM
               ME     RMSE      MAE      MPE     MAPE     ACF1
Test set 38.04171 83.77231 70.30234 1.821433 3.555029 0.694755
         Theil's U
Test set 0.4885911

Prediction Plot

# Prediction Plot
ggplot(tail(Amtrak.data, n.test), aes(x = Month)) +
  geom_line(aes(y = Ridership, colour = "Observation", linetype = "Observation")) + 
  geom_line(aes(y = STLM.forecast$mean, colour = "Prediction", linetype = "Prediction")) +
  scale_color_manual(name = "", values = c("Observation" = "black", "Prediction" = "blue")) +
  scale_linetype_manual("", values = c("Observation"= 1, "Prediction" = 2)) +
  theme_bw()


6. Bayesian Structural Time Series Model


Model Fitting

Train.Data <- data.frame("y"= train.ts, Train.X[,1])     # Excluding Fourier Terms

ss <- list()
# Local Linear Trend
ss <- bsts::AddLocalLinearTrend(ss, train.ts)
# Seasonal
ss <- bsts::AddSeasonal(ss, train.ts, nseasons = 12, season.duration = 1) # cycle length = season.duration (time points per season) * nseasons

BSTS.fit <- bsts(y ~., state.specification = ss, data = Train.Data, niter = 1000, seed=100)  # niter : number of MCMC iterations
=-=-=-=-= Iteration 0 Thu Nov 18 00:58:30 2021
 =-=-=-=-=
=-=-=-=-= Iteration 100 Thu Nov 18 00:58:31 2021
 =-=-=-=-=
=-=-=-=-= Iteration 200 Thu Nov 18 00:58:31 2021
 =-=-=-=-=
=-=-=-=-= Iteration 300 Thu Nov 18 00:58:31 2021
 =-=-=-=-=
=-=-=-=-= Iteration 400 Thu Nov 18 00:58:32 2021
 =-=-=-=-=
=-=-=-=-= Iteration 500 Thu Nov 18 00:58:32 2021
 =-=-=-=-=
=-=-=-=-= Iteration 600 Thu Nov 18 00:58:32 2021
 =-=-=-=-=
=-=-=-=-= Iteration 700 Thu Nov 18 00:58:33 2021
 =-=-=-=-=
=-=-=-=-= Iteration 800 Thu Nov 18 00:58:33 2021
 =-=-=-=-=
=-=-=-=-= Iteration 900 Thu Nov 18 00:58:33 2021
 =-=-=-=-=
summary(BSTS.fit)
$residual.sd
[1] 38.38387

$prediction.sd
[1] 86.37815

$rsquare
[1] 0.9424972

$relative.gof
[1] 0.7358398

$size
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
0.00000 0.00000 0.00000 0.07092 0.00000 1.00000 

$coefficients
                  mean       sd mean.inc   sd.inc   inc.prob
Train.X...1. 0.1077952 0.539501 1.519912 1.414357 0.07092199
(Intercept)  0.0000000 0.000000 0.000000 0.000000 0.00000000
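
Before forecasting, the fitted state components (trend and seasonal) can be inspected with the plot method provided by the bsts package; this is an optional visual check.

# Posterior of the state components (trend and seasonal)
plot(BSTS.fit, "components")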

Forecast

# Forecast
burn <- SuggestBurn(0.1, BSTS.fit)
BSTS.forecast <- predict(BSTS.fit, horizon = n.test,             # horizon : number of forecast steps
                         burn = burn, newdata = Test.X[,1],      # newdata : data containing the predictor variables
                         quantiles = c(0.025, 0.975))  
BSTS.forecast$mean
 [1] 2008.649 2033.859 2005.471 2115.656 2159.608 1849.589 1985.606
 [8] 1980.895 2025.518 1771.432 1738.900 2042.580 2046.658 2077.231
[15] 2049.163 2155.931 2204.486 1889.507 2028.428 2029.719 2066.013
[22] 1816.823 1787.502 2091.551 2097.854 2122.777 2100.560 2208.762
[29] 2256.867 1945.645 2080.120 2076.238 2118.645 1870.599 1843.099
[36] 2145.638

Accuracy

# Accuracy
acc.BSTS <- accuracy(BSTS.forecast$mean, test.ts)
acc.BSTS
                ME     RMSE      MAE       MPE     MAPE      ACF1
Test set -29.77804 66.58763 45.96603 -1.599523 2.379736 0.5265906
         Theil's U
Test set 0.3875899

Prediction Plot

# Prediction Plot
ggplot(tail(Amtrak.data, n.test), aes(x = Month)) +
  geom_line(aes(y = Ridership, colour = "Observation", linetype = "Observation")) + 
  geom_line(aes(y = BSTS.forecast$mean, colour = "Prediction", linetype = "Prediction")) +
  scale_color_manual(name = "", values = c("Observation" = "black", "Prediction" = "blue")) +
  scale_linetype_manual("", values = c("Observation"= 1, "Prediction" = 2)) +
  theme_bw()


7. TBATS


Model Fitting

TBATS.fit <- train.ts %>%
  tbats(use.box.cox = FALSE,
        use.trend = TRUE,
        use.damped.trend = TRUE,
        use.parallel = TRUE,
        num.cores = detectCores())   # num.cores expects the number of cores, not a cluster object

summary(TBATS.fit)
                  Length Class  Mode     
lambda               0   -none- NULL     
alpha                1   -none- numeric  
beta                 1   -none- numeric  
damping.parameter    1   -none- numeric  
gamma.one.values     1   -none- numeric  
gamma.two.values     1   -none- numeric  
ar.coefficients      0   -none- NULL     
ma.coefficients      0   -none- NULL     
likelihood           1   -none- numeric  
optim.return.code    1   -none- numeric  
variance             1   -none- numeric  
AIC                  1   -none- numeric  
parameters           2   -none- list     
seed.states         12   -none- numeric  
fitted.values      123   ts     numeric  
errors             123   ts     numeric  
x                 1476   -none- numeric  
seasonal.periods     1   -none- numeric  
k.vector             1   -none- numeric  
y                  123   ts     numeric  
p                    1   -none- numeric  
q                    1   -none- numeric  
call                 7   -none- call     
series               1   -none- character
method               1   -none- character

Forecast

# Forecast
TBATS.forecast <- forecast(TBATS.fit, h=n.test)
TBATS.forecast$mean
          Jan      Feb      Mar      Apr      May      Jun      Jul
2001                            1978.995 2028.484 1959.201 2089.790
2002 1721.923 1658.583 1979.689 1966.634 2018.589 1951.281 2083.449
2003 1720.255 1657.248 1978.620 1965.779 2017.904 1950.732 2083.010
2004 1720.140 1657.155 1978.546                                    
          Aug      Sep      Oct      Nov      Dec
2001 2116.995 1814.845 1914.140 1924.798 1938.481
2002 2111.919 1810.782 1910.888 1922.195 1936.397
2003 2111.568 1810.501 1910.663 1922.015 1936.252
2004                                             

Accuracy

# Accuracy
acc.TBATS <- accuracy(TBATS.forecast$mean, test.ts)
acc.TBATS
               ME    RMSE      MAE      MPE    MAPE     ACF1
Test set 69.36452 103.114 88.31896 3.379683 4.42057 0.556916
         Theil's U
Test set 0.5999032

Prediction Plot

# Prediction Plot
ggplot(tail(Amtrak.data, n.test), aes(x = Month)) +
  geom_line(aes(y = Ridership, colour = "Observation", linetype = "Observation")) + 
  geom_line(aes(y = TBATS.forecast$mean, colour = "Prediction", linetype = "Prediction")) +
  scale_color_manual(name = "", values = c("Observation" = "black", "Prediction" = "blue")) +
  scale_linetype_manual("", values = c("Observation"= 1, "Prediction" = 2)) +
  theme_bw()


Forecast Comparison 1

Line Plot

# 1. Line Plot
Pred.Data <- data.frame("Date" = tail(Amtrak.data$Month, n.test),
                        "Observation" = test.ts,
                        "Regression" = Improved_forecast,
                        "DLM" = c(forecast_DLM$f),
                        "Bootstrap" = forecast_bagged$mean,
                        "DHR" = DHR.forecast$mean,
                        "STLM" = STLM.forecast$mean,
                        "BSTS" = BSTS.forecast$mean,
                        "TBATS" = TBATS.forecast$mean)


Pred.Models <- reshape2::melt(Pred.Data, id.vars="Date",
                              variable.name="Type",
                              value.name='y')

models <- unique(Pred.Models$Type)

lcols <- c("#000000", "#66CC99", "#ae5d8b", "#c1c084", "#d38b72", "#dc143c", "#00498c", "#9999CC")

ggplot(Pred.Models, aes(x=Date, y=y, group=Type)) +
  geom_line(aes(color=Type), size=1, linetype="solid") +
  labs(x="Date", y="Ridership") +                # y="log(Charging demand)"
  scale_color_manual(values = lcols, name = 'Models', labels = models) +
  #scale_x_datetime(date_breaks= "24 hour", date_labels = paste("%Y- %m-%d %H", "h", sep="")) +  # %Y- %m-%d %H:%M:%S"
  theme_bw() +
  theme(axis.text.y=element_text(size=16),
        axis.title=element_text(size=20),
        axis.text.x=element_text(angle=30, hjust=1, size=12),
        legend.title = element_text(size=18),
        legend.text = element_text(size=16)) 


Scatter Plot

# 2. Scatter Plot
Pred.Models2 <- reshape2::melt(Pred.Data[,-1], id.vars="Observation",
                              variable.name="Type",
                              value.name='Pred')

ggplot(Pred.Models2, aes(x=Observation, y=Pred)) + 
  geom_point(color="#d38b72") +
  geom_abline(intercept = 0, slope = 1, size=1, colour="#9999CC", linetype="dashed") +
  facet_wrap( ~ Type, ncol=2) +
  labs(x="Observation", y="Prediction", color = "") +
  theme_bw() +
  theme(axis.title=element_text(size=20),
        axis.text=element_text(size=15),
        strip.text.x = element_text(size=18, face="bold")) 
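
The plots above compare the forecasts visually; for a side-by-side numerical comparison, the accuracy objects computed in each section can also be stacked into a single table (Acc.Table1 is a new name introduced only for this summary).

# Combined accuracy table for the traditional time series models
Acc.Table1 <- rbind(acc.Regression, acc.DLM, acc.Bagging, acc.DHR, acc.STLM, acc.BSTS, acc.TBATS)
rownames(Acc.Table1) <- c("Regression", "DLM", "Bootstrap", "DHR", "STLM", "BSTS", "TBATS")
Acc.Table1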


Machine Learning Methods


Decomposition

# Ref. https://petolau.github.io/Regression-trees-for-forecasting-time-series-in-R/

decomp.ts <- stl(train.ts, s.window = "periodic", robust = TRUE)$time.series 
# decomp.ts <- mstl(Power.msts, s.window = "periodic", robust = TRUE)  # For multiple seasonal periods

# Target without Trend
Target <- decomp.ts %>% 
  data.frame %>%
  rowwise() %>%                                       # Operate row by row
  dplyr::mutate(y = sum(seasonal, remainder)) %>%     # Target = Seasonal + Remainder => detrended series
  dplyr::select(y)


Train.Data <- cbind(Target, Train.X)
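
# (Optional) Visual check of the STL components (seasonal, trend, remainder) used for detrending above.
plot(decomp.ts, main = "STL components of the training series")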

Trend

trend.part  <- data.frame(decomp.ts)$trend %>% # Only Trend of Training Data
  ts()

# Fitting ARIMA for Trend 
trend.fit.arima   <- auto.arima(trend.part)

# Forecast 
trend.arima.pred  <- forecast(trend.fit.arima, n.test)
trend.arima.pred$mean 
Time Series:
Start = 124 
End = 159 
Frequency = 1 
 [1] 1963.065 1966.790 1970.428 1974.067 1977.679 1981.291 1984.895
 [8] 1988.498 1992.099 1995.700 1999.301 2002.901 2006.501 2010.101
[15] 2013.701 2017.301 2020.901 2024.501 2028.101 2031.701 2035.301
[22] 2038.901 2042.501 2046.101 2049.701 2053.301 2056.901 2060.501
[29] 2064.101 2067.701 2071.301 2074.901 2078.501 2082.101 2085.701
[36] 2089.301

1. Neural Network Model


Model Fitting

set.seed(100)
neural_model <- nnetar(ts(Target, start=c(1991,1), end=c(2001,3), freq = 12),    # Or train.ts
                       xreg = Train.X,
                       repeats = 200, 
                       lambda = "auto") 

neural_model
Series: ts(Target, start = c(1991, 1), end = c(2001, 3), freq = 12) 
Model:  NNAR(1,1,4)[12] 
Call:   nnetar(y = ts(Target, start = c(1991, 1), end = c(2001, 3), freq = 12), 
    repeats = 200, xreg = Train.X, lambda = "auto")

Average of 200 networks, each of which is
a 7-4-1 network with 37 weights
options were - linear output units 

sigma^2 estimated as 2163

Forecast

# Forecast for Detrended
forecast_neu <- forecast(neural_model, PI = TRUE,  # PI : compute prediction intervals
                         h = n.test,
                         xreg = Test.X,
                         npaths = 100, level = 95) # npaths : number of simulated paths

# Final Prediction (Detrended + Trend)
NNAR.forecast <- c(forecast_neu$mean) + c(trend.arima.pred$mean) 

Accuracy

# Accuracy
acc.NNAR <- accuracy(NNAR.forecast, test.ts)
acc.NNAR
                ME     RMSE      MAE     MPE     MAPE      ACF1
Test set -30.87089 74.79254 47.02293 -1.6211 2.447256 0.5835289
         Theil's U
Test set 0.4346942

Prediction Plot

# Prediction Plot
ggplot(tail(Amtrak.data, n.test), aes(x = Month)) +
  geom_line(aes(y = Ridership, colour = "Observation", linetype = "Observation")) + 
  geom_line(aes(y = NNAR.forecast, colour = "Prediction", linetype = "Prediction")) +
  scale_color_manual(name = "", values = c("Observation" = "black", "Prediction" = "blue")) +
  scale_linetype_manual("", values = c("Observation"= 1, "Prediction" = 2)) +
  theme_bw()


2. Random Forest


Model Fitting

set.seed(100)
fitControl <- trainControl(method = "adaptive_cv",   # cv, repeatedcv
                           number = 5,
                           repeats = 5,
                           adaptive = list(min = 5,
                                           alpha = 0.05,
                                           method = "BT",
                                           complete = TRUE),
                           search = "random",
                           allowParallel = TRUE) 

RF <- function(train, tuneLength, ntree = 500, nodesize = 5){
  
  set.seed(100)                                        # Fix the seed for cross-validation
  caret.rf <- caret::train(y~., data = train, 
                           method = "parRF",           # Tune Parameter : mtry
                           trControl = fitControl,
                           tuneLength = tuneLength,   
                           ntree = ntree,             
                           nodesize = nodesize,        # nodesize : minimum size of terminal nodes
                           importance = TRUE)   
  
  return(caret.rf)
  
}

RF.Caret <- RF(Train.Data, 
               tuneLength = 2,     # tuneLength : number of candidate parameter values to search
               ntree = 100)        # ntree : number of trees to grow

RF.Caret
Parallel Random Forest 

123 samples
  5 predictor

No pre-processing
Resampling: Adaptively Cross-Validated (5 fold, repeated 5 times) 
Summary of sample sizes: 99, 99, 97, 99, 98, 99, ... 
Resampling results across tuning parameters:

  mtry  RMSE      Rsquared   MAE       Resamples
  1     52.87737  0.8892022  40.21114  25       
  3     50.21355  0.8949560  38.53876  25       

RMSE was used to select the optimal model using the smallest value.
The final value used for the model was mtry = 3.
RF.Caret$finalModel

Call:
 randomForest(x = "x", y = "y", ntree = 13, mtry = 3L, nodesize = 5,      importance = TRUE) 
               Type of random forest: regression
                     Number of trees: 104
No. of variables tried at each split: 3
RF.Caret$finalModel$tuneValue
  mtry
2    3

Forecast

# Final Prediction (Detrended + Trend)
Pred.RF <- predict(RF.Caret, Test.X) + trend.arima.pred$mean  

Accuracy

# Accuracy
acc_RF  <- accuracy(c(Pred.RF), test.ts)
acc_RF
                ME     RMSE      MAE       MPE     MAPE      ACF1
Test set -29.23913 71.90871 46.82079 -1.546831 2.423727 0.6008866
         Theil's U
Test set 0.4190109

Prediction Plot

# Prediction Plot
ggplot(tail(Amtrak.data, n.test), aes(x = Month)) +
  geom_line(aes(y = Ridership, colour = "Observation", linetype = "Observation")) + 
  geom_line(aes(y = Pred.RF, colour = "Prediction", linetype = "Prediction")) +
  scale_color_manual(name = "", values = c("Observation" = "black", "Prediction" = "blue")) +
  scale_linetype_manual("", values = c("Observation"= 1, "Prediction" = 2)) +
  theme_bw()


3. XGBoost


Model Fitting

set.seed(100)
fitControl <- trainControl(method = "adaptive_cv",   # cv, repeatedcv
                           number = 5,
                           repeats = 5,
                           adaptive = list(min = 5,
                                           alpha = 0.05,
                                           method = "BT",
                                           complete = TRUE),
                           search = "random",
                           allowParallel = TRUE) 

XGBoost <- function(train, tuneLength){
  
  set.seed(100)                                         # Fix the seed for cross-validation
  caret.xgb <- caret::train(y~., data = train, 
                            method = "xgbTree",          
                            trControl = fitControl,
                            # objective = "reg:squarederror", # error(The following parameters were provided multiple times)
                            tuneLength = tuneLength     # tuneLength : number of candidate parameter values to search
  )   
  
  return(caret.xgb)
  
}

XGB.Caret <- XGBoost(Train.Data, 2)
XGB.Caret
eXtreme Gradient Boosting 

123 samples
  5 predictor

No pre-processing
Resampling: Adaptively Cross-Validated (5 fold, repeated 5 times) 
Summary of sample sizes: 99, 99, 97, 99, 98, 99, ... 
Resampling results across tuning parameters:

  eta        max_depth  gamma     colsample_bytree  min_child_weight
  0.2228220  9          1.702621  0.6528662         3               
  0.4876292  3          5.465586  0.5499986         5               
  subsample  nrounds  RMSE      Rsquared   MAE       Resamples
  0.7517663  624      50.21422  0.8931412  38.60326  25       
  0.8219133  358      50.06911  0.8961369  38.41744  25       

RMSE was used to select the optimal model using the smallest value.
The final values used for the model were nrounds = 358, max_depth
 = 3, eta = 0.4876292, gamma = 5.465586, colsample_bytree
 = 0.5499986, min_child_weight = 5 and subsample = 0.8219133.
XGB.Caret$finalModel
##### xgb.Booster
raw: 181.9 Kb 
call:
  xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
    gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
    min_child_weight = param$min_child_weight, subsample = param$subsample), 
    data = x, nrounds = param$nrounds, objective = "reg:linear")
params (as set within xgb.train):
  eta = "0.48762916797353", max_depth = "3", gamma = "5.46558595029637", colsample_bytree = "0.549998590815812", min_child_weight = "5", subsample = "0.82191331172362", objective = "reg:linear", silent = "1"
xgb.attributes:
  niter
callbacks:
  cb.print.evaluation(period = print_every_n)
# of features: 5 
niter: 358
nfeatures : 5 
xNames : Month `S1-12` `C1-12` `S2-12` `C2-12` 
problemType : Regression 
tuneValue :
      nrounds max_depth       eta    gamma colsample_bytree
2     358         3 0.4876292 5.465586        0.5499986
  min_child_weight subsample
2                5 0.8219133
obsLevels : NA 
param :
    list()
XGB.Caret$finalModel$tuneValue
  nrounds max_depth       eta    gamma colsample_bytree
2     358         3 0.4876292 5.465586        0.5499986
  min_child_weight subsample
2                5 0.8219133

Forecast

# Final Prediction (Detrended + Trend)
Pred.XGB  <- predict(XGB.Caret, Test.X) + trend.arima.pred$mean

Accuracy

# Accuracy
acc_XGB   <- accuracy(c(Pred.XGB), test.ts)
acc_XGB
                ME     RMSE      MAE       MPE     MAPE      ACF1
Test set -27.27012 68.20048 47.95543 -1.475506 2.477208 0.5023489
         Theil's U
Test set 0.3950394

Prediction Plot

# Prediction Plot
ggplot(tail(Amtrak.data, n.test), aes(x = Month)) +
  geom_line(aes(y = Ridership, colour = "Observation", linetype = "Observation")) + 
  geom_line(aes(y = Pred.XGB, colour = "Prediction", linetype = "Prediction")) +
  scale_color_manual(name = "", values = c("Observation" = "black", "Prediction" = "blue")) +
  scale_linetype_manual("", values = c("Observation"= 1, "Prediction" = 2)) +
  theme_bw()


4. Stacking


Model Fitting

# Ref. https://cran.r-project.org/web/packages/caretEnsemble/vignettes/caretEnsemble-intro.html
#      https://github.com/zachmayer/caretEnsemble/blob/master/R/caretStack.R


# 1. Modeling for Stacking (Declare Individual Model)
set.seed(100)
fitControl <- trainControl(method = "repeatedcv",        # adaptive_cv
                           number = 5,
                           repeats = 5,
                           # adaptive = list(min = 5,
                           #                 alpha = 0.05,
                           #                 method = "BT",
                           #                 complete = TRUE),
                           # search = "random",            # grid
                           savePredictions = "final",      # Save predictions for the optimal parameters
                           # classProbs = TRUE,            # Save class probabilities (classification only)
                           index = createResample(Train.Data$y, 1),  # index : resampling index (row numbers used for training) / createResample : bootstrap resampling
                           allowParallel = TRUE) 


# Hyperparameter values tuned on the original training data
alg_tune_list <- list(                            # Do not use custom names in list. Will give prediction error with greedy ensemble. Bug in caret.
  parRF = caretModelSpec(method="parRF",
                         importance = TRUE,
                         nodesize = 5,
                         tuneGrid = expand.grid(mtry=RF.Caret$finalModel$tuneValue$mtry)),
  xgbTree = caretModelSpec(method="xgbTree",
                           tuneGrid = expand.grid(nrounds = XGB.Caret$finalModel$tuneValue$nrounds,
                                                  max_depth = XGB.Caret$finalModel$tuneValue$max_depth,
                                                  eta = XGB.Caret$finalModel$tuneValue$eta,
                                                  gamma = XGB.Caret$finalModel$tuneValue$gamma,
                                                  colsample_bytree = XGB.Caret$finalModel$tuneValue$colsample_bytree,
                                                  min_child_weight = XGB.Caret$finalModel$tuneValue$min_child_weight,
                                                  subsample = XGB.Caret$finalModel$tuneValue$subsample)))

set.seed(100)
multi_mod <- caretList(y~., data = Train.Data, trControl = fitControl, 
                       tuneList = alg_tune_list)  # search = "grid"     


multi_mod$parRF
Parallel Random Forest 

123 samples
  5 predictor

No pre-processing
Resampling: Cross-Validated (5 fold, repeated 5 times) 
Summary of sample sizes: 123 
Resampling results:

  RMSE      Rsquared   MAE    
  55.73908  0.8368916  42.4932

Tuning parameter 'mtry' was held constant at a value of 3
multi_mod$xgbTree
eXtreme Gradient Boosting 

123 samples
  5 predictor

No pre-processing
Resampling: Cross-Validated (5 fold, repeated 5 times) 
Summary of sample sizes: 123 
Resampling results:

  RMSE      Rsquared  MAE     
  58.30165  0.827723  45.50672

Tuning parameter 'nrounds' was held constant at a value of 358
Tuning parameter 'max_depth' was held constant at a value of 3
Tuning parameter 'eta' was held constant at a value of 0.4876292
Tuning parameter 'gamma' was held constant at a value of 5.465586
Tuning parameter 'colsample_bytree' was held constant at a value of 0.5499986
Tuning parameter 'min_child_weight' was held constant at a value of 5
Tuning parameter 'subsample' was held constant at a value of 0.8219133
# 2. Stacking (the final model is trained on the combined predictions of the individual models)

set.seed(100)
stackControl <- trainControl(method = "adaptive_cv",
                             number = 5,
                             repeats = 5,
                             adaptive = list(min = 5,
                                             alpha = 0.05,
                                             method = "BT",
                                             complete = TRUE),
                             search = "random",
                             allowParallel = TRUE) 

set.seed(100)
stack.xgb <- caretStack(multi_mod, method = "xgbTree",  # Final Model
                        trControl = stackControl, 
                        tuneLength = 2)                 # Number of candidate parameter sets
stack.xgb
A xgbTree ensemble of 2 base models: parRF, xgbTree

Ensemble results:
eXtreme Gradient Boosting 

38 samples
 2 predictor

No pre-processing
Resampling: Adaptively Cross-Validated (5 fold, repeated 5 times) 
Summary of sample sizes: 30, 31, 31, 30, 30, 30, ... 
Resampling results across tuning parameters:

  eta        max_depth  gamma     colsample_bytree  min_child_weight
  0.2228220  9          1.702621  0.6528662         3               
  0.4876292  3          5.465586  0.5499986         5               
  subsample  nrounds  RMSE      Rsquared  MAE       Resamples
  0.7517663  624      48.62987  0.929678  38.75318  25       
  0.8219133  358      57.27670  0.924131  43.13093  25       

RMSE was used to select the optimal model using the smallest value.
The final values used for the model were nrounds = 624, max_depth
 = 9, eta = 0.222822, gamma = 1.702621, colsample_bytree =
 0.6528662, min_child_weight = 3 and subsample = 0.7517663.
stack.xgb$ens_model$finalModel
##### xgb.Booster
raw: 357.5 Kb 
call:
  xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
    gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
    min_child_weight = param$min_child_weight, subsample = param$subsample), 
    data = x, nrounds = param$nrounds, objective = "reg:linear")
params (as set within xgb.train):
  eta = "0.222822001739405", max_depth = "9", gamma = "1.70262051047757", colsample_bytree = "0.652866207249463", min_child_weight = "3", subsample = "0.751766284287442", objective = "reg:linear", silent = "1"
xgb.attributes:
  niter
callbacks:
  cb.print.evaluation(period = print_every_n)
# of features: 2 
niter: 624
nfeatures : 2 
xNames : parRF xgbTree 
problemType : Regression 
tuneValue :
      nrounds max_depth      eta    gamma colsample_bytree
1     624         9 0.222822 1.702621        0.6528662
  min_child_weight subsample
1                3 0.7517663
obsLevels : NA 
param :
    list()
stack.xgb$ens_model$finalModel$tuneValue
  nrounds max_depth      eta    gamma colsample_bytree
1     624         9 0.222822 1.702621        0.6528662
  min_child_weight subsample
1                3 0.7517663

Forecast

# Final Prediction (Detrended + Trend)
stack.Pred.XGB <- predict(stack.xgb, Test.X) + trend.arima.pred$mean

Accuracy

# Accuracy
acc_stack.XGB  <- accuracy(c(stack.Pred.XGB), test.ts)
acc_stack.XGB
                ME     RMSE      MAE       MPE     MAPE      ACF1
Test set -29.89315 88.01831 65.54621 -1.617441 3.344385 0.3068341
         Theil's U
Test set  0.512367

Prediction Plot

# Prediction Plot
ggplot(tail(Amtrak.data, n.test), aes(x = Month)) +
  geom_line(aes(y = Ridership, colour = "Observation", linetype = "Observation")) + 
  geom_line(aes(y = stack.Pred.XGB, colour = "Prediction", linetype = "Prediction")) +
  scale_color_manual(name = "", values = c("Observation" = "black", "Prediction" = "blue")) +
  scale_linetype_manual("", values = c("Observation"= 1, "Prediction" = 2)) +
  theme_bw()


5. Stacking with GLM


Model Fitting

# Ref. https://github.com/zachmayer/caretEnsemble/blob/master/R/caretEnsemble.R

set.seed(100)
greedyEnsemble <- caretEnsemble(multi_mod, trControl = trainControl(method = "cv", number=5))
greedyEnsemble
A glm ensemble of 2 base models: parRF, xgbTree

Ensemble results:
Generalized Linear Model 

38 samples
 2 predictor

No pre-processing
Resampling: Cross-Validated (5 fold) 
Summary of sample sizes: 30, 31, 31, 30, 30 
Resampling results:

  RMSE      Rsquared   MAE     
  58.25038  0.8224663  45.03724
greedyEnsemble$ens_model$finalModel

Call:  NULL

Coefficients:
(Intercept)        parRF      xgbTree  
   -8.46746      0.93423      0.02958  

Degrees of Freedom: 37 Total (i.e. Null);  35 Residual
Null Deviance:      703200 
Residual Deviance: 114700   AIC: 420.3
summary(greedyEnsemble)
The following models were ensembled: parRF, xgbTree 
They were weighted: 
-8.4675 0.9342 0.0296
The resulting RMSE is: 58.2504
The fit for each individual model on the RMSE is: 
  method     RMSE RMSESD
   parRF 55.73908     NA
 xgbTree 58.30165     NA

Forecast

# Final Prediction (Detrended + Trend)
stack.Pred.GLM <- predict(greedyEnsemble, Test.X) + trend.arima.pred$mean

Accuracy

# Accuracy
acc_stack.GLM <- accuracy(c(stack.Pred.GLM), test.ts)
acc_stack.GLM
                ME    RMSE      MAE      MPE    MAPE      ACF1
Test set -19.94278 66.9187 44.73519 -1.09601 2.30595 0.5677311
         Theil's U
Test set  0.388657

Prediction Plot

# Prediction Plot
ggplot(tail(Amtrak.data, n.test), aes(x = Month)) +
  geom_line(aes(y = Ridership, colour = "Observation", linetype = "Observation")) + 
  geom_line(aes(y = stack.Pred.GLM, colour = "Prediction", linetype = "Prediction")) +
  scale_color_manual(name = "", values = c("Observation" = "black", "Prediction" = "blue")) +
  scale_linetype_manual("", values = c("Observation"= 1, "Prediction" = 2)) +
  theme_bw()


Forecast Comparison 2

Line Plot

# 1. Line Plot
Pred.Data3 <- data.frame("Date" = tail(Amtrak.data$Month, n.test),
                         "Observation" = test.ts,
                         "NNAR" = NNAR.forecast,
                         "RF" = c(Pred.RF),
                         "XGBoost" = c(Pred.XGB),
                         "Stack.XGBoost" = c(stack.Pred.XGB),
                         "Stack.GLM" = c(stack.Pred.GLM))


Pred.Models3 <- reshape2::melt(Pred.Data3, id.vars="Date",
                               variable.name="Type",
                               value.name='y')

models2 <- unique(Pred.Models3$Type)

lcols <- c("#000000", "#66CC99", "#ae5d8b", "#c1c084", "#d38b72", "#dc143c", "#00498c", "#9999CC")

ggplot(Pred.Models3, aes(x=Date, y=y, group=Type)) +
  geom_line(aes(color=Type), size=1, linetype="solid") +
  labs(x="Date", y="Ridership") +                # y="log(Charging demand)"
  scale_color_manual(values = lcols, name = 'Models', labels = models2) +
  #scale_x_datetime(date_breaks= "24 hour", date_labels = paste("%Y- %m-%d %H", "h", sep="")) +  # %Y- %m-%d %H:%M:%S"
  theme_bw() +
  theme(axis.text.y=element_text(size=16),
        axis.title=element_text(size=20),
        axis.text.x=element_text(angle=30, hjust=1, size=12),
        legend.title = element_text(size=18),
        legend.text = element_text(size=16))  


Scatter Plot

# 2. Scatter Plot
Pred.Models4 <- reshape2::melt(Pred.Data3[,-1], id.vars="Observation",
                               variable.name="Type",
                               value.name='Pred')

ggplot(Pred.Models4, aes(x=Observation, y=Pred)) + 
  geom_point(color="#d38b72") +
  geom_abline(intercept = 0, slope = 1, size=1, colour="#9999CC", linetype="dashed") +
  facet_wrap( ~ Type, ncol=2) +
  labs(x="Observation", y="Prediction", color = "") +
  theme_bw() +
  theme(axis.title=element_text(size=20),
        axis.text=element_text(size=15),
        strip.text.x = element_text(size=18, face="bold")) 
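
As with the traditional models, the machine-learning accuracy objects can be collected into a single table for a quick numerical comparison (Acc.Table2 is a new name introduced only for this summary).

# Combined accuracy table for the machine learning models
Acc.Table2 <- rbind(acc.NNAR, acc_RF, acc_XGB, acc_stack.XGB, acc_stack.GLM)
rownames(Acc.Table2) <- c("NNAR", "RF", "XGBoost", "Stack.XGBoost", "Stack.GLM")
Acc.Table2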
