Time Series Analysis Method Summary
pacman::p_load( "dplyr", "xts",
"forecast", "dlm", "bsts",
"caret", "caretEnsemble",
"ggplot2")
library(doParallel)
library(parallel)
# cl <- makePSOCKcluster(detectCores())
# clusterEvalQ(cl, library(foreach))
# registerDoParallel(cores=cl)
registerDoParallel(cores=detectCores())
# Load data ----------------------------------------------------------------
# In Mac
# guess_encoding("Amtrak.csv")
# Amtrak.data <- read.csv("Amtrak.csv", fileEncoding="EUC-KR")
Amtrak.data <- read.csv("C:/Users/User/Desktop/Amtrak.csv")
Amtrak.data$Month <- as.Date(Amtrak.data$Month, format = "%d/%m/%Y") # For ggplot
ridership.ts <- ts(Amtrak.data$Ridership, start=c(1991,1), end=c(2004,3), freq=12)
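The train/test split objects (n.test, train.ts, test.ts) used throughout the rest of the code are not reproduced in this listing. A minimal sketch consistent with the outputs below (training data through March 2001, 36 test months ending March 2004):
n.test   <- 36                                       # Apr 2001 - Mar 2004
train.ts <- window(ridership.ts, end = c(2001, 3))   # Jan 1991 - Mar 2001 (123 observations)
test.ts  <- window(ridership.ts, start = c(2001, 4)) # Apr 2001 - Mar 2004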
# 1. Fourier Terms
FT <- fourier(train.ts, K=2) # K: number of sine/cosine pairs; when the series has more than one seasonal period, one K is needed per seasonal period
FT.Test <- fourier(train.ts, K=2, h=n.test)
# 2. Month (used below to build the Month predictor; see the sketch after this printout)
xts(ridership.ts, order.by = as.Date(ridership.ts))
[,1]
1991-01-01 1708.917
1991-02-01 1620.586
1991-03-01 1972.715
1991-04-01 1811.665
1991-05-01 1974.964
1991-06-01 1862.356
1991-07-01 1939.860
1991-08-01 2013.264
1991-09-01 1595.657
1991-10-01 1724.924
1991-11-01 1675.667
1991-12-01 1813.863
1992-01-01 1614.827
1992-02-01 1557.088
1992-03-01 1891.223
1992-04-01 1955.981
1992-05-01 1884.714
1992-06-01 1623.042
1992-07-01 1903.309
1992-08-01 1996.712
1992-09-01 1703.897
1992-10-01 1810.000
1992-11-01 1861.601
1992-12-01 1875.122
1993-01-01 1705.259
1993-02-01 1618.535
1993-03-01 1836.709
1993-04-01 1957.043
1993-05-01 1917.185
1993-06-01 1882.398
1993-07-01 1933.009
1993-08-01 1996.167
1993-09-01 1672.841
1993-10-01 1752.827
1993-11-01 1720.377
1993-12-01 1734.292
1994-01-01 1563.365
1994-02-01 1573.959
1994-03-01 1902.639
1994-04-01 1833.888
1994-05-01 1831.049
1994-06-01 1775.755
1994-07-01 1867.508
1994-08-01 1906.608
1994-09-01 1685.632
1994-10-01 1778.546
1994-11-01 1775.995
1994-12-01 1783.350
1995-01-01 1548.415
1995-02-01 1496.925
1995-03-01 1798.316
1995-04-01 1732.895
1995-05-01 1772.345
1995-06-01 1761.207
1995-07-01 1791.655
1995-08-01 1874.820
1995-09-01 1571.309
1995-10-01 1646.948
1995-11-01 1672.631
1995-12-01 1656.845
1996-01-01 1381.758
1996-02-01 1360.852
1996-03-01 1558.575
1996-04-01 1608.420
1996-05-01 1696.696
1996-06-01 1693.183
1996-07-01 1835.516
1996-08-01 1942.573
1996-09-01 1551.401
1996-10-01 1686.508
1996-11-01 1576.204
1996-12-01 1700.433
1997-01-01 1396.588
1997-02-01 1371.690
1997-03-01 1707.522
1997-04-01 1654.604
1997-05-01 1762.903
1997-06-01 1775.800
1997-07-01 1934.219
1997-08-01 2008.055
1997-09-01 1615.924
1997-10-01 1773.910
1997-11-01 1732.368
1997-12-01 1796.626
1998-01-01 1570.330
1998-02-01 1412.691
1998-03-01 1754.641
1998-04-01 1824.932
1998-05-01 1843.289
1998-06-01 1825.964
1998-07-01 1968.172
1998-08-01 1921.645
1998-09-01 1669.597
1998-10-01 1791.474
1998-11-01 1816.714
1998-12-01 1846.754
1999-01-01 1599.427
1999-02-01 1548.804
1999-03-01 1832.333
1999-04-01 1839.720
1999-05-01 1846.498
1999-06-01 1864.852
1999-07-01 1965.743
1999-08-01 1949.002
1999-09-01 1607.373
1999-10-01 1803.664
1999-11-01 1850.309
1999-12-01 1836.435
2000-01-01 1541.660
2000-02-01 1616.928
2000-03-01 1919.538
2000-04-01 1971.493
2000-05-01 1992.301
2000-06-01 2009.763
2000-07-01 2053.996
2000-08-01 2097.471
2000-09-01 1823.706
2000-10-01 1976.997
2000-11-01 1981.408
2000-12-01 2000.153
2001-01-01 1683.148
2001-02-01 1663.404
2001-03-01 2007.928
2001-04-01 2023.792
2001-05-01 2047.008
2001-06-01 2072.913
2001-07-01 2126.717
2001-08-01 2202.638
2001-09-01 1707.693
2001-10-01 1950.716
2001-11-01 1973.614
2001-12-01 1984.729
2002-01-01 1759.629
2002-02-01 1770.595
2002-03-01 2019.912
2002-04-01 2048.398
2002-05-01 2068.763
2002-06-01 1994.267
2002-07-01 2075.258
2002-08-01 2026.560
2002-09-01 1734.155
2002-10-01 1916.771
2002-11-01 1858.345
2002-12-01 1996.352
2003-01-01 1778.033
2003-02-01 1749.489
2003-03-01 2066.466
2003-04-01 2098.899
2003-05-01 2104.911
2003-06-01 2129.671
2003-07-01 2223.349
2003-08-01 2174.360
2003-09-01 1931.406
2003-10-01 2121.470
2003-11-01 2076.054
2003-12-01 2140.677
2004-01-01 1831.508
2004-02-01 1838.006
2004-03-01 2132.446
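The predictor matrices Train.X and Test.X used by several models below are also not reproduced here. A minimal sketch, assuming they combine the calendar month with the Fourier terms created above (the column names Month, S1-12, C1-12, S2-12, C2-12 match the model output shown later):
Month   <- as.numeric(cycle(ridership.ts))                  # calendar month index (1-12)
Train.X <- cbind(Month = head(Month, length(train.ts)), FT)
Test.X  <- cbind(Month = tail(Month, n.test), FT.Test)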
A regression model with both trend and seasonality was fit using the tslm() function from the forecast package.
# Fit Trend + Seasonality Model
train.lm.trend.season <- tslm(train.ts ~ trend + I(trend^2) + season)
summary(train.lm.trend.season)
Call:
tslm(formula = train.ts ~ trend + I(trend^2) + season)
Residuals:
Min 1Q Median 3Q Max
-213.775 -39.363 9.711 42.422 152.187
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.697e+03 2.768e+01 61.318 < 2e-16 ***
trend -7.156e+00 7.293e-01 -9.812 < 2e-16 ***
I(trend^2) 6.074e-02 5.698e-03 10.660 < 2e-16 ***
season2 -4.325e+01 3.024e+01 -1.430 0.15556
season3 2.600e+02 3.024e+01 8.598 6.60e-14 ***
season4 2.606e+02 3.102e+01 8.401 1.83e-13 ***
season5 2.938e+02 3.102e+01 9.471 6.89e-16 ***
season6 2.490e+02 3.102e+01 8.026 1.26e-12 ***
season7 3.606e+02 3.102e+01 11.626 < 2e-16 ***
season8 4.117e+02 3.102e+01 13.270 < 2e-16 ***
season9 9.032e+01 3.102e+01 2.911 0.00437 **
season10 2.146e+02 3.102e+01 6.917 3.29e-10 ***
season11 2.057e+02 3.103e+01 6.629 1.34e-09 ***
season12 2.429e+02 3.103e+01 7.829 3.44e-12 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 70.92 on 109 degrees of freedom
Multiple R-squared: 0.8246, Adjusted R-squared: 0.8037
F-statistic: 39.42 on 13 and 109 DF, p-value: < 2.2e-16
# Forecast
train.lm.trend.season.pred <- forecast(train.lm.trend.season, h=n.test, level=0)
train.lm.trend.season.pred
Point Forecast Lo 0 Hi 0
Apr 2001 2004.271 2004.271 2004.271
May 2001 2045.419 2045.419 2045.419
Jun 2001 2008.675 2008.675 2008.675
Jul 2001 2128.560 2128.560 2128.560
Aug 2001 2187.911 2187.911 2187.911
Sep 2001 1875.032 1875.032 1875.032
Oct 2001 2007.896 2007.896 2007.896
Nov 2001 2007.662 2007.662 2007.662
Dec 2001 2053.740 2053.740 2053.740
Jan 2002 1819.752 1819.752 1819.752
Feb 2002 1785.569 1785.569 1785.569
Mar 2002 2098.014 2098.014 2098.014
Apr 2002 2107.922 2107.922 2107.922
May 2002 2150.528 2150.528 2150.528
Jun 2002 2115.242 2115.242 2115.242
Jul 2002 2236.585 2236.585 2236.585
Aug 2002 2297.394 2297.394 2297.394
Sep 2002 1985.972 1985.972 1985.972
Oct 2002 2120.294 2120.294 2120.294
Nov 2002 2121.518 2121.518 2121.518
Dec 2002 2169.054 2169.054 2169.054
Jan 2003 1936.523 1936.523 1936.523
Feb 2003 1903.798 1903.798 1903.798
Mar 2003 2217.701 2217.701 2217.701
Apr 2003 2229.067 2229.067 2229.067
May 2003 2273.131 2273.131 2273.131
Jun 2003 2239.303 2239.303 2239.303
Jul 2003 2362.104 2362.104 2362.104
Aug 2003 2424.371 2424.371 2424.371
Sep 2003 2114.406 2114.406 2114.406
Oct 2003 2250.186 2250.186 2250.186
Nov 2003 2252.868 2252.868 2252.868
Dec 2003 2301.862 2301.862 2301.862
Jan 2004 2070.789 2070.789 2070.789
Feb 2004 2039.522 2039.522 2039.522
Mar 2004 2354.883 2354.883 2354.883
checkresiduals(train.lm.trend.season)
Breusch-Godfrey test for serial correlation of order up to 24
data: Residuals from Linear regression model
LM test = 57.142, df = 24, p-value = 0.0001599
Since the residuals are serially correlated, the forecasts can be improved by fitting a model (here, an ARIMA model) to the residuals.
# Fit ARIMA Model to training residuals
train.res.arima <- auto.arima(train.lm.trend.season$residuals)
summary(train.res.arima)
Series: train.lm.trend.season$residuals
ARIMA(1,0,1) with zero mean
Coefficients:
ar1 ma1
0.7416 -0.2286
s.e. 0.0972 0.1448
sigma^2 estimated as 2822: log likelihood=-662.39
AIC=1330.77 AICc=1330.97 BIC=1339.21
Training set error measures:
ME RMSE MAE MPE MAPE
Training set -0.01267266 52.68668 40.29069 131.2931 202.8981
MASE ACF1
Training set 0.5787867 0.005531221
train.res.arima.pred <- forecast(train.res.arima, h=n.test, level=0)
train.res.arima.pred$mean
Jan Feb Mar Apr May
2001 3.090696e+00 2.292185e+00
2002 2.097959e-01 1.555931e-01 1.153941e-01 8.558095e-02 6.347029e-02
2003 5.809218e-03 4.308351e-03 3.195248e-03 2.369725e-03 1.757484e-03
2004 1.608565e-04 1.192977e-04 8.847596e-05
Jun Jul Aug Sep Oct
2001 1.699977e+00 1.260772e+00 9.350394e-01 6.934630e-01 5.143002e-01
2002 4.707214e-02 3.491060e-02 2.589111e-02 1.920190e-02 1.424090e-02
2003 1.303421e-03 9.666697e-04 7.169213e-04 5.316978e-04 3.943286e-04
2004
Nov Dec
2001 3.814258e-01 2.828808e-01
2002 1.056163e-02 7.832931e-03
2003 2.924501e-04 2.168928e-04
2004
checkresiduals(train.res.arima$residuals, lag.max = 12) # Uncorrelated.
# Final Prediction
res.arima <- as.data.frame(train.res.arima.pred)[,1]
fore <- as.data.frame(train.lm.trend.season.pred)[,1]
Improved_forecast <- apply(cbind(res.arima, fore), 1, sum) # Add the residual forecasts to the regression forecasts
Improved_forecast
[1] 2007.362 2047.712 2010.375 2129.821 2188.846 1875.725 2008.410
[8] 2008.043 2054.023 1819.961 1785.724 2098.129 2108.008 2150.592
[15] 2115.289 2236.620 2297.420 1985.991 2120.308 2121.528 2169.061
[22] 1936.529 1903.802 2217.704 2229.070 2273.133 2239.304 2362.105
[29] 2424.371 2114.407 2250.187 2252.868 2301.862 2070.790 2039.522
[36] 2354.883
Improved_forecast <- ts(Improved_forecast, start=c(2001,4), end=c(2004,3), freq=12) # Convert as time-series
# Accuracy
acc.Regression <- accuracy(Improved_forecast, test.ts)
acc.Regression
ME RMSE MAE MPE MAPE ACF1
Test set -126.4976 153.2637 131.651 -6.447937 6.695432 0.7054902
Theil's U
Test set 0.8962032
# Prediction Plot
ggplot(tail(Amtrak.data, n.test), aes(x = Month)) +
geom_line(aes(y = Ridership, colour = "Observation", linetype = "Observation")) +
geom_line(aes(y = Improved_forecast, colour = "Prediction", linetype = "Prediction")) +
scale_color_manual(name = "", values = c("Observation" = "black", "Prediction" = "blue")) +
scale_linetype_manual("", values = c("Observation"= 1, "Prediction" = 2)) +
theme_bw()
Estimation and forecasting can be carried out with the functions in the dlm package.
# Fit Dynamic Linear Model (DLM)
DLM.model <- function(p){
mod <- dlmModPoly(2) + # local linear trend
dlmModSeas(12)
V(mod) <- exp(p[1])
diag(W(mod))[1:3] <- exp(p[2:4])
return(mod)
}
mle1 <- dlmMLE(train.ts, parm = c(0.1,0.1,1,1), build = DLM.model) # Estimate the parameters (error variances) by MLE
ifelse(mle1$convergence==0, print("converge"), print("did not converge") ) # Check Convergence
[1] "converge"
[1] "converge"
DLM.model.fit <- DLM.model(mle1$par) # Fitting the DLM
V(DLM.model.fit)
[,1]
[1,] 1413.91
W(DLM.model.fit)
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,] 1143.988 0.0000000 0.00000 0 0 0 0 0 0 0
[2,] 0.000 0.2040261 0.00000 0 0 0 0 0 0 0
[3,] 0.000 0.0000000 14.47867 0 0 0 0 0 0 0
[4,] 0.000 0.0000000 0.00000 0 0 0 0 0 0 0
[5,] 0.000 0.0000000 0.00000 0 0 0 0 0 0 0
[6,] 0.000 0.0000000 0.00000 0 0 0 0 0 0 0
[7,] 0.000 0.0000000 0.00000 0 0 0 0 0 0 0
[8,] 0.000 0.0000000 0.00000 0 0 0 0 0 0 0
[9,] 0.000 0.0000000 0.00000 0 0 0 0 0 0 0
[10,] 0.000 0.0000000 0.00000 0 0 0 0 0 0 0
[11,] 0.000 0.0000000 0.00000 0 0 0 0 0 0 0
[12,] 0.000 0.0000000 0.00000 0 0 0 0 0 0 0
[13,] 0.000 0.0000000 0.00000 0 0 0 0 0 0 0
[,11] [,12] [,13]
[1,] 0 0 0
[2,] 0 0 0
[3,] 0 0 0
[4,] 0 0 0
[5,] 0 0 0
[6,] 0 0 0
[7,] 0 0 0
[8,] 0 0 0
[9,] 0 0 0
[10,] 0 0 0
[11,] 0 0 0
[12,] 0 0 0
[13,] 0 0 0
# Estimation for Kalman filtering
filtering <- dlmFilter(train.ts, DLM.model.fit)
# Plot estimation for filtering model
plot(dropFirst(filtering$f), col = "blue", lwd = 2, lty = 2, ylab = "Ridership")
lines(train.ts ,lty = 1, lwd = 2, col = "black")
legend("bottomleft", legend = c("Data", "Fitted Data"), col = c("black", "blue"), lty = 1:2, lwd = 2)
# Residual(Check independence and normality)
par(mfrow=c(1,1))
plot(residuals(filtering, sd = FALSE), ylab="Residual")
abline(h=0)
tsdiag(filtering, main = "Diagnostics for DLM")
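Smoothed state estimates complement the filtered ones; a short optional sketch, where the fitted series corresponds to the level plus the first seasonal state (state columns 1 and 3 of this model):
smoothing    <- dlmSmooth(train.ts, DLM.model.fit)
smoothed.fit <- dropFirst(smoothing$s[, 1] + smoothing$s[, 3]) # level + seasonal
plot(train.ts, lwd = 2, ylab = "Ridership")
lines(smoothed.fit, col = "red", lty = 2, lwd = 2)
legend("bottomleft", legend = c("Data", "Smoothed"), col = c("black", "red"), lty = 1:2, lwd = 2)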
# Forecast
forecast_DLM <- dlmForecast(filtering, nAhead = n.test) # Forecast(filtering model)
forecast_DLM$f
Jan Feb Mar Apr May Jun Jul
2001 1996.881 2024.378 1992.103 2100.250
2002 1743.912 1710.032 2013.061 2016.583 2044.080 2011.805 2119.952
2003 1763.614 1729.734 2032.762 2036.285 2063.782 2031.507 2139.654
2004 1783.316 1749.436 2052.464
Aug Sep Oct Nov Dec
2001 2148.355 1830.904 1959.455 1953.408 1993.073
2002 2168.057 1850.606 1979.157 1973.110 2012.775
2003 2187.759 1870.308 1998.859 1992.812 2032.477
2004
# Accuracy
acc.DLM <- accuracy(forecast_DLM$f, test.ts)
acc.DLM
ME RMSE MAE MPE MAPE ACF1
Test set 18.02393 67.51251 55.10431 0.8020044 2.784397 0.5948375
Theil's U
Test set 0.3893913
# Prediction Plot
ggplot(tail(Amtrak.data, n.test), aes(x = Month)) +
geom_line(aes(y = Ridership, colour = "Observation", linetype = "Observation")) +
geom_line(aes(y = forecast_DLM$f, colour = "Prediction", linetype = "Prediction")) +
scale_color_manual(name = "", values = c("Observation" = "black", "Prediction" = "blue")) +
scale_linetype_manual("", values = c("Observation"= 1, "Prediction" = 2)) +
theme_bw()
set.seed(100)
bagged <- train.ts %>%
baggedModel(bld.mbb.bootstrap(train.ts, 100), fn = auto.arima) # Generate 100 bootstrap sample sets and fit auto.arima to each
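To see what the moving-block-bootstrap replicates look like before bagging, a quick optional sketch following the fpp2 bagging example (baggedModel() above already generates its 100 replicates internally):
boot.series <- bld.mbb.bootstrap(train.ts, 5) %>%
  as.data.frame() %>%
  ts(start = c(1991, 1), frequency = 12)
autoplot(train.ts) +
  autolayer(boot.series, colour = TRUE) +
  autolayer(train.ts, colour = FALSE) +
  labs(y = "Ridership") +
  guides(colour = "none") +
  theme_bw()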
# Forecast
forecast_bagged <- bagged %>%
forecast(h = n.test) # auto.arima proved more accurate than ets here. Forecast each fitted ARIMA model and average the forecasts.
forecast_bagged$mean
Jan Feb Mar Apr May Jun Jul
2001 2007.388 2043.424 2002.164 2114.229
2002 1782.007 1738.522 2047.303 2048.708 2083.499 2041.520 2153.640
2003 1823.123 1779.574 2090.513 2092.330 2127.087 2085.660 2198.148
2004 1868.835 1825.044 2135.085
Aug Sep Oct Nov Dec
2001 2162.860 1847.796 1980.844 1973.089 2012.932
2002 2203.439 1888.618 2021.920 2014.610 2055.572
2003 2248.026 1933.322 2065.527 2059.721 2100.603
2004
# Accuracy
acc.Bagging <- accuracy(forecast_bagged$mean, test.ts)
acc.Bagging
ME RMSE MAE MPE MAPE ACF1
Test set -25.03095 64.975 45.95837 -1.365154 2.373141 0.507273
Theil's U
Test set 0.3768781
# Prediction Plot
ggplot(tail(Amtrak.data, n.test), aes(x = Month)) +
geom_line(aes(y = Ridership, colour = "Observation", linetype = "Observation")) +
geom_line(aes(y = forecast_bagged$mean, colour = "Prediction", linetype = "Prediction")) +
scale_color_manual(name = "", values = c("Observation" = "black", "Prediction" = "blue")) +
scale_linetype_manual("", values = c("Observation"= 1, "Prediction" = 2)) +
theme_bw()
When the seasonal period is long, Dynamic Harmonic Regression (DHR) is preferred over seasonal ARIMA. It is a regression model whose seasonality is captured by Fourier terms.
DHR.fit <- auto.arima(train.ts,
xreg = as.matrix(Train.X), # Include Fourier Terms
seasonal = FALSE)
DHR.fit
Series: train.ts
Regression with ARIMA(2,1,2) errors
Coefficients:
ar1 ar2 ma1 ma2 Month S1-12 C1-12
-1.1998 -0.4967 0.2032 -0.4963 43.3345 109.4259 -150.3196
s.e. 0.1349 0.1294 0.1091 0.1353 9.2838 35.5583 12.9651
S2-12 C2-12
40.0290 -18.4092
s.e. 18.2969 12.3742
sigma^2 estimated as 9891: log likelihood=-730.38
AIC=1480.76 AICc=1482.74 BIC=1508.8
checkresiduals(DHR.fit)
Ljung-Box test
data: Residuals from Regression with ARIMA(2,1,2) errors
Q* = 95.067, df = 15, p-value = 1.115e-13
Model df: 9. Total lags used: 24
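The Ljung-Box test indicates that the residuals are still autocorrelated. One common refinement, sketched below, is to choose the number of Fourier pairs K by comparing AICc across several fits (using the same train.ts as above):
for (k in 1:6) {
  fit.k <- auto.arima(train.ts, xreg = fourier(train.ts, K = k), seasonal = FALSE)
  cat("K =", k, "AICc =", fit.k$aicc, "\n")
}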
# Forecast
DHR.forecast <- forecast(DHR.fit, xreg = as.matrix(Test.X))
DHR.forecast$mean
Jan Feb Mar Apr May Jun Jul
2001 1957.800 1980.577 2090.017 2020.626
2002 1651.827 1801.555 1912.521 1972.079 2010.833 2046.623 2057.661
2003 1647.319 1804.050 1911.767 1971.744 2011.609 2045.859 2058.193
2004 1647.258 1804.075 1911.768
Aug Sep Oct Nov Dec
2001 2048.011 1943.821 1891.313 1907.054 1999.249
2002 2025.131 1952.878 1891.812 1901.957 2005.117
2003 2024.872 1952.924 1891.885 1901.846 2005.213
2004
# Accuracy
acc.DHR <- accuracy(DHR.forecast$mean, test.ts)
acc.DHR
ME RMSE MAE MPE MAPE ACF1
Test set 58.24237 118.5707 97.03389 2.697112 4.885403 0.02801658
Theil's U
Test set 0.6815818
# Prediction Plot
ggplot(tail(Amtrak.data, n.test), aes(x = Month)) +
geom_line(aes(y = Ridership, colour = "Observation", linetype = "Observation")) +
geom_line(aes(y = DHR.forecast$mean, colour = "Prediction", linetype = "Prediction")) +
scale_color_manual(name = "", values = c("Observation" = "black", "Prediction" = "blue")) +
scale_linetype_manual("", values = c("Observation"= 1, "Prediction" = 2)) +
theme_bw()
Using STL (Seasonal and Trend decomposition using Loess), the series is decomposed into a seasonal component and a seasonally adjusted component (trend + remainder); each component is forecast separately and the two forecasts are added to give the final prediction.
# Ref. https://github.com/robjhyndman/forecast/blob/master/R/mstl.R
STLM.fit <- train.ts %>%
  stlm(method = "arima", xreg = Train.X) # Decompose the series and fit a DHR model (Fourier terms as regressors) to the seasonally adjusted component
STLM.fit$model
Series: x
Regression with ARIMA(0,1,1) errors
Coefficients:
ma1 Month S1-12 C1-12 S2-12 C2-12
-0.4497 0.3528 6.5749 0.1468 -0.8428 -0.1513
s.e. 0.0958 2.2538 11.8652 8.6959 7.0494 6.1778
sigma^2 estimated as 2881: log likelihood=-656.06
AIC=1326.11 AICc=1327.1 BIC=1345.74
# Forecast
STLM.forecast <- forecast(STLM.fit, h = n.test, # Forecast the trend + remainder part; the seasonal part is forecast with the seasonal naive method (last observed value of the same season) and added back
                          newxreg = Test.X)
STLM.forecast$mean
Jan Feb Mar Apr May Jun Jul
2001 1999.378 2030.771 2016.893 2115.862
2002 1722.372 1688.243 1990.391 1999.378 2030.771 2016.893 2115.862
2003 1722.372 1688.243 1990.391 1999.378 2030.771 2016.893 2115.862
2004 1722.372 1688.243 1990.391
Aug Sep Oct Nov Dec
2001 2149.749 1831.360 1966.957 1959.655 1990.394
2002 2149.749 1831.360 1966.957 1959.655 1990.394
2003 2149.749 1831.360 1966.957 1959.655 1990.394
2004
# Accuracy
acc.STLM <- accuracy(STLM.forecast$mean, test.ts)
acc.STLM
ME RMSE MAE MPE MAPE ACF1
Test set 38.04171 83.77231 70.30234 1.821433 3.555029 0.694755
Theil's U
Test set 0.4885911
# Prediction Plot
ggplot(tail(Amtrak.data, n.test), aes(x = Month)) +
geom_line(aes(y = Ridership, colour = "Observation", linetype = "Observation")) +
geom_line(aes(y = STLM.forecast$mean, colour = "Prediction", linetype = "Prediction")) +
scale_color_manual(name = "", values = c("Observation" = "black", "Prediction" = "blue")) +
scale_linetype_manual("", values = c("Observation"= 1, "Prediction" = 2)) +
theme_bw()
This approach applies Bayesian methods to a Structural Time Series (STS) model and is available through the bsts package.
Train.Data <- data.frame("y"= train.ts, Train.X[,1]) # Excluding the Fourier terms (Month only)
ss <- list()
# Local Linear Trend
ss <- bsts::AddLocalLinearTrend(ss, train.ts)
# Seasonal
ss <- bsts::AddSeasonal(ss, train.ts, nseasons = 12, season.duration = 1) # cycle = season.duration(how many time points each season) * nseasons
BSTS.fit <- bsts(y ~., state.specification = ss, data = Train.Data, niter = 1000, seed=100) # niter: number of MCMC iterations
=-=-=-=-= Iteration 0 Thu Nov 18 00:58:30 2021
=-=-=-=-=
=-=-=-=-= Iteration 100 Thu Nov 18 00:58:31 2021
=-=-=-=-=
=-=-=-=-= Iteration 200 Thu Nov 18 00:58:31 2021
=-=-=-=-=
=-=-=-=-= Iteration 300 Thu Nov 18 00:58:31 2021
=-=-=-=-=
=-=-=-=-= Iteration 400 Thu Nov 18 00:58:32 2021
=-=-=-=-=
=-=-=-=-= Iteration 500 Thu Nov 18 00:58:32 2021
=-=-=-=-=
=-=-=-=-= Iteration 600 Thu Nov 18 00:58:32 2021
=-=-=-=-=
=-=-=-=-= Iteration 700 Thu Nov 18 00:58:33 2021
=-=-=-=-=
=-=-=-=-= Iteration 800 Thu Nov 18 00:58:33 2021
=-=-=-=-=
=-=-=-=-= Iteration 900 Thu Nov 18 00:58:33 2021
=-=-=-=-=
summary(BSTS.fit)
$residual.sd
[1] 38.38387
$prediction.sd
[1] 86.37815
$rsquare
[1] 0.9424972
$relative.gof
[1] 0.7358398
$size
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00000 0.00000 0.00000 0.07092 0.00000 1.00000
$coefficients
mean sd mean.inc sd.inc inc.prob
Train.X...1. 0.1077952 0.539501 1.519912 1.414357 0.07092199
(Intercept) 0.0000000 0.000000 0.000000 0.000000 0.00000000
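The fitted state components (trend and seasonal) can optionally be inspected, using the same 10% burn-in that is applied before prediction below:
plot(BSTS.fit, "components", burn = SuggestBurn(0.1, BSTS.fit))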
# Forecast
burn <- SuggestBurn(0.1, BSTS.fit)
BSTS.forecast <- predict(BSTS.fit, horizon = n.test, # horizon: number of periods to predict
burn = burn, newdata = Test.X[,1], # newdata: predictors for the forecast period
quantiles = c(0.025, 0.975))
BSTS.forecast$mean
[1] 2008.649 2033.859 2005.471 2115.656 2159.608 1849.589 1985.606
[8] 1980.895 2025.518 1771.432 1738.900 2042.580 2046.658 2077.231
[15] 2049.163 2155.931 2204.486 1889.507 2028.428 2029.719 2066.013
[22] 1816.823 1787.502 2091.551 2097.854 2122.777 2100.560 2208.762
[29] 2256.867 1945.645 2080.120 2076.238 2118.645 1870.599 1843.099
[36] 2145.638
# Accuracy
acc.BSTS <- accuracy(BSTS.forecast$mean, test.ts)
acc.BSTS
ME RMSE MAE MPE MAPE ACF1
Test set -29.77804 66.58763 45.96603 -1.599523 2.379736 0.5265906
Theil's U
Test set 0.3875899
# Prediction Plot
ggplot(tail(Amtrak.data, n.test), aes(x = Month)) +
geom_line(aes(y = Ridership, colour = "Observation", linetype = "Observation")) +
geom_line(aes(y = BSTS.forecast$mean, colour = "Prediction", linetype = "Prediction")) +
scale_color_manual(name = "", values = c("Observation" = "black", "Prediction" = "blue")) +
scale_linetype_manual("", values = c("Observation"= 1, "Prediction" = 2)) +
theme_bw()
TBATS is useful for analyzing time series with complex seasonality. It is a modified state-space model based on exponential smoothing, proposed by De Livera et al. (2011), that overcomes problems with widely varying seasonal patterns and handles correlated errors.
TBATS.fit <- train.ts %>%
  tbats(use.box.cox = FALSE,
        use.trend = TRUE,
        use.damped.trend = TRUE,
        use.parallel = TRUE,
        num.cores = detectCores()) # num.cores expects an integer number of cores, not a cluster object
summary(TBATS.fit)
Length Class Mode
lambda 0 -none- NULL
alpha 1 -none- numeric
beta 1 -none- numeric
damping.parameter 1 -none- numeric
gamma.one.values 1 -none- numeric
gamma.two.values 1 -none- numeric
ar.coefficients 0 -none- NULL
ma.coefficients 0 -none- NULL
likelihood 1 -none- numeric
optim.return.code 1 -none- numeric
variance 1 -none- numeric
AIC 1 -none- numeric
parameters 2 -none- list
seed.states 12 -none- numeric
fitted.values 123 ts numeric
errors 123 ts numeric
x 1476 -none- numeric
seasonal.periods 1 -none- numeric
k.vector 1 -none- numeric
y 123 ts numeric
p 1 -none- numeric
q 1 -none- numeric
call 7 -none- call
series 1 -none- character
method 1 -none- character
# Forecast
TBATS.forecast <- forecast(TBATS.fit, h=n.test)
TBATS.forecast$mean
Jan Feb Mar Apr May Jun Jul
2001 1978.995 2028.484 1959.201 2089.790
2002 1721.923 1658.583 1979.689 1966.634 2018.589 1951.281 2083.449
2003 1720.255 1657.248 1978.620 1965.779 2017.904 1950.732 2083.010
2004 1720.140 1657.155 1978.546
Aug Sep Oct Nov Dec
2001 2116.995 1814.845 1914.140 1924.798 1938.481
2002 2111.919 1810.782 1910.888 1922.195 1936.397
2003 2111.568 1810.501 1910.663 1922.015 1936.252
2004
# Accuracy
acc.TBATS <- accuracy(TBATS.forecast$mean, test.ts)
acc.TBATS
ME RMSE MAE MPE MAPE ACF1
Test set 69.36452 103.114 88.31896 3.379683 4.42057 0.556916
Theil's U
Test set 0.5999032
# Prediction Plot
ggplot(tail(Amtrak.data, n.test), aes(x = Month)) +
geom_line(aes(y = Ridership, colour = "Observation", linetype = "Observation")) +
geom_line(aes(y = TBATS.forecast$mean, colour = "Prediction", linetype = "Prediction")) +
scale_color_manual(name = "", values = c("Observation" = "black", "Prediction" = "blue")) +
scale_linetype_manual("", values = c("Observation"= 1, "Prediction" = 2)) +
theme_bw()
# 1. Line Plot
Pred.Data <- data.frame("Date" = tail(Amtrak.data$Month, n.test),
"Observation" = test.ts,
"Regression" = Improved_forecast,
"DLM" = c(forecast_DLM$f),
"Bootstrap" = forecast_bagged$mean,
"DHR" = DHR.forecast$mean,
"STLM" = STLM.forecast$mean,
"BSTS" = BSTS.forecast$mean,
"TBATS" = TBATS.forecast$mean)
Pred.Models <- reshape2::melt(Pred.Data, id.vars="Date",
variable.name="Type",
value.name='y')
models <- unique(Pred.Models$Type)
lcols <- c("#000000", "#66CC99", "#ae5d8b", "#c1c084", "#d38b72", "#dc143c", "#00498c", "#9999CC")
ggplot(Pred.Models, aes(x=Date, y=y, group=Type)) +
geom_line(aes(color=Type), size=1, linetype="solid") +
labs(x="Date", y="Ridership") + # y="log(Charging demand)"
scale_color_manual(values = lcols, name = 'Models', labels = models) +
#scale_x_datetime(date_breaks= "24 hour", date_labels = paste("%Y- %m-%d %H", "h", sep="")) + # %Y- %m-%d %H:%M:%S"
theme_bw() +
theme(axis.text.y=element_text(size=16),
axis.title=element_text(size=20),
axis.text.x=element_text(angle=30, hjust=1, size=12),
legend.title = element_text(size=18),
legend.text = element_text(size=16))
# 2. Scatter Plot
Pred.Models2 <- reshape2::melt(Pred.Data[,-1], id.vars="Observation",
variable.name="Type",
value.name='Pred')
ggplot(Pred.Models2, aes(x=Observation, y=Pred)) +
geom_point(color="#d38b72") +
geom_abline(intercept = 0, slope = 1, size=1, colour="#9999CC", linetype="dashed") +
facet_wrap( ~ Type, ncol=2) +
labs(x="Observation", y="Prediction", color = "") +
theme_bw() +
theme(axis.title=element_text(size=20),
axis.text=element_text(size=15),
strip.text.x = element_text(size=18, face="bold"))
# Ref. https://petolau.github.io/Regression-trees-for-forecasting-time-series-in-R/
decomp.ts <- stl(train.ts, s.window = "periodic", robust = TRUE)$time.series
# decomp.ts <- mstl(Power.msts, s.window = "periodic", robust = TRUE) # For multiple seasonal periods
# Target without Trend
Target <- decomp.ts %>%
  data.frame %>%
  rowwise() %>% # Operate row by row
  dplyr::mutate(y = sum(seasonal, remainder)) %>% # Target = Seasonal + Remainder => detrended series
  dplyr::select(y)
Train.Data <- cbind(Target, Train.X)
trend.part <- data.frame(decomp.ts)$trend %>% # Only Trend of Training Data
ts()
# Fitting ARIMA for Trend
trend.fit.arima <- auto.arima(trend.part)
# Forecast
trend.arima.pred <- forecast(trend.fit.arima, n.test)
trend.arima.pred$mean
Time Series:
Start = 124
End = 159
Frequency = 1
[1] 1963.065 1966.790 1970.428 1974.067 1977.679 1981.291 1984.895
[8] 1988.498 1992.099 1995.700 1999.301 2002.901 2006.501 2010.101
[15] 2013.701 2017.301 2020.901 2024.501 2028.101 2031.701 2035.301
[22] 2038.901 2042.501 2046.101 2049.701 2053.301 2056.901 2060.501
[29] 2064.101 2067.701 2071.301 2074.901 2078.501 2082.101 2085.701
[36] 2089.301
Lagged values of the series can be used as inputs to a neural network; this is called a neural network autoregression, or NNAR model.
set.seed(100)
neural_model <- nnetar(ts(Target, start=c(1991,1), end=c(2001,3), freq = 12), # Or train.ts
xreg = Train.X,
repeats = 200,
lambda = "auto")
neural_model
Series: ts(Target, start = c(1991, 1), end = c(2001, 3), freq = 12)
Model: NNAR(1,1,4)[12]
Call: nnetar(y = ts(Target, start = c(1991, 1), end = c(2001, 3), freq = 12),
repeats = 200, xreg = Train.X, lambda = "auto")
Average of 200 networks, each of which is
a 7-4-1 network with 37 weights
options were - linear output units
sigma^2 estimated as 2163
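The forecasting step that produces NNAR.forecast (used below) is not shown in this listing. A minimal sketch, assuming the detrended NNAR forecast is re-trended with the ARIMA trend forecast, as is done for the other detrended models:
NNAR.forecast <- as.numeric(forecast(neural_model, xreg = Test.X, PI = FALSE)$mean) +
  as.numeric(trend.arima.pred$mean) # detrended forecast + trend forecast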
# Accuracy
acc.NNAR <- accuracy(NNAR.forecast, test.ts)
acc.NNAR
ME RMSE MAE MPE MAPE ACF1
Test set -30.87089 74.79254 47.02293 -1.6211 2.447256 0.5835289
Theil's U
Test set 0.4346942
# Prediction Plot
ggplot(tail(Amtrak.data, n.test), aes(x = Month)) +
geom_line(aes(y = Ridership, colour = "Observation", linetype = "Observation")) +
geom_line(aes(y = NNAR.forecast, colour = "Prediction", linetype = "Prediction")) +
scale_color_manual(name = "", values = c("Observation" = "black", "Prediction" = "blue")) +
scale_linetype_manual("", values = c("Observation"= 1, "Prediction" = 2)) +
theme_bw()
Random forest has a single tuning parameter, mtry (the number of candidate predictors randomly selected at each split).
set.seed(100)
fitControl <- trainControl(method = "adaptive_cv", # cv, repeatedcv
number = 5,
repeats = 5,
adaptive = list(min = 5,
alpha = 0.05,
method = "BT",
complete = TRUE),
search = "random",
allowParallel = TRUE)
RF <- function(train, tuneLength, ntree = 500, nodesize = 5){
  set.seed(100) # Fix the seed for cross-validation
  caret.rf <- caret::train(y~., data = train,
                           method = "parRF", # Tuning parameter: mtry
                           trControl = fitControl,
                           tuneLength = tuneLength,
                           ntree = ntree,
                           nodesize = nodesize, # nodesize: minimum size of terminal nodes
                           importance = TRUE)
  return(caret.rf)
}
RF.Caret <- RF(Train.Data,
               tuneLength = 2, # tuneLength: number of candidate parameter values to search
               ntree = 100) # ntree: number of trees to grow
RF.Caret
Parallel Random Forest
123 samples
5 predictor
No pre-processing
Resampling: Adaptively Cross-Validated (5 fold, repeated 5 times)
Summary of sample sizes: 99, 99, 97, 99, 98, 99, ...
Resampling results across tuning parameters:
mtry RMSE Rsquared MAE Resamples
1 52.87737 0.8892022 40.21114 25
3 50.21355 0.8949560 38.53876 25
RMSE was used to select the optimal model using the smallest value.
The final value used for the model was mtry = 3.
RF.Caret$finalModel
Call:
randomForest(x = "x", y = "y", ntree = 13, mtry = 3L, nodesize = 5, importance = TRUE)
Type of random forest: regression
Number of trees: 104
No. of variables tried at each split: 3
RF.Caret$finalModel$tuneValue
mtry
2 3
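Because importance = TRUE was passed to the forest, variable importance can optionally be inspected (a sketch, assuming caret exposes importance for the parRF method):
plot(varImp(RF.Caret))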
# Final Prediction (Detrended + Trend)
Pred.RF <- predict(RF.Caret, Test.X) + trend.arima.pred$mean
# Accuracy
acc_RF <- accuracy(c(Pred.RF), test.ts)
acc_RF
ME RMSE MAE MPE MAPE ACF1
Test set -29.23913 71.90871 46.82079 -1.546831 2.423727 0.6008866
Theil's U
Test set 0.4190109
# Prediction Plot
ggplot(tail(Amtrak.data, n.test), aes(x = Month)) +
geom_line(aes(y = Ridership, colour = "Observation", linetype = "Observation")) +
geom_line(aes(y = Pred.RF, colour = "Prediction", linetype = "Prediction")) +
scale_color_manual(name = "", values = c("Observation" = "black", "Prediction" = "blue")) +
scale_linetype_manual("", values = c("Observation"= 1, "Prediction" = 2)) +
theme_bw()
nrounds : number of boosting iterations
max_depth : maximum tree depth
eta : learning rate
gamma : minimum loss reduction required to make a split; the larger it is, the harder it is to split
colsample_bytree : fraction of predictors sampled when building each tree
min_child_weight : minimum sum of observation weights required in a leaf node
subsample : fraction of the data used to build each tree (1 = use all of the data)
set.seed(100)
fitControl <- trainControl(method = "adaptive_cv", # cv, repeatedcv
number = 5,
repeats = 5,
adaptive = list(min = 5,
alpha = 0.05,
method = "BT",
complete = TRUE),
search = "random",
allowParallel = TRUE)
XGBoost <- function(train, tuneLength){
set.seed(100) # Fix the seed for cross-validation
caret.xgb <- caret::train(y~., data = train,
method = "xgbTree",
trControl = fitControl,
# objective = "reg:squarederror", # error(The following parameters were provided multiple times)
tuneLength = tuneLength # tuneLength: number of candidate parameter sets to search
)
return(caret.xgb)
}
XGB.Caret <- XGBoost(Train.Data, 2)
XGB.Caret
eXtreme Gradient Boosting
123 samples
5 predictor
No pre-processing
Resampling: Adaptively Cross-Validated (5 fold, repeated 5 times)
Summary of sample sizes: 99, 99, 97, 99, 98, 99, ...
Resampling results across tuning parameters:
eta max_depth gamma colsample_bytree min_child_weight
0.2228220 9 1.702621 0.6528662 3
0.4876292 3 5.465586 0.5499986 5
subsample nrounds RMSE Rsquared MAE Resamples
0.7517663 624 50.21422 0.8931412 38.60326 25
0.8219133 358 50.06911 0.8961369 38.41744 25
RMSE was used to select the optimal model using the smallest value.
The final values used for the model were nrounds = 358, max_depth
= 3, eta = 0.4876292, gamma = 5.465586, colsample_bytree
= 0.5499986, min_child_weight = 5 and subsample = 0.8219133.
XGB.Caret$finalModel
##### xgb.Booster
raw: 181.9 Kb
call:
xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
gamma = param$gamma, colsample_bytree = param$colsample_bytree,
min_child_weight = param$min_child_weight, subsample = param$subsample),
data = x, nrounds = param$nrounds, objective = "reg:linear")
params (as set within xgb.train):
eta = "0.48762916797353", max_depth = "3", gamma = "5.46558595029637", colsample_bytree = "0.549998590815812", min_child_weight = "5", subsample = "0.82191331172362", objective = "reg:linear", silent = "1"
xgb.attributes:
niter
callbacks:
cb.print.evaluation(period = print_every_n)
# of features: 5
niter: 358
nfeatures : 5
xNames : Month `S1-12` `C1-12` `S2-12` `C2-12`
problemType : Regression
tuneValue :
nrounds max_depth eta gamma colsample_bytree
2 358 3 0.4876292 5.465586 0.5499986
min_child_weight subsample
2 5 0.8219133
obsLevels : NA
param :
list()
XGB.Caret$finalModel$tuneValue
nrounds max_depth eta gamma colsample_bytree
2 358 3 0.4876292 5.465586 0.5499986
min_child_weight subsample
2 5 0.8219133
# Final Prediction (Detrended + Trend)
Pred.XGB <- predict(XGB.Caret, Test.X) + trend.arima.pred$mean
# Accuracy
acc_XGB <- accuracy(c(Pred.XGB), test.ts)
acc_XGB
ME RMSE MAE MPE MAPE ACF1
Test set -27.27012 68.20048 47.95543 -1.475506 2.477208 0.5023489
Theil's U
Test set 0.3950394
# Prediction Plot
ggplot(tail(Amtrak.data, n.test), aes(x = Month)) +
geom_line(aes(y = Ridership, colour = "Observation", linetype = "Observation")) +
geom_line(aes(y = Pred.XGB, colour = "Prediction", linetype = "Prediction")) +
scale_color_manual(name = "", values = c("Observation" = "black", "Prediction" = "blue")) +
scale_linetype_manual("", values = c("Observation"= 1, "Prediction" = 2)) +
theme_bw()
# Ref. https://cran.r-project.org/web/packages/caretEnsemble/vignettes/caretEnsemble-intro.html
# https://github.com/zachmayer/caretEnsemble/blob/master/R/caretStack.R
# 1. Modeling for Stacking (Declare Individual Model)
set.seed(100)
fitControl <- trainControl(method = "repeatedcv", # adaptive_cv
number = 5,
repeats = 5,
# adaptive = list(min = 5,
# alpha = 0.05,
# method = "BT",
# complete = TRUE),
# search = "random", # grid
savePredictions = "final", # Save predictions for the optimal tuning parameters
# classProbs = TRUE, # Save class probabilities (classification only)
index = createResample(Train.Data$y, 1), # index: the rows used for training in each resample / createResample: bootstrap resampling
allowParallel = TRUE)
# Reuse the hyperparameters already tuned on the original training data
alg_tune_list <- list( # Do not use custom names in list. Will give prediction error with greedy ensemble. Bug in caret.
parRF = caretModelSpec(method="parRF",
importance = TRUE,
nodesize = 5,
tuneGrid = expand.grid(mtry=RF.Caret$finalModel$tuneValue$mtry)),
xgbTree = caretModelSpec(method="xgbTree",
tuneGrid = expand.grid(nrounds = XGB.Caret$finalModel$tuneValue$nrounds,
max_depth = XGB.Caret$finalModel$tuneValue$max_depth,
eta = XGB.Caret$finalModel$tuneValue$eta,
gamma = XGB.Caret$finalModel$tuneValue$gamma,
colsample_bytree = XGB.Caret$finalModel$tuneValue$colsample_bytree,
min_child_weight = XGB.Caret$finalModel$tuneValue$min_child_weight,
subsample = XGB.Caret$finalModel$tuneValue$subsample)))
set.seed(100)
multi_mod <- caretList(y~., data = Train.Data, trControl = fitControl,
tuneList = alg_tune_list) # search = "grid"
multi_mod$parRF
Parallel Random Forest
123 samples
5 predictor
No pre-processing
Resampling: Cross-Validated (5 fold, repeated 5 times)
Summary of sample sizes: 123
Resampling results:
RMSE Rsquared MAE
55.73908 0.8368916 42.4932
Tuning parameter 'mtry' was held constant at a value of 3
multi_mod$xgbTree
eXtreme Gradient Boosting
123 samples
5 predictor
No pre-processing
Resampling: Cross-Validated (5 fold, repeated 5 times)
Summary of sample sizes: 123
Resampling results:
RMSE Rsquared MAE
58.30165 0.827723 45.50672
Tuning parameter 'nrounds' was held constant at a value of 358
Tuning parameter 'max_depth' was held constant at a value of 3
Tuning parameter 'eta' was held constant at a value of 0.4876292
Tuning parameter 'gamma' was held constant at a value of 5.465586
Tuning parameter 'colsample_bytree' was held constant at a value of 0.5499986
Tuning parameter 'min_child_weight' was held constant at a value of 5
Tuning parameter 'subsample' was held constant at a value of 0.8219133
# 2. Stacking (the final model is trained on data that combines the individual models' predictions)
set.seed(100)
stackControl <- trainControl(method = "adaptive_cv",
number = 5,
repeats = 5,
adaptive = list(min = 5,
alpha = 0.05,
method = "BT",
complete = TRUE),
search = "random",
allowParallel = TRUE)
set.seed(100)
stack.xgb <- caretStack(multi_mod, method = "xgbTree", # Final Model
trControl = stackControl,
tuneLength = 2) # Number of candidate parameter sets
stack.xgb
A xgbTree ensemble of 2 base models: parRF, xgbTree
Ensemble results:
eXtreme Gradient Boosting
38 samples
2 predictor
No pre-processing
Resampling: Adaptively Cross-Validated (5 fold, repeated 5 times)
Summary of sample sizes: 30, 31, 31, 30, 30, 30, ...
Resampling results across tuning parameters:
eta max_depth gamma colsample_bytree min_child_weight
0.2228220 9 1.702621 0.6528662 3
0.4876292 3 5.465586 0.5499986 5
subsample nrounds RMSE Rsquared MAE Resamples
0.7517663 624 48.62987 0.929678 38.75318 25
0.8219133 358 57.27670 0.924131 43.13093 25
RMSE was used to select the optimal model using the smallest value.
The final values used for the model were nrounds = 624, max_depth
= 9, eta = 0.222822, gamma = 1.702621, colsample_bytree =
0.6528662, min_child_weight = 3 and subsample = 0.7517663.
stack.xgb$ens_model$finalModel
##### xgb.Booster
raw: 357.5 Kb
call:
xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
gamma = param$gamma, colsample_bytree = param$colsample_bytree,
min_child_weight = param$min_child_weight, subsample = param$subsample),
data = x, nrounds = param$nrounds, objective = "reg:linear")
params (as set within xgb.train):
eta = "0.222822001739405", max_depth = "9", gamma = "1.70262051047757", colsample_bytree = "0.652866207249463", min_child_weight = "3", subsample = "0.751766284287442", objective = "reg:linear", silent = "1"
xgb.attributes:
niter
callbacks:
cb.print.evaluation(period = print_every_n)
# of features: 2
niter: 624
nfeatures : 2
xNames : parRF xgbTree
problemType : Regression
tuneValue :
nrounds max_depth eta gamma colsample_bytree
1 624 9 0.222822 1.702621 0.6528662
min_child_weight subsample
1 3 0.7517663
obsLevels : NA
param :
list()
stack.xgb$ens_model$finalModel$tuneValue
nrounds max_depth eta gamma colsample_bytree
1 624 9 0.222822 1.702621 0.6528662
min_child_weight subsample
1 3 0.7517663
# Final Prediction (Detrended + Trend)
stack.Pred.XGB <- predict(stack.xgb, Test.X) + trend.arima.pred$mean
# Accuracy
acc_stack.XGB <- accuracy(c(stack.Pred.XGB), test.ts)
acc_stack.XGB
ME RMSE MAE MPE MAPE ACF1
Test set -29.89315 88.01831 65.54621 -1.617441 3.344385 0.3068341
Theil's U
Test set 0.512367
# Prediction Plot
ggplot(tail(Amtrak.data, n.test), aes(x = Month)) +
geom_line(aes(y = Ridership, colour = "Observation", linetype = "Observation")) +
geom_line(aes(y = stack.Pred.XGB, colour = "Prediction", linetype = "Prediction")) +
scale_color_manual(name = "", values = c("Observation" = "black", "Prediction" = "blue")) +
scale_linetype_manual("", values = c("Observation"= 1, "Prediction" = 2)) +
theme_bw()
# Ref. https://github.com/zachmayer/caretEnsemble/blob/master/R/caretEnsemble.R
set.seed(100)
greedyEnsemble <- caretEnsemble(multi_mod, trControl = trainControl(method = "cv", number=5))
greedyEnsemble
A glm ensemble of 2 base models: parRF, xgbTree
Ensemble results:
Generalized Linear Model
38 samples
2 predictor
No pre-processing
Resampling: Cross-Validated (5 fold)
Summary of sample sizes: 30, 31, 31, 30, 30
Resampling results:
RMSE Rsquared MAE
58.25038 0.8224663 45.03724
greedyEnsemble$ens_model$finalModel
Call: NULL
Coefficients:
(Intercept) parRF xgbTree
-8.46746 0.93423 0.02958
Degrees of Freedom: 37 Total (i.e. Null); 35 Residual
Null Deviance: 703200
Residual Deviance: 114700 AIC: 420.3
summary(greedyEnsemble)
The following models were ensembled: parRF, xgbTree
They were weighted:
-8.4675 0.9342 0.0296
The resulting RMSE is: 58.2504
The fit for each individual model on the RMSE is:
method RMSE RMSESD
parRF 55.73908 NA
xgbTree 58.30165 NA
# Final Prediction (Detrended + Trend)
stack.Pred.GLM <- predict(greedyEnsemble, Test.X) + trend.arima.pred$mean
# Accuracy
acc_stack.GLM <- accuracy(c(stack.Pred.GLM), test.ts)
acc_stack.GLM
ME RMSE MAE MPE MAPE ACF1
Test set -19.94278 66.9187 44.73519 -1.09601 2.30595 0.5677311
Theil's U
Test set 0.388657
# Prediction Plot
ggplot(tail(Amtrak.data, n.test), aes(x = Month)) +
geom_line(aes(y = Ridership, colour = "Observation", linetype = "Observation")) +
geom_line(aes(y = stack.Pred.GLM, colour = "Prediction", linetype = "Prediction")) +
scale_color_manual(name = "", values = c("Observation" = "black", "Prediction" = "blue")) +
scale_linetype_manual("", values = c("Observation"= 1, "Prediction" = 2)) +
theme_bw()
# 1. Line Plot
Pred.Data3 <- data.frame("Date" = tail(Amtrak.data$Month, n.test),
"Observation" = test.ts,
"NNAR" = NNAR.forecast,
"RF" = c(Pred.RF),
"XGBoost" = c(Pred.XGB),
"Stack.XGBoost" = c(stack.Pred.XGB),
"Stack.GLM" = c(stack.Pred.GLM))
Pred.Models3 <- reshape2::melt(Pred.Data3, id.vars="Date",
variable.name="Type",
value.name='y')
models2 <- unique(Pred.Models3$Type)
lcols <- c("#000000", "#66CC99", "#ae5d8b", "#c1c084", "#d38b72", "#dc143c", "#00498c", "#9999CC")
ggplot(Pred.Models3, aes(x=Date, y=y, group=Type)) +
geom_line(aes(color=Type), size=1, linetype="solid") +
labs(x="Date", y="Ridership") + # y="log(Charging demand)"
scale_color_manual(values = lcols, name = 'Models', labels = models2) +
#scale_x_datetime(date_breaks= "24 hour", date_labels = paste("%Y- %m-%d %H", "h", sep="")) + # %Y- %m-%d %H:%M:%S"
theme_bw() +
theme(axis.text.y=element_text(size=16),
axis.title=element_text(size=20),
axis.text.x=element_text(angle=30, hjust=1, size=12),
legend.title = element_text(size=18),
legend.text = element_text(size=16))
# 2. Scatter Plot
Pred.Models4 <- reshape2::melt(Pred.Data3[,-1], id.vars="Observation",
variable.name="Type",
value.name='Pred')
ggplot(Pred.Models4, aes(x=Observation, y=Pred)) +
geom_point(color="#d38b72") +
geom_abline(intercept = 0, slope = 1, size=1, colour="#9999CC", linetype="dashed") +
facet_wrap( ~ Type, ncol=2) +
labs(x="Observation", y="Prediction", color = "") +
theme_bw() +
theme(axis.title=element_text(size=20),
axis.text=element_text(size=15),
strip.text.x = element_text(size=18, face="bold"))
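Finally, the test-set accuracy measures computed above can be collected into a single table for comparison (an optional sketch using the objects already created):
Acc.Table <- rbind(acc.Regression, acc.DLM, acc.Bagging, acc.DHR, acc.STLM, acc.BSTS,
                   acc.TBATS, acc.NNAR, acc_RF, acc_XGB, acc_stack.XGB, acc_stack.GLM)
rownames(Acc.Table) <- c("Regression", "DLM", "Bagging", "DHR", "STLM", "BSTS",
                         "TBATS", "NNAR", "RF", "XGBoost", "Stack.XGBoost", "Stack.GLM")
round(Acc.Table, 3)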