Bayesian Structural Time Seires

Time Series

Bayesian Structural Time Seires for Time Series Data

Yeongeun Jeon , Jung In Seo
06-30-2021

Introduction

\[ \begin{aligned} Y_{t}&=Z^{T}_{t}\alpha_{t}+\epsilon_{t},~~~~~~~\epsilon_{t}\sim N_{m}(0,H_{t}), \\ \alpha_{t+1}&=T_{t}\alpha_{t}+R_{t}\eta_{t},~~~\eta_{t}\sim N_{q}(0,Q_{t}),\\ \alpha_{1} &\sim N_{p} (\mu_{1}, P_{1}) \end{aligned} \]


BSTS Package


Trend

Local Level Model

\[ \begin{aligned} Y_{t} &= \mu_{t} + \epsilon_{t},~~~~\epsilon_{t}\sim N(0, \sigma^2_{\epsilon})\\ \mu_{t+1} &= \mu_{t} + \xi_{t}, ~~~\xi_{t}\sim N(0,\sigma^2_{\xi }), \end{aligned} \]

ss <- list()
ss <- bsts::AddLocalLevel(ss, y)   # y : The time series to be modeled

Local Linear Trend Model

\[ \begin{aligned} Y_{t} &= \mu_{t} + \epsilon_{t},~~~~~~~~~~\epsilon_{t}\sim N(0, \sigma^2_{\epsilon})\\ \mu_{t+1} &= \mu_{t} + \delta_{t} + \xi_{t}, ~~~\xi_{t}\sim N(0,\sigma^2_{\xi }),\\ \delta_{t+1} &= \delta_{t} + \zeta_{t}, ~~~~~~~~~~~~\zeta_{t}\sim N(0,\sigma^2_{\zeta}), \end{aligned} \]

ss <- list()
ss <- bsts::AddLocalLinearTrend(ss, y)   # y : The time series to be modeled

Seasonality

Regression with Seasonal Dummy Variables

\[ \begin{aligned} Y_{t} &= \tau_{t} + \epsilon_{t},~~~~~~~~~~\epsilon_{t}\sim N(0, \sigma^2_{\epsilon})\\ \tau_{t+1} &= -\sum_{s=1}^{S-1} \tau_{t+1-s} + \omega_{t}, ~~~\omega_{t}\sim N(0,\sigma^2_{\omega}), \end{aligned} \]

ss <- list()
ss <- bsts::AddSeasonal(ss, y,           # y : The time series to be modeled
                        nseasons,        # season.duration의 반복 수
                        season.duration) # 각 시즌에서 관측수# cycle (s) = season.duration * nseasons

Application


Data 불러오기

pacman::p_load("dplyr", "bsts", "forecast", "ggplot2")

# In Mac
# guess_encoding("Amtrak.csv")
# Amtrak.data <- read.csv("Amtrak.csv", fileEncoding="EUC-KR")

Amtrak.data  <- read.csv("C:/Users/User/Desktop/Amtrak.csv")
ridership.ts <- ts(Amtrak.data$Ridership, start=c(1991,1), end=c(2004,3), freq=12)

Data 분할

train.ts  <- window(ridership.ts,start=c(1991,1), end=c(2001,3))   # Training Data
test.ts   <- window(ridership.ts,start=c(2001,4))                  # Test Data
n.test    <- length(test.ts)

정규성 확인

par(mfrow=c(2,1))
hist(train.ts, prob=TRUE, 12)
lines(density(train.ts), col="blue")
qqnorm(train.ts)
qqline(train.ts)


모형 적합

bsts(formula, state.specification, family = c("gaussian", "logit", "poisson", "student"), data, niter, seed = NULL, ...)

\[ \begin{aligned} Y_{t} &= \mu_{t} + \tau_{t} + \beta^{T}x_{t} + \epsilon_{t},~~~~\epsilon_{t}\sim N(0, \sigma^2_{\epsilon})\\ \mu_{t+1} &= \mu_{t} + \delta_{t} + \xi_{t}, ~~~~~~~~~\xi_{t}\sim N(0,\sigma^2_{\xi }),\\ \delta_{t+1} &= \delta_{t} + \zeta_{t}, ~~~~~~~~~~~~~~~~~~~~~\zeta_{t}\sim N(0,\sigma^2_{\zeta})\\ \tau_{t+1}&=-\sum_{s=1}^{S-1} \tau_{t+1-s} + \omega_{t}, ~~~~~~~~~~\omega_{t}\sim N(0,\sigma^2_{\omega }) \end{aligned} \] - Basic STS Model은 \(Z^{T}_{t} = (1,0,1,\ldots, 0)\), \(T_{t} = \left[\begin{smallmatrix} 1 & 1 & \\ 0 & 1 & \\ & & -1 & - 1 & \cdots & -1 & -1 \\ & & 1 & 0 & \cdots &0& 0\\ & & 0 & 1 & \cdots & 0 &0 \\ & & \vdots &\vdots &\vdots &\vdots &\vdots &\\ & & 0 & 0 & \cdots & 1 & 0 \end{smallmatrix}\right]\), \(\alpha_{t} = (\mu_{t}, \delta_{t}, \tau_{t}, \ldots, \tau_{t-S+2})^{T}\), \(R_{t}=\left[\begin{smallmatrix}1 & 0 \\ 0 & 1 \\ & & 1 \\ & & 0 \\ & & \vdots \\ & & 0 \\ \end{smallmatrix}\right]\), \(\eta_{t}=(\xi_{t}, \zeta_{t},\omega_{t})^{T}\).


# Local Linear Trend
ss <- list()
ss <- bsts::AddLocalLinearTrend(ss, train.ts)
# Seasonal
ss <- bsts::AddSeasonal(ss, train.ts, nseasons = 12, season.duration = 1) 

BSTS.fit <- bsts(train.ts, state.specification = ss, niter = 1000, seed=100)  # niter : MCMC 반복
=-=-=-=-= Iteration 0 Thu May 19 02:28:39 2022
 =-=-=-=-=
=-=-=-=-= Iteration 100 Thu May 19 02:28:39 2022
 =-=-=-=-=
=-=-=-=-= Iteration 200 Thu May 19 02:28:39 2022
 =-=-=-=-=
=-=-=-=-= Iteration 300 Thu May 19 02:28:39 2022
 =-=-=-=-=
=-=-=-=-= Iteration 400 Thu May 19 02:28:39 2022
 =-=-=-=-=
=-=-=-=-= Iteration 500 Thu May 19 02:28:40 2022
 =-=-=-=-=
=-=-=-=-= Iteration 600 Thu May 19 02:28:40 2022
 =-=-=-=-=
=-=-=-=-= Iteration 700 Thu May 19 02:28:40 2022
 =-=-=-=-=
=-=-=-=-= Iteration 800 Thu May 19 02:28:40 2022
 =-=-=-=-=
=-=-=-=-= Iteration 900 Thu May 19 02:28:40 2022
 =-=-=-=-=
summary(BSTS.fit)
$residual.sd
[1] 38.96105

$prediction.sd
[1] 86.43389

$rsquare
[1] 0.9407548

$relative.gof
[1] 0.7354909

예측

burn          <- SuggestBurn(0.1, BSTS.fit)     # MCMC에서 버릴 갯수릴 갯수
BSTS.forecast <- predict(BSTS.fit, horizon = n.test, burn = burn,  quantiles = c(0.025, 0.975))  # horizon : the number of prediction
BSTS.forecast$mean
 [1] 2008.007 2034.247 2005.145 2114.927 2159.952 1849.643 1985.271
 [8] 1980.410 2025.083 1771.644 1738.847 2042.618 2047.263 2078.716
[15] 2049.774 2156.080 2205.664 1890.307 2028.975 2029.909 2066.173
[22] 1817.965 1788.446 2091.587 2098.778 2123.918 2100.609 2209.087
[29] 2258.041 1946.782 2080.985 2076.564 2118.590 1871.648 1843.989
[36] 2146.205
plot(BSTS.forecast, plot.original = 100)
accuracy(test.ts, BSTS.forecast$mean)
               ME     RMSE      MAE      MPE     MAPE
Test set 30.17445 66.89158 46.26838 1.502453 2.293063

모형 적합 with 예측 변수

## Month 변수 생성생성
xts(ridership.ts, order = as.Date(ridership.ts))
               [,1]
1991-01-01 1708.917
1991-02-01 1620.586
1991-03-01 1972.715
1991-04-01 1811.665
1991-05-01 1974.964
1991-06-01 1862.356
1991-07-01 1939.860
1991-08-01 2013.264
1991-09-01 1595.657
1991-10-01 1724.924
1991-11-01 1675.667
1991-12-01 1813.863
1992-01-01 1614.827
1992-02-01 1557.088
1992-03-01 1891.223
1992-04-01 1955.981
1992-05-01 1884.714
1992-06-01 1623.042
1992-07-01 1903.309
1992-08-01 1996.712
1992-09-01 1703.897
1992-10-01 1810.000
1992-11-01 1861.601
1992-12-01 1875.122
1993-01-01 1705.259
1993-02-01 1618.535
1993-03-01 1836.709
1993-04-01 1957.043
1993-05-01 1917.185
1993-06-01 1882.398
1993-07-01 1933.009
1993-08-01 1996.167
1993-09-01 1672.841
1993-10-01 1752.827
1993-11-01 1720.377
1993-12-01 1734.292
1994-01-01 1563.365
1994-02-01 1573.959
1994-03-01 1902.639
1994-04-01 1833.888
1994-05-01 1831.049
1994-06-01 1775.755
1994-07-01 1867.508
1994-08-01 1906.608
1994-09-01 1685.632
1994-10-01 1778.546
1994-11-01 1775.995
1994-12-01 1783.350
1995-01-01 1548.415
1995-02-01 1496.925
1995-03-01 1798.316
1995-04-01 1732.895
1995-05-01 1772.345
1995-06-01 1761.207
1995-07-01 1791.655
1995-08-01 1874.820
1995-09-01 1571.309
1995-10-01 1646.948
1995-11-01 1672.631
1995-12-01 1656.845
1996-01-01 1381.758
1996-02-01 1360.852
1996-03-01 1558.575
1996-04-01 1608.420
1996-05-01 1696.696
1996-06-01 1693.183
1996-07-01 1835.516
1996-08-01 1942.573
1996-09-01 1551.401
1996-10-01 1686.508
1996-11-01 1576.204
1996-12-01 1700.433
1997-01-01 1396.588
1997-02-01 1371.690
1997-03-01 1707.522
1997-04-01 1654.604
1997-05-01 1762.903
1997-06-01 1775.800
1997-07-01 1934.219
1997-08-01 2008.055
1997-09-01 1615.924
1997-10-01 1773.910
1997-11-01 1732.368
1997-12-01 1796.626
1998-01-01 1570.330
1998-02-01 1412.691
1998-03-01 1754.641
1998-04-01 1824.932
1998-05-01 1843.289
1998-06-01 1825.964
1998-07-01 1968.172
1998-08-01 1921.645
1998-09-01 1669.597
1998-10-01 1791.474
1998-11-01 1816.714
1998-12-01 1846.754
1999-01-01 1599.427
1999-02-01 1548.804
1999-03-01 1832.333
1999-04-01 1839.720
1999-05-01 1846.498
1999-06-01 1864.852
1999-07-01 1965.743
1999-08-01 1949.002
1999-09-01 1607.373
1999-10-01 1803.664
1999-11-01 1850.309
1999-12-01 1836.435
2000-01-01 1541.660
2000-02-01 1616.928
2000-03-01 1919.538
2000-04-01 1971.493
2000-05-01 1992.301
2000-06-01 2009.763
2000-07-01 2053.996
2000-08-01 2097.471
2000-09-01 1823.706
2000-10-01 1976.997
2000-11-01 1981.408
2000-12-01 2000.153
2001-01-01 1683.148
2001-02-01 1663.404
2001-03-01 2007.928
2001-04-01 2023.792
2001-05-01 2047.008
2001-06-01 2072.913
2001-07-01 2126.717
2001-08-01 2202.638
2001-09-01 1707.693
2001-10-01 1950.716
2001-11-01 1973.614
2001-12-01 1984.729
2002-01-01 1759.629
2002-02-01 1770.595
2002-03-01 2019.912
2002-04-01 2048.398
2002-05-01 2068.763
2002-06-01 1994.267
2002-07-01 2075.258
2002-08-01 2026.560
2002-09-01 1734.155
2002-10-01 1916.771
2002-11-01 1858.345
2002-12-01 1996.352
2003-01-01 1778.033
2003-02-01 1749.489
2003-03-01 2066.466
2003-04-01 2098.899
2003-05-01 2104.911
2003-06-01 2129.671
2003-07-01 2223.349
2003-08-01 2174.360
2003-09-01 1931.406
2003-10-01 2121.470
2003-11-01 2076.054
2003-12-01 2140.677
2004-01-01 1831.508
2004-02-01 1838.006
2004-03-01 2132.446
Month  <- as.Date(ridership.ts) %>%                  # Date 추출NAlubridate::month()                                 # Month 추출NATrain.Data <- data.frame("y"=train.ts, "Month"= Month[1:length(train.ts)])
Test.Data  <- data.frame("y"=test.ts, "Month"= Month[-(1:length(train.ts))])


BSTS.fit2 <- bsts(y ~ Month, state.specification = ss, data = Train.Data, niter = 1000, seed=100)  # niter : MCMC 반복
=-=-=-=-= Iteration 0 Thu May 19 02:28:41 2022
 =-=-=-=-=
=-=-=-=-= Iteration 100 Thu May 19 02:28:41 2022
 =-=-=-=-=
=-=-=-=-= Iteration 200 Thu May 19 02:28:42 2022
 =-=-=-=-=
=-=-=-=-= Iteration 300 Thu May 19 02:28:42 2022
 =-=-=-=-=
=-=-=-=-= Iteration 400 Thu May 19 02:28:42 2022
 =-=-=-=-=
=-=-=-=-= Iteration 500 Thu May 19 02:28:42 2022
 =-=-=-=-=
=-=-=-=-= Iteration 600 Thu May 19 02:28:43 2022
 =-=-=-=-=
=-=-=-=-= Iteration 700 Thu May 19 02:28:43 2022
 =-=-=-=-=
=-=-=-=-= Iteration 800 Thu May 19 02:28:43 2022
 =-=-=-=-=
=-=-=-=-= Iteration 900 Thu May 19 02:28:43 2022
 =-=-=-=-=
summary(BSTS.fit2)
$residual.sd
[1] 38.38387

$prediction.sd
[1] 86.37815

$rsquare
[1] 0.9424972

$relative.gof
[1] 0.7358398

$size
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
0.00000 0.00000 0.00000 0.07092 0.00000 1.00000 

$coefficients
                 mean       sd mean.inc   sd.inc   inc.prob
Month       0.1077952 0.539501 1.519912 1.414357 0.07092199
(Intercept) 0.0000000 0.000000 0.000000 0.000000 0.00000000

회귀 계수

# Ref. https://michaeldewittjr.com/programming/2018-07-05-bayesian-time-series-analysis-with-bsts_files/

burn2 <- SuggestBurn(0.1, BSTS.fit2)

### MCMC sample에 0인 값들 삭제하고 평균 내기 위한 함수NAPositiveMean <- function(b) {
  b <- b[abs(b) > 0]
  if (length(b) > 0) 
    return(mean(b))
  return(0)
}

### Get the average coefficients when variables were selected (non-zero slopes)
coeff <- data.frame(reshape2::melt(apply(BSTS.fit2$coefficients[-(1:burn2),], 2, PositiveMean)))  # Fun : mean (0 포함 평균)NAcoeff$Variable <- as.character(row.names(coeff))
coeff
               value    Variable
(Intercept) 0.000000 (Intercept)
Month       1.519912       Month
ggplot(data=coeff, aes(x=reorder(Variable,value), y=value)) + 
  coord_flip()+
  geom_bar(stat="identity", position="identity") + 
  theme(axis.text.x=element_text(angle = -90, hjust = 0)) +
  xlab("") + ylab("") + ggtitle("Average coefficients")

### Inclusion probabilities -- i.e., how often were the variables selected 
inclusionprobs <- reshape2::melt(colMeans(BSTS.fit2$coefficients[-(1:burn2),] != 0))
inclusionprobs$Variable <- as.character(row.names(inclusionprobs))
ggplot(data=inclusionprobs, aes(x=reorder(Variable, value), y=value)) + 
  geom_bar(stat="identity", position="identity") + 
  theme(axis.text.x=element_text(angle = -90, hjust = 0)) + 
  coord_flip()+
  xlab("") + ylab("") + ggtitle("Inclusion probabilities")


예측 with 예측 변수

# 예측
BSTS.forecast2 <- predict(BSTS.fit2, horizon = n.test, burn = burn2, newdata = Month[-(1:length(train.ts))], # newdata : 예측변수를 포함하는 변수
                          quantiles = c(0.025, 0.975))  # horizon : the number of prediction
BSTS.forecast2$mean
 [1] 2008.649 2033.859 2005.471 2115.656 2159.608 1849.589 1985.606
 [8] 1980.895 2025.518 1771.432 1738.900 2042.580 2046.658 2077.231
[15] 2049.163 2155.931 2204.486 1889.507 2028.428 2029.719 2066.013
[22] 1816.823 1787.502 2091.551 2097.854 2122.777 2100.560 2208.762
[29] 2256.867 1945.645 2080.120 2076.238 2118.645 1870.599 1843.099
[36] 2145.638
plot(BSTS.forecast2, plot.original = 100)
accuracy(BSTS.forecast2$mean, test.ts)
                ME     RMSE      MAE       MPE     MAPE      ACF1
Test set -29.77804 66.58763 45.96603 -1.599523 2.379736 0.5265906
         Theil's U
Test set 0.3875899

Reuse

Text and figures are licensed under Creative Commons Attribution CC BY 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".