Regression based on forecast

Time Series

Regression based on Forecast in Time Series Data

Yeongeun Jeon , Jung In Seo
05-28-2020
Data Mining for Business Analytics을 공부하면서 요약하였다.

1 추세를 가진 모형

pacman::p_load("data.table",
               "forecast")

# Data 불러오기Amtrak.data <- fread(paste(getwd(),"Amtrak.csv", sep="/"))

# Create time series

ridership.ts <- ts(Amtrak.data$Ridership, start=c(1991,1), end=c(2004,3), freq=12)
plot(ridership.ts, xlab="Time", ylab="Ridership", ylim=c(1300,2300), bty="l")

선형 추세

# Decompose train data and validation data

train.ts <- window(ridership.ts,start=c(1991,1), end=c(2001,3))
valid.ts <- window(ridership.ts,start=c(2001,4))
nValid   <- length(valid.ts)

# Fit linear trend model to training set and create forecasts

train.lm      <- tslm(train.ts ~ trend)
train.lm.pred <- forecast(train.lm, h=nValid, level=0)   # h : Number of periods for forecasting / level : Confidence level for prediction intervals.
# Plot

plot(train.lm.pred, ylim=c(1300,2600), ylab="Ridership", xlab="Time", bty="l", xaxt="n", xlim=c(1991,2006.25), main="", flty=2)
axis(1, at=seq(1991,2006,1), labels=format(seq(1991,2006,1)))     
lines(train.lm.pred$fitted, lwd=2, col="blue")     
lines(valid.ts)
plot(train.lm.pred$residuals, ylim=c(-420,500), ylab="Residual", xlab="Time", bty="l", xaxt="n", xlim=c(1996,2006.25), main="")
axis(1, at=seq(1991,2006,1), labels=format(seq(1991,2006,1)))
lines(valid.ts-train.lm.pred$mean, lwd=1)

summary(train.lm)

Call:
tslm(formula = train.ts ~ trend)

Residuals:
    Min      1Q  Median      3Q     Max 
-411.29 -114.02   16.06  129.28  306.35 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 1750.3595    29.0729  60.206   <2e-16 ***
trend          0.3514     0.4069   0.864     0.39    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 160.2 on 121 degrees of freedom
Multiple R-squared:  0.006125,  Adjusted R-squared:  -0.002089 
F-statistic: 0.7456 on 1 and 121 DF,  p-value: 0.3896

지수 추세

# Fit exponential trend using tslm() with argument lambda=0

train.lm.expo.trend      <- tslm(train.ts ~ trend, lambda=0)
train.lm.expo.trend.pred <- forecast(train.lm.expo.trend, h=nValid, level=0)

# Fit linear trend using tslm() with argument lambda = 1 (No transform of y)

train.lm.linear.trend      <- tslm(train.ts ~ trend, lambda=1)
train.lm.linear.trend.pred <- forecast(train.lm.linear.trend, h=nValid, level=0)
# Plot

plot(train.lm.expo.trend.pred, ylim=c(1300,2600), ylab="Ridership", xlab="Time", bty="l", xaxt="n", xlim=c(1991,2006.25), main="", flty=2)
axis(1, at=seq(1991,2006,1), labels=format(seq(1991,2006,1))) 
lines(train.lm.expo.trend.pred$fitted, lwd=2, col="blue") 
lines(train.lm.linear.trend.pred$fitted, lwd=2, col="black", lty=3)
lines(train.lm.linear.trend.pred$mean, lwd=2, col="black", lty=3)
lines(train.ts)
lines(valid.ts)

다항 추세

# Fit quadratic trend using function I(), which trats an object "as is"

train.lm.poly.trend      <- tslm(train.ts ~ trend +  I(trend^2))
train.lm.poly.trend.pred <- forecast(train.lm.poly.trend, h=nValid, level=0)
plot(train.lm.poly.trend.pred, ylim=c(1300,2600), ylab="Ridership", xlab="Time", bty="l", xaxt="n", xlim=c(1991,2006.25), main="", flty=2)
axis(1, at=seq(1991,2006,1), labels=format(seq(1991,2006,1))) 
lines(train.lm.poly.trend.pred$fitted, lwd=2)
lines(valid.ts)

plot(train.lm.poly.trend.pred$residuals,  ylim=c(-420,500), ylab="Residual", xlab="Time", bty="l", xaxt="n", xlim=c(1996,2006.25), main="")
axis(1, at=seq(1991,2006,1), labels=format(seq(1991,2006,1))) 
lines(valid.ts-train.lm.poly.trend.pred$mean, lwd=1)

2 계절성을 가진 모형

Additive 계절성

# Include season as a predictor in tslm(). Here it creates 11 dummies,
# One for each month except for the first season, January.

train.lm.season      <- tslm(train.ts ~  season)     # 참조변수 = Season 1
train.lm.season.pred <- forecast(train.lm.season, h=nValid, level=0)

summary(train.lm.season)

Call:
tslm(formula = train.ts ~ season)

Residuals:
     Min       1Q   Median       3Q      Max 
-276.165  -52.934    5.868   54.544  215.081 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  1573.97      30.58  51.475  < 2e-16 ***
season2       -42.93      43.24  -0.993   0.3230    
season3       260.77      43.24   6.030 2.19e-08 ***
season4       245.09      44.31   5.531 2.14e-07 ***
season5       278.22      44.31   6.279 6.81e-09 ***
season6       233.46      44.31   5.269 6.82e-07 ***
season7       345.33      44.31   7.793 3.79e-12 ***
season8       396.66      44.31   8.952 9.19e-15 ***
season9        75.76      44.31   1.710   0.0901 .  
season10      200.61      44.31   4.527 1.51e-05 ***
season11      192.36      44.31   4.341 3.14e-05 ***
season12      230.42      44.31   5.200 9.18e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 101.4 on 111 degrees of freedom
Multiple R-squared:  0.6348,    Adjusted R-squared:  0.5986 
F-statistic: 17.54 on 11 and 111 DF,  p-value: < 2.2e-16
plot(train.lm.season.pred, ylim=c(1300,2600), ylab="Ridership", xlab="Time", bty="l", xaxt="n", xlim=c(1991,2006.25), main="", flty=2)
axis(1, at=seq(1991,2006,1), labels=format(seq(1991,2006,1))) 
lines(train.lm.season.pred$fitted, lwd=2)
lines(valid.ts)

plot(train.lm.season.pred$residuals,  ylim=c(-420,500), ylab="Residual", xlab="Time", bty="l", xaxt="n", xlim=c(1996,2006.25), main="")
axis(1, at=seq(1991,2006,1), labels=format(seq(1991,2006,1))) 
lines(valid.ts-train.lm.season.pred$mean, lwd=1)

Multiplicative 계절성

# Multiplicative seasonality

train.expo.lm.season      <- tslm(train.ts ~  season, lambda=0)
train.expo.lm.season.pred <- forecast(train.expo.lm.season, h=nValid, level=0)

# Plot

plot(train.expo.lm.season.pred, ylim=c(1300,2600), ylab="Ridership", xlab="Time", bty="l", xaxt="n", xlim=c(1991,2006.25), main="", flty=2)
axis(1, at=seq(1991,2006,1), labels=format(seq(1991,2006,1))) 
lines(train.expo.lm.season.pred$fitted, lwd=2)

3 추세와 계절성을 가진 모형

train.lm.trend.season      <- tslm(train.ts ~ trend + I(trend^2) + season)
train.lm.trend.season.pred <- forecast(train.lm.trend.season, h=nValid, level=0)

summary(train.lm.trend.season)

Call:
tslm(formula = train.ts ~ trend + I(trend^2) + season)

Residuals:
     Min       1Q   Median       3Q      Max 
-213.775  -39.363    9.711   42.422  152.187 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  1.697e+03  2.768e+01  61.318  < 2e-16 ***
trend       -7.156e+00  7.293e-01  -9.812  < 2e-16 ***
I(trend^2)   6.074e-02  5.698e-03  10.660  < 2e-16 ***
season2     -4.325e+01  3.024e+01  -1.430  0.15556    
season3      2.600e+02  3.024e+01   8.598 6.60e-14 ***
season4      2.606e+02  3.102e+01   8.401 1.83e-13 ***
season5      2.938e+02  3.102e+01   9.471 6.89e-16 ***
season6      2.490e+02  3.102e+01   8.026 1.26e-12 ***
season7      3.606e+02  3.102e+01  11.626  < 2e-16 ***
season8      4.117e+02  3.102e+01  13.270  < 2e-16 ***
season9      9.032e+01  3.102e+01   2.911  0.00437 ** 
season10     2.146e+02  3.102e+01   6.917 3.29e-10 ***
season11     2.057e+02  3.103e+01   6.629 1.34e-09 ***
season12     2.429e+02  3.103e+01   7.829 3.44e-12 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 70.92 on 109 degrees of freedom
Multiple R-squared:  0.8246,    Adjusted R-squared:  0.8037 
F-statistic: 39.42 on 13 and 109 DF,  p-value: < 2.2e-16
plot(train.lm.trend.season.pred, ylim=c(1300,2600), ylab="Ridership", xlab="Time", bty="l", xaxt="n", xlim=c(1991,2006.25), main="", flty=2)
axis(1, at=seq(1991,2006,1), labels=format(seq(1991,2006,1))) 
lines(train.lm.trend.season.pred$fitted, lwd=2)
lines(valid.ts)

plot(train.lm.trend.season.pred$residuals,  ylim=c(-420,500), ylab="Residual", xlab="Time", bty="l", xaxt="n", xlim=c(1996,2006.25), main="")
axis(1, at=seq(1991,2006,1), labels=format(seq(1991,2006,1))) 
lines(valid.ts-train.lm.trend.season.pred$mean, lwd=1)

4 자기상관과 ARIMA 모형

자기상관

reidership.24.ts <- window(train.ts, start=c(1991,1), end=c(1991,24) )
Acf(reidership.24.ts, lag.max=12, main="")

잔차의 자기상관

Acf(train.lm.trend.season$residuals, lag.max = 12, main="")

통합된 자기상관 정보에 의한 예측 향상

잔차에 대한 2차 예측모형 구축

# Fit linear regression with quadratic tren and seasonality to Ridership 

train.lm.trend.season <- tslm(train.ts ~ trend + I(trend^2) + season)    # 관측값에 대한 예측모형
# Fit AR(1) model to training residuals
# Use Arima() in the forecast package to fit an ARIMA model

train.res.arima      <- Arima(train.lm.trend.season$residuals, order=c(1,0,0))   # 잔차에 AR(1) 모형 적합NAtrain.res.arima.pred <- forecast(train.res.arima, h=nValid, level=0)             # 적합된 AR(1) 모형으로 미래 잔차 예측NAsummary(train.res.arima)
Series: train.lm.trend.season$residuals 
ARIMA(1,0,0) with non-zero mean 

Coefficients:
         ar1     mean
      0.5998   0.3728
s.e.  0.0712  11.8408

sigma^2 estimated as 2876:  log likelihood=-663.54
AIC=1333.08   AICc=1333.29   BIC=1341.52

Training set error measures:
                     ME     RMSE     MAE      MPE     MAPE      MASE
Training set -0.1223159 53.19141 39.8277 103.3885 175.2219 0.5721356
                    ACF1
Training set -0.07509305
# Plot

plot(train.lm.trend.season$residuals, ylim=c(-250,250), ylab="Residual", xlab="Time", bty="l", xaxt="n", xlim=c(1991,2006.25), main="")
axis(1, at=seq(1991,2006,1), labels=format(seq(1991,2006,1)))
lines(train.res.arima.pred$fitted, lwd=2, col="blue")

valid.res.arima.pred <- forecast(train.res.arima, h=1)
valid.res.arima.pred
         Point Forecast     Lo 80   Hi 80     Lo 95    Hi 95
Apr 2001        7.41111 -61.31748 76.1397 -97.70019 112.5224
forecast(train.lm.trend.season, h=1)
         Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
Apr 2001       2004.271 1905.227 2103.315 1852.024 2156.518
Acf(train.res.arima$residuals, lag.max = 12)

# Improved forecast 

res.arima         <- as.data.frame(train.res.arima.pred)[,1]
fore              <- as.data.frame(forecast(train.lm.trend.season, h=nValid, level=0 ))[,1]
Improved_forecast <- apply(cbind(res.arima,fore), 1,sum)   # Data.frame 변환Improved_forecast
 [1] 2011.682 2050.014 2011.580 2130.452 2189.195 1875.951 2008.596
 [8] 2008.231 2054.230 1820.195 1785.984 2098.412 2108.310 2150.910
[15] 2115.620 2236.961 2297.769 1986.346 2120.668 2121.891 2169.427
[22] 1936.896 1904.171 2218.074 2229.440 2273.504 2239.676 2362.476
[29] 2424.743 2114.779 2250.559 2253.241 2302.235 2071.162 2039.895
[36] 2355.256
# Plot

Improved_forecast <- ts(Improved_forecast, start=c(2001,4), end=c(2004,3), freq=12) # Time series 변환plot(train.lm.trend.season.pred, ylim=c(1300,2600), ylab="Ridership", xlab="Time", bty="l", xaxt="n", xlim=c(1991,2006.25), main="")
axis(1, at=seq(1991,2006,1), labels=format(seq(1991,2006,1))) 
lines(Improved_forecast, lwd=1, col="orange")
lines(valid.ts)

Reuse

Text and figures are licensed under Creative Commons Attribution CC BY 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".