Machine Learning

Time Series

Machine Learning Application Method for Time Series Data

Yeongeun Jeon, Jung In Seo
09-21-2021

Introduction

This post describes how to apply tree-based machine learning techniques to time series data. The overall workflow is: decompose the training series with STL, train the machine learning models on the detrended part (seasonal + remainder) using Fourier terms and the month as predictors, forecast the trend component separately with an ARIMA model, and then add the two forecasts back together.


Application


Loading the Data

pacman::p_load( "dplyr", "xts", 
                "caret", "caretEnsemble",
                "forecast", 
                "ggplot2")

library(doParallel)
library(parallel)

# cl <- makePSOCKcluster(detectCores())
# clusterEvalQ(cl, library(foreach))
# registerDoParallel(cores=cl)

registerDoParallel(cores=detectCores())

# In Mac
# guess_encoding("Amtrak.csv")
# Amtrak.data <- read.csv("Amtrak.csv", fileEncoding="EUC-KR")

Amtrak.data <- read.csv("C:/Users/User/Desktop/Amtrak.csv")
ridership.ts <- ts(Amtrak.data$Ridership, start=c(1991,1), end=c(2004,3), freq=12)

Splitting the Data

train.ts <- window(ridership.ts,start=c(1991,1), end=c(2001,3))
test.ts  <- window(ridership.ts,start=c(2001,4))
n.test    <- length(test.ts)
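
The split leaves 123 monthly observations (January 1991 through March 2001) for training and 36 (April 2001 through March 2004) for testing. A minimal sanity check on the objects just created:

length(train.ts)   # 123 training months
n.test             # 36 test months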

Creating Predictor Variables

# 1. Fourier Term
FT      <- fourier(train.ts, K=2)                    # K : number of sine/cosine pairs; if the series has two or more seasonal periods, one K is needed per period
FT.Test <- fourier(train.ts, K=2, h=n.test)

# 2. Month
xts(ridership.ts, order.by = as.Date(ridership.ts))
               [,1]
1991-01-01 1708.917
1991-02-01 1620.586
1991-03-01 1972.715
1991-04-01 1811.665
1991-05-01 1974.964
1991-06-01 1862.356
1991-07-01 1939.860
1991-08-01 2013.264
1991-09-01 1595.657
1991-10-01 1724.924
1991-11-01 1675.667
1991-12-01 1813.863
1992-01-01 1614.827
1992-02-01 1557.088
1992-03-01 1891.223
1992-04-01 1955.981
1992-05-01 1884.714
1992-06-01 1623.042
1992-07-01 1903.309
1992-08-01 1996.712
1992-09-01 1703.897
1992-10-01 1810.000
1992-11-01 1861.601
1992-12-01 1875.122
1993-01-01 1705.259
1993-02-01 1618.535
1993-03-01 1836.709
1993-04-01 1957.043
1993-05-01 1917.185
1993-06-01 1882.398
1993-07-01 1933.009
1993-08-01 1996.167
1993-09-01 1672.841
1993-10-01 1752.827
1993-11-01 1720.377
1993-12-01 1734.292
1994-01-01 1563.365
1994-02-01 1573.959
1994-03-01 1902.639
1994-04-01 1833.888
1994-05-01 1831.049
1994-06-01 1775.755
1994-07-01 1867.508
1994-08-01 1906.608
1994-09-01 1685.632
1994-10-01 1778.546
1994-11-01 1775.995
1994-12-01 1783.350
1995-01-01 1548.415
1995-02-01 1496.925
1995-03-01 1798.316
1995-04-01 1732.895
1995-05-01 1772.345
1995-06-01 1761.207
1995-07-01 1791.655
1995-08-01 1874.820
1995-09-01 1571.309
1995-10-01 1646.948
1995-11-01 1672.631
1995-12-01 1656.845
1996-01-01 1381.758
1996-02-01 1360.852
1996-03-01 1558.575
1996-04-01 1608.420
1996-05-01 1696.696
1996-06-01 1693.183
1996-07-01 1835.516
1996-08-01 1942.573
1996-09-01 1551.401
1996-10-01 1686.508
1996-11-01 1576.204
1996-12-01 1700.433
1997-01-01 1396.588
1997-02-01 1371.690
1997-03-01 1707.522
1997-04-01 1654.604
1997-05-01 1762.903
1997-06-01 1775.800
1997-07-01 1934.219
1997-08-01 2008.055
1997-09-01 1615.924
1997-10-01 1773.910
1997-11-01 1732.368
1997-12-01 1796.626
1998-01-01 1570.330
1998-02-01 1412.691
1998-03-01 1754.641
1998-04-01 1824.932
1998-05-01 1843.289
1998-06-01 1825.964
1998-07-01 1968.172
1998-08-01 1921.645
1998-09-01 1669.597
1998-10-01 1791.474
1998-11-01 1816.714
1998-12-01 1846.754
1999-01-01 1599.427
1999-02-01 1548.804
1999-03-01 1832.333
1999-04-01 1839.720
1999-05-01 1846.498
1999-06-01 1864.852
1999-07-01 1965.743
1999-08-01 1949.002
1999-09-01 1607.373
1999-10-01 1803.664
1999-11-01 1850.309
1999-12-01 1836.435
2000-01-01 1541.660
2000-02-01 1616.928
2000-03-01 1919.538
2000-04-01 1971.493
2000-05-01 1992.301
2000-06-01 2009.763
2000-07-01 2053.996
2000-08-01 2097.471
2000-09-01 1823.706
2000-10-01 1976.997
2000-11-01 1981.408
2000-12-01 2000.153
2001-01-01 1683.148
2001-02-01 1663.404
2001-03-01 2007.928
2001-04-01 2023.792
2001-05-01 2047.008
2001-06-01 2072.913
2001-07-01 2126.717
2001-08-01 2202.638
2001-09-01 1707.693
2001-10-01 1950.716
2001-11-01 1973.614
2001-12-01 1984.729
2002-01-01 1759.629
2002-02-01 1770.595
2002-03-01 2019.912
2002-04-01 2048.398
2002-05-01 2068.763
2002-06-01 1994.267
2002-07-01 2075.258
2002-08-01 2026.560
2002-09-01 1734.155
2002-10-01 1916.771
2002-11-01 1858.345
2002-12-01 1996.352
2003-01-01 1778.033
2003-02-01 1749.489
2003-03-01 2066.466
2003-04-01 2098.899
2003-05-01 2104.911
2003-06-01 2129.671
2003-07-01 2223.349
2003-08-01 2174.360
2003-09-01 1931.406
2003-10-01 2121.470
2003-11-01 2076.054
2003-12-01 2140.677
2004-01-01 1831.508
2004-02-01 1838.006
2004-03-01 2132.446
Month  <- as.Date(ridership.ts) %>%                  # Extract dates
  lubridate::month()                                 # Extract month

## Combine with the Fourier terms
Train.X <- cbind("Month"= Month[1:length(train.ts)], FT) 
Test.X  <- cbind("Month"= Month[-(1:length(train.ts))], FT.Test)  
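
Before decomposing, it can help to glance at the design matrices. This is an optional check of objects created above; the column names S1-12, C1-12, S2-12, C2-12 are the ones fourier() assigns for K = 2 with a period-12 series.

head(FT, 3)    # first rows of the Fourier terms
dim(Train.X)   # 123 rows (training months) x 5 predictors
dim(Test.X)    # 36 rows (test months) x 5 predictors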

Decomposition

decomp.ts <- stl(train.ts, s.window = "periodic", robust = TRUE)$time.series 
# decomp.ts <- mstl(Power.msts, s.window = "periodic", robust = TRUE) # For multiple seasonal periods

# Target without Trend
Target <- decomp.ts %>% 
  data.frame %>%
  rowwise() %>%                                        # Operate row by row
  dplyr::mutate(y = sum(seasonal, remainder)) %>%      # Target = Seasonal + Remainder => Detrended series
  dplyr::select(y)


Train.Data <- cbind(Target, Train.X)
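
To see what the models will actually be trained on, the STL components can be plotted; this optional sketch only uses the decomposition already stored in decomp.ts.

plot(decomp.ts, main = "STL components of the training series")   # seasonal, trend, remainder panels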

Applying Machine Learning


Random Forest

set.seed(100)
fitControl <- trainControl(method = "adaptive_cv",   # cv, repeatedcv
                           number = 5,
                           repeats = 5,
                           adaptive = list(min = 5,
                                           alpha = 0.05,
                                           method = "BT",
                                           complete = TRUE),
                           search = "random",
                           allowParallel = TRUE) 

RF <- function(train, tuneLength, ntree = 500, nodesize = 5){
  
  set.seed(100)                                        # Fix the seed for cross-validation
  caret.rf <- caret::train(y~., data = train, 
                           method = "parRF",           # Tune Parameter : mtry
                           trControl = fitControl,
                           tuneLength = tuneLength,   
                           ntree = ntree,             
                           nodesize = nodesize,        # nodesize : minimum size of terminal nodes
                           importance = TRUE)   
  
  return(caret.rf)
  
}

RF.Caret <- RF(Train.Data, 
               tuneLength = 2,     # tuneLength : number of candidate parameter values to try
               ntree = 100)        # ntree : number of trees to grow

RF.Caret
Parallel Random Forest 

123 samples
  5 predictor

No pre-processing
Resampling: Adaptively Cross-Validated (5 fold, repeated 5 times) 
Summary of sample sizes: 99, 99, 97, 99, 98, 99, ... 
Resampling results across tuning parameters:

  mtry  RMSE      Rsquared   MAE       Resamples
  1     52.87737  0.8892022  40.21114  25       
  3     50.21355  0.8949560  38.53876  25       

RMSE was used to select the optimal model using the smallest value.
The final value used for the model was mtry = 3.
RF.Caret$finalModel

Call:
 randomForest(x = "x", y = "y", ntree = 13, mtry = 3L, nodesize = 5,      importance = TRUE) 
               Type of random forest: regression
                     Number of trees: 104
No. of variables tried at each split: 3
RF.Caret$finalModel$tuneValue
  mtry
2    3
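
If the resampling summary is needed as a data frame rather than the printed report, caret keeps it in the results element of the train object (a minimal check on the fit above):

RF.Caret$results[, c("mtry", "RMSE", "Rsquared", "MAE")]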

XGBoost

set.seed(100)
fitControl <- trainControl(method = "adaptive_cv",   # cv, repeatedcv
                           number = 5,
                           repeats = 5,
                           adaptive = list(min = 5,
                                           alpha = 0.05,
                                           method = "BT",
                                           complete = TRUE),
                           search = "random",
                           allowParallel = TRUE) 

XGBoost <- function(train, tuneLength){
  
  set.seed(100)                                         # Fix the seed for cross-validation
  caret.xgb <- caret::train(y~., data = train, 
                            method = "xgbTree",          
                            trControl = fitControl,
                            # objective = "reg:squarederror", # error(The following parameters were provided multiple times)
                            tuneLength = tuneLength     # tuneLength : number of candidate parameter values to try
  )   
  
  return(caret.xgb)
  
}

XGB.Caret <- XGBoost(Train.Data, 2)
XGB.Caret
eXtreme Gradient Boosting 

123 samples
  5 predictor

No pre-processing
Resampling: Adaptively Cross-Validated (5 fold, repeated 5 times) 
Summary of sample sizes: 99, 99, 97, 99, 98, 99, ... 
Resampling results across tuning parameters:

  eta        max_depth  gamma     colsample_bytree  min_child_weight
  0.2228220  9          1.702621  0.6528662         3               
  0.4876292  3          5.465586  0.5499986         5               
  subsample  nrounds  RMSE      Rsquared   MAE       Resamples
  0.7517663  624      50.21422  0.8931412  38.60326  25       
  0.8219133  358      50.06911  0.8961369  38.41744  25       

RMSE was used to select the optimal model using the smallest value.
The final values used for the model were nrounds = 358, max_depth
 = 3, eta = 0.4876292, gamma = 5.465586, colsample_bytree
 = 0.5499986, min_child_weight = 5 and subsample = 0.8219133.
XGB.Caret$finalModel
##### xgb.Booster
raw: 181.9 Kb 
call:
  xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
    gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
    min_child_weight = param$min_child_weight, subsample = param$subsample), 
    data = x, nrounds = param$nrounds, objective = "reg:linear")
params (as set within xgb.train):
  eta = "0.48762916797353", max_depth = "3", gamma = "5.46558595029637", colsample_bytree = "0.549998590815812", min_child_weight = "5", subsample = "0.82191331172362", objective = "reg:linear", silent = "1"
xgb.attributes:
  niter
callbacks:
  cb.print.evaluation(period = print_every_n)
# of features: 5 
niter: 358
nfeatures : 5 
xNames : Month `S1-12` `C1-12` `S2-12` `C2-12` 
problemType : Regression 
tuneValue :
      nrounds max_depth       eta    gamma colsample_bytree
2     358         3 0.4876292 5.465586        0.5499986
  min_child_weight subsample
2                5 0.8219133
obsLevels : NA 
param :
    list()
XGB.Caret$finalModel$tuneValue
  nrounds max_depth       eta    gamma colsample_bytree
2     358         3 0.4876292 5.465586        0.5499986
  min_child_weight subsample
2                5 0.8219133

Stacking

# Ref. https://cran.r-project.org/web/packages/caretEnsemble/vignettes/caretEnsemble-intro.html
#      https://github.com/zachmayer/caretEnsemble/blob/master/R/caretStack.R


# 1. Modeling for Stacking (Declare Individual Model)
set.seed(100)
fitControl <- trainControl(method = "repeatedcv",        # adaptive_cv
                           number = 5,
                           repeats = 5,
                           # adaptive = list(min = 5,
                           #                 alpha = 0.05,
                           #                 method = "BT",
                           #                 complete = TRUE),
                           # search = "random",            # grid
                           savePredictions = "final",      # Save predictions for the optimal parameters
                           # classProbs = TRUE,            # Save class probabilities (classification only)
                           index = createResample(Train.Data$y, 1),  # index : rows used for training in each resample / createResample : bootstrap resampling
                           allowParallel = TRUE) 


# Hyperparameter values obtained from tuning on the original training data
alg_tune_list <- list(                            # Do not use custom names in list. Will give prediction error with greedy ensemble. Bug in caret.
  parRF = caretModelSpec(method="parRF",
                         importance = TRUE,
                         nodesize = 5,
                         tuneGrid = expand.grid(mtry=RF.Caret$finalModel$tuneValue$mtry)),
  xgbTree = caretModelSpec(method="xgbTree",
                           tuneGrid = expand.grid(nrounds = XGB.Caret$finalModel$tuneValue$nrounds,
                                                  max_depth = XGB.Caret$finalModel$tuneValue$max_depth,
                                                  eta = XGB.Caret$finalModel$tuneValue$eta,
                                                  gamma = XGB.Caret$finalModel$tuneValue$gamma,
                                                  colsample_bytree = XGB.Caret$finalModel$tuneValue$colsample_bytree,
                                                  min_child_weight = XGB.Caret$finalModel$tuneValue$min_child_weight,
                                                  subsample = XGB.Caret$finalModel$tuneValue$subsample)))

set.seed(100)
multi_mod <- caretList(y~., data = Train.Data, trControl = fitControl, 
                       tuneList = alg_tune_list)  # search = "grid"     


multi_mod$parRF
Parallel Random Forest 

123 samples
  5 predictor

No pre-processing
Resampling: Cross-Validated (5 fold, repeated 5 times) 
Summary of sample sizes: 123 
Resampling results:

  RMSE      Rsquared   MAE    
  55.73908  0.8368916  42.4932

Tuning parameter 'mtry' was held constant at a value of 3
multi_mod$xgbTree
eXtreme Gradient Boosting 

123 samples
  5 predictor

No pre-processing
Resampling: Cross-Validated (5 fold, repeated 5 times) 
Summary of sample sizes: 123 
Resampling results:

  RMSE      Rsquared  MAE     
  58.30165  0.827723  45.50672

Tuning parameter 'nrounds' was held constant at a value of 358
 held constant at a value of 5
Tuning parameter 'subsample' was
 held constant at a value of 0.8219133
# 2. Stacking (the final model is trained on data that combines the predictions of the individual models)

set.seed(100)
stackControl <- trainControl(method = "adaptive_cv",
                             number = 5,
                             repeats = 5,
                             adaptive = list(min = 5,
                                             alpha = 0.05,
                                             method = "BT",
                                             complete = TRUE),
                             search = "random",
                             allowParallel = TRUE) 

set.seed(100)
stack.xgb <- caretStack(multi_mod, method = "xgbTree",  # Final Model
                        trControl = stackControl, 
                        tuneLength = 2)                 # Number of candidate parameter values
stack.xgb
A xgbTree ensemble of 2 base models: parRF, xgbTree

Ensemble results:
eXtreme Gradient Boosting 

38 samples
 2 predictor

No pre-processing
Resampling: Adaptively Cross-Validated (5 fold, repeated 5 times) 
Summary of sample sizes: 30, 31, 31, 30, 30, 30, ... 
Resampling results across tuning parameters:

  eta        max_depth  gamma     colsample_bytree  min_child_weight
  0.2228220  9          1.702621  0.6528662         3               
  0.4876292  3          5.465586  0.5499986         5               
  subsample  nrounds  RMSE      Rsquared  MAE       Resamples
  0.7517663  624      48.62987  0.929678  38.75318  25       
  0.8219133  358      57.27670  0.924131  43.13093  25       

RMSE was used to select the optimal model using the smallest value.
The final values used for the model were nrounds = 624, max_depth
 = 9, eta = 0.222822, gamma = 1.702621, colsample_bytree =
 0.6528662, min_child_weight = 3 and subsample = 0.7517663.
stack.xgb$ens_model$finalModel
##### xgb.Booster
raw: 357.5 Kb 
call:
  xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth, 
    gamma = param$gamma, colsample_bytree = param$colsample_bytree, 
    min_child_weight = param$min_child_weight, subsample = param$subsample), 
    data = x, nrounds = param$nrounds, objective = "reg:linear")
params (as set within xgb.train):
  eta = "0.222822001739405", max_depth = "9", gamma = "1.70262051047757", colsample_bytree = "0.652866207249463", min_child_weight = "3", subsample = "0.751766284287442", objective = "reg:linear", silent = "1"
xgb.attributes:
  niter
callbacks:
  cb.print.evaluation(period = print_every_n)
# of features: 2 
niter: 624
nfeatures : 2 
xNames : parRF xgbTree 
problemType : Regression 
tuneValue :
      nrounds max_depth      eta    gamma colsample_bytree
1     624         9 0.222822 1.702621        0.6528662
  min_child_weight subsample
1                3 0.7517663
obsLevels : NA 
param :
    list()
stack.xgb$ens_model$finalModel$tuneValue
  nrounds max_depth      eta    gamma colsample_bytree
1     624         9 0.222822 1.702621        0.6528662
  min_child_weight subsample
1                3 0.7517663

Stacking with GLM

\[ \begin{aligned} \hat{Y} = \beta_{0} + \sum^{m}_{i=1} \beta_{i}\hat{y}_{i} \end{aligned} \]

where \(\hat{y}_{i}\) is the prediction of the \(i\)-th base model and the weights \(\beta_{i}\) are estimated by the meta-GLM.

set.seed(100)
greedyEnsemble <- caretEnsemble(multi_mod, trControl = trainControl(method = "cv", number=5))
greedyEnsemble
A glm ensemble of 2 base models: parRF, xgbTree

Ensemble results:
Generalized Linear Model 

38 samples
 2 predictor

No pre-processing
Resampling: Cross-Validated (5 fold) 
Summary of sample sizes: 30, 31, 31, 30, 30 
Resampling results:

  RMSE      Rsquared   MAE     
  58.25038  0.8224663  45.03724
greedyEnsemble$ens_model$finalModel

Call:  NULL

Coefficients:
(Intercept)        parRF      xgbTree  
   -8.46746      0.93423      0.02958  

Degrees of Freedom: 37 Total (i.e. Null);  35 Residual
Null Deviance:      703200 
Residual Deviance: 114700   AIC: 420.3
summary(greedyEnsemble)
The following models were ensembled: parRF, xgbTree 
They were weighted: 
-8.4675 0.9342 0.0296
The resulting RMSE is: 58.2504
The fit for each individual model on the RMSE is: 
  method     RMSE RMSESD
   parRF 55.73908     NA
 xgbTree 58.30165     NA
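
The weights reported above are simply the coefficients of the meta-GLM, i.e. the beta coefficients in the stacking formula, and can be extracted directly from the final model:

coef(greedyEnsemble$ens_model$finalModel)   # (Intercept), parRF, xgbTree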

Variable Importance
VI <- varImp(greedyEnsemble) 
varImp(greedyEnsemble)
         overall    parRF  xgbTree
`C2-12`  0.00000  0.00000  0.00000
`S1-12` 11.09090 10.66833 24.43856
`C1-12` 13.65903 13.66554 13.45348
`S2-12` 20.28970 20.52867 12.74124
Month   54.96037 55.13746 49.36673
VI$x <- as.character(row.names(VI))
VI2 <- reshape2::melt(VI, id.vars="x",
                      variable.name="Type",
                      value.name='VI')

ggplot(VI2, aes(x=reorder(x, VI), y=VI)) + 
  geom_bar(stat="identity", position=position_dodge(), show.legend = F) + 
  facet_wrap(.~ Type, nrow=1) +
  xlab("") +      
  ylab("") +
  coord_flip() +
  theme_bw() +
  theme(axis.text.x = element_text(size = 13),  # angle = 45, vjust = 1, hjust = 1
        axis.text.y = element_text(size = 13),
        axis.title = element_blank(),
        plot.title = element_blank(),
        legend.title = element_blank(),
        strip.text.x = element_text(size=15, color="black"))


Prediction


Trend Prediction

# Trend 
trend.part        <- data.frame(decomp.ts)$trend %>% # Only Trend of Training Data
  ts()

# Fitting ARIMA for Trend 
trend.fit.arima   <- auto.arima(trend.part)

# Forecast 
trend.arima.pred  <- forecast(trend.fit.arima, n.test)
trend.arima.pred$mean 
Time Series:
Start = 124 
End = 159 
Frequency = 1 
 [1] 1963.065 1966.790 1970.428 1974.067 1977.679 1981.291 1984.895
 [8] 1988.498 1992.099 1995.700 1999.301 2002.901 2006.501 2010.101
[15] 2013.701 2017.301 2020.901 2024.501 2028.101 2031.701 2035.301
[22] 2038.901 2042.501 2046.101 2049.701 2053.301 2056.901 2060.501
[29] 2064.101 2067.701 2071.301 2074.901 2078.501 2082.101 2085.701
[36] 2089.301
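
It is also worth checking which specification auto.arima() selected for the trend component; printing the fitted object shows the chosen order and coefficients.

trend.fit.arima    # selected ARIMA model for the trend of the training series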

Final Prediction

# Final prediction (Seasonal + Remainder + Trend)
Pred.RF        <- predict(RF.Caret, Test.X) + trend.arima.pred$mean   
Pred.XGB       <- predict(XGB.Caret, Test.X) + trend.arima.pred$mean
stack.Pred.XGB <- predict(stack.xgb, Test.X) + trend.arima.pred$mean
stack.Pred.GLM <- predict(greedyEnsemble, Test.X) + trend.arima.pred$mean
# Accuracy
acc_RF        <- accuracy(c(Pred.RF), test.ts)
acc_XGB       <- accuracy(c(Pred.XGB), test.ts)
acc_stack.XGB <- accuracy(c(stack.Pred.XGB), test.ts)
acc_stack.GLM <- accuracy(c(stack.Pred.GLM), test.ts)

acc_RF
                ME     RMSE      MAE       MPE     MAPE      ACF1
Test set -29.23913 71.90871 46.82079 -1.546831 2.423727 0.6008866
         Theil's U
Test set 0.4190109
acc_XGB
                ME     RMSE      MAE       MPE     MAPE      ACF1
Test set -27.27012 68.20048 47.95543 -1.475506 2.477208 0.5023489
         Theil's U
Test set 0.3950394
acc_stack.XGB
                ME     RMSE      MAE       MPE     MAPE      ACF1
Test set -29.89315 88.01831 65.54621 -1.617441 3.344385 0.3068341
         Theil's U
Test set  0.512367
acc_stack.GLM
                ME    RMSE      MAE      MPE    MAPE      ACF1
Test set -19.94278 66.9187 44.73519 -1.09601 2.30595 0.5677311
         Theil's U
Test set  0.388657
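
Finally, a visual comparison of the forecasts against the observed test series can be helpful. The sketch below assumes the autoplot()/autolayer() ggplot helpers from the forecast package; the start date c(2001, 4) matches the test window defined earlier.

RF.pred.ts  <- ts(Pred.RF,  start = c(2001, 4), frequency = 12)
XGB.pred.ts <- ts(Pred.XGB, start = c(2001, 4), frequency = 12)

autoplot(test.ts) +
  autolayer(RF.pred.ts,  series = "RF") +
  autolayer(XGB.pred.ts, series = "XGBoost") +
  theme_bw()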

Reuse

Text and figures are licensed under Creative Commons Attribution CC BY 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".