Machine Learning Application Method for Time Series Data
This post summarizes how to apply tree-based machine learning techniques to time series data.
A drawback is sensitivity to outliers, and tree-based machine learning techniques cannot capture a trend.
Therefore, decomposition is used and the tree-based method is applied to the detrended data, that is, the series with the trend removed.
Final forecast = machine learning forecast + trend forecast
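In symbols (my notation, following the decomposition described above): the machine learning model forecasts the detrended part, i.e. the sum of the seasonal and remainder components, a separate time series model forecasts the trend, and the two forecasts are added back together.
\[ \hat{Y}_{t+h} = \underbrace{\widehat{(S + R)}_{t+h}}_{\text{machine learning}} + \underbrace{\hat{T}_{t+h}}_{\text{trend model}} \]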
pacman::p_load( "dplyr", "xts",
"caret", "caretEnsemble",
"forecast",
"ggplot2")
library(doParallel)
library(parallel)
# cl <- makePSOCKcluster(detectCores())
# clusterEvalQ(cl, library(foreach))
# registerDoParallel(cores=cl)
registerDoParallel(cores=detectCores())
# On a Mac
# guess_encoding("Amtrak.csv")
# Amtrak.data <- read.csv("Amtrak.csv", fileEncoding="EUC-KR")
Amtrak.data <- read.csv("C:/Users/User/Desktop/Amtrak.csv")
ridership.ts <- ts(Amtrak.data$Ridership, start=c(1991,1), end=c(2004,3), freq=12)
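The code that splits the series into training and test sets is not shown above, although train.ts, test.ts, and n.test are used throughout. A minimal sketch, assuming the last 36 months (April 2001 to March 2004) are held out so that the training set contains the 123 observations reported in the model summaries below:
# Hypothetical train/test split (assumption; not part of the original code)
n.test   <- 36
train.ts <- window(ridership.ts, start = c(1991, 1), end = c(2001, 3))   # 123 observations
test.ts  <- window(ridership.ts, start = c(2001, 4))                     # 36 observations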
# 1. Fourier Term
FT      <- fourier(train.ts, K = 2)             # K: number of sine/cosine pairs; with two or more seasonal periods, one K is needed per period
FT.Test <- fourier(train.ts, K = 2, h = n.test)
# 2. Month
xts(ridership.ts, order.by = as.Date(ridership.ts))
[,1]
1991-01-01 1708.917
1991-02-01 1620.586
1991-03-01 1972.715
1991-04-01 1811.665
1991-05-01 1974.964
1991-06-01 1862.356
1991-07-01 1939.860
1991-08-01 2013.264
1991-09-01 1595.657
1991-10-01 1724.924
1991-11-01 1675.667
1991-12-01 1813.863
1992-01-01 1614.827
1992-02-01 1557.088
1992-03-01 1891.223
1992-04-01 1955.981
1992-05-01 1884.714
1992-06-01 1623.042
1992-07-01 1903.309
1992-08-01 1996.712
1992-09-01 1703.897
1992-10-01 1810.000
1992-11-01 1861.601
1992-12-01 1875.122
1993-01-01 1705.259
1993-02-01 1618.535
1993-03-01 1836.709
1993-04-01 1957.043
1993-05-01 1917.185
1993-06-01 1882.398
1993-07-01 1933.009
1993-08-01 1996.167
1993-09-01 1672.841
1993-10-01 1752.827
1993-11-01 1720.377
1993-12-01 1734.292
1994-01-01 1563.365
1994-02-01 1573.959
1994-03-01 1902.639
1994-04-01 1833.888
1994-05-01 1831.049
1994-06-01 1775.755
1994-07-01 1867.508
1994-08-01 1906.608
1994-09-01 1685.632
1994-10-01 1778.546
1994-11-01 1775.995
1994-12-01 1783.350
1995-01-01 1548.415
1995-02-01 1496.925
1995-03-01 1798.316
1995-04-01 1732.895
1995-05-01 1772.345
1995-06-01 1761.207
1995-07-01 1791.655
1995-08-01 1874.820
1995-09-01 1571.309
1995-10-01 1646.948
1995-11-01 1672.631
1995-12-01 1656.845
1996-01-01 1381.758
1996-02-01 1360.852
1996-03-01 1558.575
1996-04-01 1608.420
1996-05-01 1696.696
1996-06-01 1693.183
1996-07-01 1835.516
1996-08-01 1942.573
1996-09-01 1551.401
1996-10-01 1686.508
1996-11-01 1576.204
1996-12-01 1700.433
1997-01-01 1396.588
1997-02-01 1371.690
1997-03-01 1707.522
1997-04-01 1654.604
1997-05-01 1762.903
1997-06-01 1775.800
1997-07-01 1934.219
1997-08-01 2008.055
1997-09-01 1615.924
1997-10-01 1773.910
1997-11-01 1732.368
1997-12-01 1796.626
1998-01-01 1570.330
1998-02-01 1412.691
1998-03-01 1754.641
1998-04-01 1824.932
1998-05-01 1843.289
1998-06-01 1825.964
1998-07-01 1968.172
1998-08-01 1921.645
1998-09-01 1669.597
1998-10-01 1791.474
1998-11-01 1816.714
1998-12-01 1846.754
1999-01-01 1599.427
1999-02-01 1548.804
1999-03-01 1832.333
1999-04-01 1839.720
1999-05-01 1846.498
1999-06-01 1864.852
1999-07-01 1965.743
1999-08-01 1949.002
1999-09-01 1607.373
1999-10-01 1803.664
1999-11-01 1850.309
1999-12-01 1836.435
2000-01-01 1541.660
2000-02-01 1616.928
2000-03-01 1919.538
2000-04-01 1971.493
2000-05-01 1992.301
2000-06-01 2009.763
2000-07-01 2053.996
2000-08-01 2097.471
2000-09-01 1823.706
2000-10-01 1976.997
2000-11-01 1981.408
2000-12-01 2000.153
2001-01-01 1683.148
2001-02-01 1663.404
2001-03-01 2007.928
2001-04-01 2023.792
2001-05-01 2047.008
2001-06-01 2072.913
2001-07-01 2126.717
2001-08-01 2202.638
2001-09-01 1707.693
2001-10-01 1950.716
2001-11-01 1973.614
2001-12-01 1984.729
2002-01-01 1759.629
2002-02-01 1770.595
2002-03-01 2019.912
2002-04-01 2048.398
2002-05-01 2068.763
2002-06-01 1994.267
2002-07-01 2075.258
2002-08-01 2026.560
2002-09-01 1734.155
2002-10-01 1916.771
2002-11-01 1858.345
2002-12-01 1996.352
2003-01-01 1778.033
2003-02-01 1749.489
2003-03-01 2066.466
2003-04-01 2098.899
2003-05-01 2104.911
2003-06-01 2129.671
2003-07-01 2223.349
2003-08-01 2174.360
2003-09-01 1931.406
2003-10-01 2121.470
2003-11-01 2076.054
2003-12-01 2140.677
2004-01-01 1831.508
2004-02-01 1838.006
2004-03-01 2132.446
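The construction of the predictor matrices Train.X and Test.X is also not shown, but the fitted models report five predictors named Month, S1-12, C1-12, S2-12, and C2-12. A minimal sketch, assuming the month index plus the Fourier terms above are used as features (the exact construction is my assumption, not the original code):
# Hypothetical predictor construction (assumption; not part of the original code)
Train.X <- data.frame(Month = as.numeric(cycle(train.ts)), FT,      check.names = FALSE)
Test.X  <- data.frame(Month = as.numeric(cycle(test.ts)),  FT.Test, check.names = FALSE)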
For a series with multiple seasonal periods, mstl() can be used instead. Create a training dataset whose target is the sum of the seasonal and remainder components.
decomp.ts <- stl(train.ts, s.window = "periodic", robust = TRUE)$time.series
# decomp.ts <- mstl(Power.msts, s.window = "periodic", robust = TRUE)  # For multiple seasonal periods
# Target without Trend
Target <- decomp.ts %>%
  data.frame() %>%
  rowwise() %>%                                     # Work row by row
  dplyr::mutate(y = sum(seasonal, remainder)) %>%   # Target = Seasonal + Remainder => Detrended
  dplyr::select(y)
Train.Data <- cbind(Target, Train.X)
Using the caret package, the two most representative methods, Random Forest and eXtreme Gradient Boosting (XGBoost), were applied, and using the caretEnsemble package, the ensemble technique stacking was applied as well.
The tuning parameter of Random Forest is mtry, the number of candidate predictors randomly selected each time a tree is split.
set.seed(100)
fitControl <- trainControl(method = "adaptive_cv", # cv, repeatedcv
number = 5,
repeats = 5,
adaptive = list(min = 5,
alpha = 0.05,
method = "BT",
complete = TRUE),
search = "random",
allowParallel = TRUE)
RF <- function(train, tuneLength, ntree = 500, nodesize = 5){
  set.seed(100)                                  # Fix seed for cross-validation
  caret.rf <- caret::train(y ~ ., data = train,
                           method = "parRF",     # Tuning parameter: mtry
                           trControl = fitControl,
                           tuneLength = tuneLength,
                           ntree = ntree,
                           nodesize = nodesize,  # nodesize: minimum size of terminal nodes
                           importance = TRUE)
  return(caret.rf)
}
RF.Caret <- RF(Train.Data,
               tuneLength = 2,   # tuneLength: number of candidate parameter sets to search
               ntree = 100)      # ntree: number of trees to grow
RF.Caret
Parallel Random Forest
123 samples
5 predictor
No pre-processing
Resampling: Adaptively Cross-Validated (5 fold, repeated 5 times)
Summary of sample sizes: 99, 99, 97, 99, 98, 99, ...
Resampling results across tuning parameters:
mtry RMSE Rsquared MAE Resamples
1 52.87737 0.8892022 40.21114 25
3 50.21355 0.8949560 38.53876 25
RMSE was used to select the optimal model using the smallest value.
The final value used for the model was mtry = 3.
RF.Caret$finalModel
Call:
randomForest(x = "x", y = "y", ntree = 13, mtry = 3L, nodesize = 5, importance = TRUE)
Type of random forest: regression
Number of trees: 104
No. of variables tried at each split: 3
RF.Caret$finalModel$tuneValue
mtry
2 3
The tuning parameters of XGBoost are:

- nrounds: number of boosting iterations
- max_depth: maximum depth of a tree
- eta: learning rate
- gamma: minimum loss reduction required to make a split; the larger it is, the less easily splits occur
- colsample_bytree: proportion of predictors sampled when building each tree
- min_child_weight: minimum sum of observation weights required in a leaf node
- subsample: proportion of the data used to build the model; 1 uses all of the data

set.seed(100)
fitControl <- trainControl(method = "adaptive_cv", # cv, repeatedcv
number = 5,
repeats = 5,
adaptive = list(min = 5,
alpha = 0.05,
method = "BT",
complete = TRUE),
search = "random",
allowParallel = TRUE)
XGBoost <- function(train, tuneLength){
  set.seed(100)                         # Fix seed for cross-validation
  caret.xgb <- caret::train(y ~ ., data = train,
                            method = "xgbTree",
                            trControl = fitControl,
                            # objective = "reg:squarederror",  # error: "The following parameters were provided multiple times"
                            tuneLength = tuneLength)           # tuneLength: number of candidate parameter sets to search
  return(caret.xgb)
}
XGB.Caret <- XGBoost(Train.Data, 2)
XGB.Caret
eXtreme Gradient Boosting
123 samples
5 predictor
No pre-processing
Resampling: Adaptively Cross-Validated (5 fold, repeated 5 times)
Summary of sample sizes: 99, 99, 97, 99, 98, 99, ...
Resampling results across tuning parameters:
eta max_depth gamma colsample_bytree min_child_weight
0.2228220 9 1.702621 0.6528662 3
0.4876292 3 5.465586 0.5499986 5
subsample nrounds RMSE Rsquared MAE Resamples
0.7517663 624 50.21422 0.8931412 38.60326 25
0.8219133 358 50.06911 0.8961369 38.41744 25
RMSE was used to select the optimal model using the smallest value.
The final values used for the model were nrounds = 358, max_depth
= 3, eta = 0.4876292, gamma = 5.465586, colsample_bytree
= 0.5499986, min_child_weight = 5 and subsample = 0.8219133.
XGB.Caret$finalModel
##### xgb.Booster
raw: 181.9 Kb
call:
xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
gamma = param$gamma, colsample_bytree = param$colsample_bytree,
min_child_weight = param$min_child_weight, subsample = param$subsample),
data = x, nrounds = param$nrounds, objective = "reg:linear")
params (as set within xgb.train):
eta = "0.48762916797353", max_depth = "3", gamma = "5.46558595029637", colsample_bytree = "0.549998590815812", min_child_weight = "5", subsample = "0.82191331172362", objective = "reg:linear", silent = "1"
xgb.attributes:
niter
callbacks:
cb.print.evaluation(period = print_every_n)
# of features: 5
niter: 358
nfeatures : 5
xNames : Month `S1-12` `C1-12` `S2-12` `C2-12`
problemType : Regression
tuneValue :
nrounds max_depth eta gamma colsample_bytree
2 358 3 0.4876292 5.465586 0.5499986
min_child_weight subsample
2 5 0.8219133
obsLevels : NA
param :
list()
XGB.Caret$finalModel$tuneValue
nrounds max_depth eta gamma colsample_bytree
2 358 3 0.4876292 5.465586 0.5499986
min_child_weight subsample
2 5 0.8219133
Stacking is composed of several different machine learning algorithms: predictions are generated with the different algorithms and then combined by a final model. Using caretList() from the caretEnsemble package, the individual models are declared, and the final model is declared with caretStack().
# Ref. https://cran.r-project.org/web/packages/caretEnsemble/vignettes/caretEnsemble-intro.html
# https://github.com/zachmayer/caretEnsemble/blob/master/R/caretStack.R
# 1. Modeling for Stacking (Declare Individual Model)
set.seed(100)
fitControl <- trainControl(method = "repeatedcv", # adaptive_cv
number = 5,
repeats = 5,
# adaptive = list(min = 5,
# alpha = 0.05,
# method = "BT",
# complete = TRUE),
# search = "random", # grid
savePredictions = "final", # 최적 모수에 대한 예측 저장NA# classProbs = TRUE, # 각 클래스에 대한 확률 저장(Classification))
index = createResample(Train.Data$y, 1), # index : 각 resapling에 대한 요소, Training에 사용되는 행 번호/ createResample : 붓스트랩스트랩
allowParallel = TRUE)
# Hyperparameter values obtained by tuning on the original training data
alg_tune_list <- list( # Do not use custom names in list. Will give prediction error with greedy ensemble. Bug in caret.
parRF = caretModelSpec(method="parRF",
importance = TRUE,
nodesize = 5,
tuneGrid = expand.grid(mtry=RF.Caret$finalModel$tuneValue$mtry)),
xgbTree = caretModelSpec(method="xgbTree",
tuneGrid = expand.grid(nrounds = XGB.Caret$finalModel$tuneValue$nrounds,
max_depth = XGB.Caret$finalModel$tuneValue$max_depth,
eta = XGB.Caret$finalModel$tuneValue$eta,
gamma = XGB.Caret$finalModel$tuneValue$gamma,
colsample_bytree = XGB.Caret$finalModel$tuneValue$colsample_bytree,
min_child_weight = XGB.Caret$finalModel$tuneValue$min_child_weight,
subsample = XGB.Caret$finalModel$tuneValue$subsample)))
set.seed(100)
multi_mod <- caretList(y~., data = Train.Data, trControl = fitControl,
tuneList = alg_tune_list) # search = "grid"
multi_mod$parRF
Parallel Random Forest
123 samples
5 predictor
No pre-processing
Resampling: Cross-Validated (5 fold, repeated 5 times)
Summary of sample sizes: 123
Resampling results:
RMSE Rsquared MAE
55.73908 0.8368916 42.4932
Tuning parameter 'mtry' was held constant at a value of 3
multi_mod$xgbTree
eXtreme Gradient Boosting
123 samples
5 predictor
No pre-processing
Resampling: Cross-Validated (5 fold, repeated 5 times)
Summary of sample sizes: 123
Resampling results:
RMSE Rsquared MAE
58.30165 0.827723 45.50672
Tuning parameter 'nrounds' was held constant at a value of 358
Tuning parameter 'max_depth' was held constant at a value of 3
Tuning parameter 'eta' was held constant at a value of 0.4876292
Tuning parameter 'gamma' was held constant at a value of 5.465586
Tuning parameter 'colsample_bytree' was held constant at a value of 0.5499986
Tuning parameter 'min_child_weight' was held constant at a value of 5
Tuning parameter 'subsample' was held constant at a value of 0.8219133
# 2. Stacking (the final model uses the combined predictions of the individual models as its training data)
set.seed(100)
stackControl <- trainControl(method = "adaptive_cv",
number = 5,
repeats = 5,
adaptive = list(min = 5,
alpha = 0.05,
method = "BT",
complete = TRUE),
search = "random",
allowParallel = TRUE)
set.seed(100)
stack.xgb <- caretStack(multi_mod, method = "xgbTree", # Final Model
trControl = stackControl,
tuneLength = 2)          # Number of candidate parameter sets
stack.xgb
A xgbTree ensemble of 2 base models: parRF, xgbTree
Ensemble results:
eXtreme Gradient Boosting
38 samples
2 predictor
No pre-processing
Resampling: Adaptively Cross-Validated (5 fold, repeated 5 times)
Summary of sample sizes: 30, 31, 31, 30, 30, 30, ...
Resampling results across tuning parameters:
eta max_depth gamma colsample_bytree min_child_weight
0.2228220 9 1.702621 0.6528662 3
0.4876292 3 5.465586 0.5499986 5
subsample nrounds RMSE Rsquared MAE Resamples
0.7517663 624 48.62987 0.929678 38.75318 25
0.8219133 358 57.27670 0.924131 43.13093 25
RMSE was used to select the optimal model using the smallest value.
The final values used for the model were nrounds = 624, max_depth
= 9, eta = 0.222822, gamma = 1.702621, colsample_bytree =
0.6528662, min_child_weight = 3 and subsample = 0.7517663.
stack.xgb$ens_model$finalModel
##### xgb.Booster
raw: 357.5 Kb
call:
xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
gamma = param$gamma, colsample_bytree = param$colsample_bytree,
min_child_weight = param$min_child_weight, subsample = param$subsample),
data = x, nrounds = param$nrounds, objective = "reg:linear")
params (as set within xgb.train):
eta = "0.222822001739405", max_depth = "9", gamma = "1.70262051047757", colsample_bytree = "0.652866207249463", min_child_weight = "3", subsample = "0.751766284287442", objective = "reg:linear", silent = "1"
xgb.attributes:
niter
callbacks:
cb.print.evaluation(period = print_every_n)
# of features: 2
niter: 624
nfeatures : 2
xNames : parRF xgbTree
problemType : Regression
tuneValue :
nrounds max_depth eta gamma colsample_bytree
1 624 9 0.222822 1.702621 0.6528662
min_child_weight subsample
1 3 0.7517663
obsLevels : NA
param :
list()
stack.xgb$ens_model$finalModel$tuneValue
nrounds max_depth eta gamma colsample_bytree
1 624 9 0.222822 1.702621 0.6528662
min_child_weight subsample
1 3 0.7517663
If caretEnsemble() from the caretEnsemble package is used instead, the final model is a GLM; the corresponding R code can be checked here. The stacked prediction is a weighted linear combination of the base-model predictions:
\[ \begin{aligned} \hat{Y} = \beta_{0} + \sum^{m}_{i=1} \beta_{i} \hat{y}_{i} \end{aligned} \]
set.seed(100)
greedyEnsemble <- caretEnsemble(multi_mod, trControl = trainControl(method = "cv", number=5))
greedyEnsemble
A glm ensemble of 2 base models: parRF, xgbTree
Ensemble results:
Generalized Linear Model
38 samples
2 predictor
No pre-processing
Resampling: Cross-Validated (5 fold)
Summary of sample sizes: 30, 31, 31, 30, 30
Resampling results:
RMSE Rsquared MAE
58.25038 0.8224663 45.03724
greedyEnsemble$ens_model$finalModel
Call: NULL
Coefficients:
(Intercept) parRF xgbTree
-8.46746 0.93423 0.02958
Degrees of Freedom: 37 Total (i.e. Null); 35 Residual
Null Deviance: 703200
Residual Deviance: 114700 AIC: 420.3
summary(greedyEnsemble)
The following models were ensembled: parRF, xgbTree
They were weighted:
-8.4675 0.9342 0.0296
The resulting RMSE is: 58.2504
The fit for each individual model on the RMSE is:
method RMSE RMSESD
parRF 55.73908 NA
xgbTree 58.30165 NA
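To make concrete what the GLM ensemble does, the stacked prediction can be reconstructed by hand from the base-model predictions and the coefficients above. A small sketch (assuming the hypothetical Test.X from earlier; not part of the original post):
# Manual reconstruction of the GLM-ensemble prediction (illustration only)
pred_parRF <- predict(multi_mod$parRF,   newdata = Test.X)
pred_xgb   <- predict(multi_mod$xgbTree, newdata = Test.X)
cf <- coef(greedyEnsemble$ens_model$finalModel)   # (Intercept), parRF, xgbTree
manual_pred <- cf["(Intercept)"] + cf["parRF"] * pred_parRF + cf["xgbTree"] * pred_xgb
# manual_pred should closely match predict(greedyEnsemble, Test.X)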
VI <- varImp(greedyEnsemble)
varImp(greedyEnsemble)
overall parRF xgbTree
`C2-12` 0.00000 0.00000 0.00000
`S1-12` 11.09090 10.66833 24.43856
`C1-12` 13.65903 13.66554 13.45348
`S2-12` 20.28970 20.52867 12.74124
Month 54.96037 55.13746 49.36673
VI$x <- as.character(row.names(VI))
VI2 <- reshape2::melt(VI, id.vars="x",
variable.name="Type",
value.name='VI')
ggplot(VI2, aes(x=reorder(x, VI), y=VI)) +
geom_bar(stat="identity", position=position_dodge(), show.legend = F) +
facet_wrap(.~ Type, nrow=1) +
xlab("") +
ylab("") +
coord_flip() +
theme_bw() +
theme(axis.text.x = element_text(size = 13), # angle = 45, vjust = 1, hjust = 1
axis.text.y = element_text(size = 13),
axis.title = element_blank(),
plot.title = element_blank(),
legend.title = element_blank(),
strip.text.x = element_text(size=15, color="black"))
# Trend
trend.part <- data.frame(decomp.ts)$trend %>% # Only Trend of Training Data
ts()
# Fitting ARIMA for Trend
trend.fit.arima <- auto.arima(trend.part)
# Forecast
trend.arima.pred <- forecast(trend.fit.arima, n.test)
trend.arima.pred$mean
Time Series:
Start = 124
End = 159
Frequency = 1
[1] 1963.065 1966.790 1970.428 1974.067 1977.679 1981.291 1984.895
[8] 1988.498 1992.099 1995.700 1999.301 2002.901 2006.501 2010.101
[15] 2013.701 2017.301 2020.901 2024.501 2028.101 2031.701 2035.301
[22] 2038.901 2042.501 2046.101 2049.701 2053.301 2056.901 2060.501
[29] 2064.101 2067.701 2071.301 2074.901 2078.501 2082.101 2085.701
[36] 2089.301
# Final forecast (Seasonal + Remainder + Trend)
Pred.RF <- predict(RF.Caret, Test.X) + trend.arima.pred$mean
Pred.XGB <- predict(XGB.Caret, Test.X) + trend.arima.pred$mean
stack.Pred.XGB <- predict(stack.xgb, Test.X) + trend.arima.pred$mean
stack.Pred.GLM <- predict(greedyEnsemble, Test.X) + trend.arima.pred$mean
# Accuracy
acc_RF <- accuracy(c(Pred.RF), test.ts)
acc_XGB <- accuracy(c(Pred.XGB), test.ts)
acc_stack.XGB <- accuracy(c(stack.Pred.XGB), test.ts)
acc_stack.GLM <- accuracy(c(stack.Pred.GLM), test.ts)
acc_RF
ME RMSE MAE MPE MAPE ACF1
Test set -29.23913 71.90871 46.82079 -1.546831 2.423727 0.6008866
Theil's U
Test set 0.4190109
acc_XGB
ME RMSE MAE MPE MAPE ACF1
Test set -27.27012 68.20048 47.95543 -1.475506 2.477208 0.5023489
Theil's U
Test set 0.3950394
acc_stack.XGB
ME RMSE MAE MPE MAPE ACF1
Test set -29.89315 88.01831 65.54621 -1.617441 3.344385 0.3068341
Theil's U
Test set 0.512367
acc_stack.GLM
ME RMSE MAE MPE MAPE ACF1
Test set -19.94278 66.9187 44.73519 -1.09601 2.30595 0.5677311
Theil's U
Test set 0.388657
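For a side-by-side comparison, the accuracy matrices can be gathered into a single table (a small convenience sketch, not in the original post):
# Collect the test-set error measures of all four models into one table
data.frame(Model = c("RF", "XGBoost", "Stacking (XGBoost)", "Stacking (GLM)"),
           RMSE  = c(acc_RF[, "RMSE"], acc_XGB[, "RMSE"], acc_stack.XGB[, "RMSE"], acc_stack.GLM[, "RMSE"]),
           MAE   = c(acc_RF[, "MAE"],  acc_XGB[, "MAE"],  acc_stack.XGB[, "MAE"],  acc_stack.GLM[, "MAE"]),
           MAPE  = c(acc_RF[, "MAPE"], acc_XGB[, "MAPE"], acc_stack.XGB[, "MAPE"], acc_stack.GLM[, "MAPE"]))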