Package tidymodels (Ver 0.2.0)는 R에서 머신러닝(Machine Learning)을 tidyverse principle로 수행할 수 있게끔 해주는 패키지 묶음이다. 특히, 모델링에 필요한 필수 패키지들을 대부분 포함하고 있기 때문에 데이터 전처리부터 시각화, 모델링, 예측까지 모든 과정을 tidy framework로 진행할 수 있다. 또한, Package caret을 완벽하게 대체하며 보다 더 빠르고 직관적인 코드로 모델링을 수행할 수 있다.
Package tidymodels를 이용하여 여러 모형을 구축하고 비교하는 방법을 설명하기 위해 “Heart Disease Prediction” 데이터를 예제로 사용한다. 이 데이터는 환자의 심장병을 예측하기 위해 총 918명의 환자에 대한 10개의 예측변수로 이루어진 데이터이다(출처 : Package MLDataR, Gary Hutson 2021). 여기서 Target은 HeartDisease이다.

0. Schematic Diagram

1. 데이터 불러오기

# install.packages("tidymodels")
pacman::p_load("MLDataR",                                              # For Data
               "data.table", "magrittr",
               "tidymodels",
               "doParallel", "parallel")

registerDoParallel(cores=detectCores())


data(heartdisease)
data <- heartdisease %>%
  mutate(HeartDisease = ifelse(HeartDisease==0, "no", "yes"))


cols <- c("Sex", "RestingECG", "Angina", "HeartDisease")

data   <- data %>% 
  mutate_at(cols, as.factor)                                           # 범주형 변수 변환

glimpse(data)                                                          # 데이터 구조

Rows: 918
Columns: 10
$ Age              <dbl> 40, 49, 37, 48, 54, 39, 45, 54, 37, 48, 37,~
$ Sex              <fct> M, F, M, F, M, M, F, M, M, F, F, M, M, M, F~
$ RestingBP        <dbl> 140, 160, 130, 138, 150, 120, 130, 110, 140~
$ Cholesterol      <dbl> 289, 180, 283, 214, 195, 339, 237, 208, 207~
$ FastingBS        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
$ RestingECG       <fct> Normal, Normal, ST, Normal, Normal, Normal,~
$ MaxHR            <dbl> 172, 156, 98, 108, 122, 170, 170, 142, 130,~
$ Angina           <fct> N, N, N, Y, N, N, N, N, Y, N, N, Y, N, Y, N~
$ HeartPeakReading <dbl> 0.0, 1.0, 0.0, 1.5, 0.0, 0.0, 0.0, 0.0, 1.5~
$ HeartDisease     <fct> no, yes, no, yes, no, no, no, no, yes, no, ~

2. 데이터 분할

set.seed(100)                                                          # seed 고정
data.split <- initial_split(data, prop = 0.7, strata = HeartDisease)   # Partition (Traning Data : Test Data = 7:3)/ initial_split(, strata = 층화추출할 변수)NAHD.train   <- training(data.split)
HD.test    <- testing(data.split)

3. 모형 적합

예제 데이터에 Support Vector Machine, Random Forest, XGBoost를 적용하고자 한다.
먼저, Workflow를 사용하기 위해 전처리를 정의한다.

3-1. 전처리 정의

rec  <- recipe(HeartDisease ~ ., data = HD.train) %>%                  # recipe(formula, data)
  step_normalize(all_numeric_predictors()) %>%                         # 모든 수치형 예측변수들을 표준화  step_dummy(all_nominal_predictors(), one_hot = TRUE)                 # 모든 범주형 예측변수들에 대해 원-핫 인코딩 더미변수 생성NA

3-2. 모형 정의

3-2-1. Support Vector Machine

# Support Vector Machine (Kernel : Radial Basis Function)
svm.rbf.mod <- svm_rbf(cost      = tune(),                             # cost : 데이터를 잘못 분류하는 선을 긋게 될 경우 지불해야 할 costNA= tune()) %>%                         # rbf_sigma : Precision 모수(gamma = 1/2*sigma^2)
  set_mode("classification") %>%                                       # Target 유형 정의(classification /  regression)NAset_engine("kernlab")                                                # 사용하고자하는 패키지 정의(kernlab /  liquidSVM)NA# 실제 패키지에 어떻게 적용되는지 확인NAsvm.rbf.mod %>% 
  translate()

Radial Basis Function Support Vector Machine Specification (classification)

Main Arguments:
  cost = tune()
  rbf_sigma = tune()

Computational engine: kernlab 

Model fit template:
kernlab::ksvm(x = missing_arg(), data = missing_arg(), C = tune(), 
    kernel = "rbfdot", prob.model = TRUE, kpar = list(sigma = ~tune()))

3-2-2. Random Forest

# Random Forest
rf.mod <- rand_forest(mtry  = tune(),                                  # mtry : 노드를 분할할 때 랜덤하게 선택되는 후보 예측변수 개수개수
                      trees = tune(),                                  # trees : 생성하고자 하는 트리의 개수                       min_n = tune()) %>%                              # min_n(nodesize) : 터미널 노드의 최소 개수NAset_mode("classification") %>%                                       # Target 유형 정의(classification /  regression)NAset_engine("randomForest" ,                                          # 사용하고자하는 패키지 정의(randomForest / ranger / spark)NA= TRUE)

# 실제 패키지에 어떻게 적용되는지 확인NArf.mod %>%
  translate()

Random Forest Model Specification (classification)

Main Arguments:
  mtry = tune()
  trees = tune()
  min_n = tune()

Engine-Specific Arguments:
  importance = TRUE

Computational engine: randomForest 

Model fit template:
randomForest::randomForest(x = missing_arg(), y = missing_arg(), 
    mtry = min_cols(~tune(), x), ntree = tune(), nodesize = min_rows(~tune(), 
        x), importance = TRUE)

3-2-3. XGBoost

# XGBoost
xgb.mod <- boost_tree(mtry           = tune(),                         # mtry(colsample_bynode) : 노드를 분할할 때 랜덤하게 선택되는 예측변수의 개수       
                      trees          = tune(),                         # trees(nrounds) : 생성하고자 하는 트리의 개수                       tree_depth     = tune(),                         # tree_depth(max_depth) : 생성된 트리의 최대 깊이                      learn_rate     = tune(),                         # learn_rate(eta) : 학습률                      min_n          = tune(),                         # min_n(min_child_weight) : 노드를 분할하기 위해 필요한 최소 가중치 합 NA= tune(),                         # loss_reduction(gamma) : 노드 분할을 위한 최소 손실 감소값                       sample_size    = tune(),                         # sample_size(subsample) : 각 부스팅 단계에서 사용되는 훈련 데이터의 비율
                      stop_iter      = tune()) %>%                     # stop_iter(early_stop) : 조기종료를 위한 부스팅 횟수  
  set_mode("classification") %>%                                       # Target 유형 정의(classification /  regression)NAset_engine("xgboost")                                                # 사용하고자하는 패키지 정의                                       NA# 실제 패키지에 어떻게 적용되는지 확인NAxgb.mod %>% 
  translate()

Boosted Tree Model Specification (classification)

Main Arguments:
  mtry = tune()
  trees = tune()
  min_n = tune()
  tree_depth = tune()
  learn_rate = tune()
  loss_reduction = tune()
  sample_size = tune()
  stop_iter = tune()

Computational engine: xgboost 

Model fit template:
parsnip::xgb_train(x = missing_arg(), y = missing_arg(), colsample_bynode = tune(), 
    nrounds = tune(), min_child_weight = tune(), max_depth = tune(), 
    eta = tune(), gamma = tune(), subsample = tune(), early_stop = tune(), 
    nthread = 1, verbose = 0)

3-3. Workflow Set

함수 workflow_set()를 통해 전처리와 모형 조합에 대한 Workflow를 쉽게 만들 수 있다.

ml.models <- workflow_set(preproc = list(basic = rec),                 # Recipe로 정의한 전처리를 list 형태로 저장NA= list(svm  = svm.rbf.mod,           # 앞에서 정의한 모형을 list 형태로 저장NA= rf.mod,  
                                         xgb  = xgb.mod),   
                          cross = TRUE)                                # Combination (전처리, 정의한 모형)NAml.models

# A workflow set/tibble: 3 x 4
  wflow_id  info             option    result    
  <chr>     <list>           <list>    <list>    
1 basic_svm <tibble [1 x 4]> <opts[0]> <list [0]>
2 basic_rf  <tibble [1 x 4]> <opts[0]> <list [0]>
3 basic_xgb <tibble [1 x 4]> <opts[0]> <list [0]>

Result! 총 3개의 Workflow가 생성되었다.

3-4. 모수 범위 업데이트

Random Forest와 XGBoost의 mtry는 상한이 ?로 설정되어 있으므로 함수 update()를 이용하여 범위 수정을 해야 한다.

# 전처리가 적용된 데이터의 예측변수 개수가 상한이 되도록 설정정
rf.param <- extract_parameter_set_dials(rf.mod) %>%                          
  update(mtry =  mtry(c(1, 
                        ncol(select(juice(prep(rec)), -HeartDisease))  # juice(prep(rec)) : Recipe 적용 -> 전처리가 적용된 데이터셋 생성, ncol(select(., -Target)) : 전처리가 적용된 데이터의 예측변수 개수  ))) 

# rf.param %>%
#   extract_parameter_dials("mtry")

xgb.param <- extract_parameter_set_dials(xgb.mod) %>%                          
  update(mtry =  mtry(c(1, 
                        ncol(select(juice(prep(rec)), -HeartDisease))  # juice(prep(rec)) : Recipe 적용 -> 전처리가 적용된 데이터셋 생성, ncol(select(., -Target)) : 전처리가 적용된 데이터의 예측변수 개수  ))) 

# xgb.param %>%
#   extract_parameter_dials("mtry")

# RF와 XGBoost에 모수의 범위에 대한 옵션을 추가함ml.models %<>%
  option_add(param_info = rf.param, id  = "basic_rf") %>%
  option_add(param_info = xgb.param, id = "basic_xgb")

ml.models

# A workflow set/tibble: 3 x 4
  wflow_id  info             option    result    
  <chr>     <list>           <list>    <list>    
1 basic_svm <tibble [1 x 4]> <opts[0]> <list [0]>
2 basic_rf  <tibble [1 x 4]> <opts[1]> <list [0]>
3 basic_xgb <tibble [1 x 4]> <opts[1]> <list [0]>

Result! “basic_rf”와 “basic_xgb”의 option열이 opts[1]로 바뀌었다.

3-5. 모수 튜닝

함수 workflow_map()는 각 workflosw에서 동일하게 수행할 함수이다.
- 예를 들어, 모수 튜닝을 수행하기 위해 위에서 생성된 3개의 workflow에 함수 tune_grid()이 동일하게 적용되며, 이때 함수 workflow_map()을 이용한다.

set.seed(100)
ml.models.tune <- ml.models %>%
  workflow_map("tune_grid",                                                 # 위에서 정의된 각 workflow마다 함수 tune_grid 적용 //
               seed = 100, verbose = TRUE,                                  # Options to workflow_map()
               # Options to tune_grid() -> tune_grid()에 대한 인자들 지정NA= 10,                                                   # 랜덤하게 생성되는 후보 모수 집합 개수 
               resamples = vfold_cv(HD.train, v = 5),                       # 5-Fold Cross-Validation
               control   = control_grid(save_pred = TRUE,                   # Resampling의 Assessment 결과 저장NA= "everything",       # 병렬 처리(http:://tune.tidymodels.org/reference/control_grid.html) ) 
                                        save_workflow = TRUE),              # workflow가 속성의 형태로 출력에 포함NA= metric_set(roc_auc, accuracy))                     # Assessment 그룹에 대한 Assessment Measure
ml.models.tune

# A workflow set/tibble: 3 x 4
  wflow_id  info             option    result   
  <chr>     <list>           <list>    <list>   
1 basic_svm <tibble [1 x 4]> <opts[4]> <tune[+]>
2 basic_rf  <tibble [1 x 4]> <opts[5]> <tune[+]>
3 basic_xgb <tibble [1 x 4]> <opts[5]> <tune[+]>

4. 결과

# Assessment Measure
collect_metrics(ml.models.tune)

# A tibble: 60 x 9
   wflow_id  .config      preproc model .metric .estimator  mean     n
   <chr>     <fct>        <chr>   <chr> <chr>   <chr>      <dbl> <int>
 1 basic_svm Preprocesso~ recipe  svm_~ accura~ binary     0.553     5
 2 basic_svm Preprocesso~ recipe  svm_~ roc_auc binary     0.870     5
 3 basic_svm Preprocesso~ recipe  svm_~ accura~ binary     0.601     5
 4 basic_svm Preprocesso~ recipe  svm_~ roc_auc binary     0.870     5
 5 basic_svm Preprocesso~ recipe  svm_~ accura~ binary     0.553     5
 6 basic_svm Preprocesso~ recipe  svm_~ roc_auc binary     0.869     5
 7 basic_svm Preprocesso~ recipe  svm_~ accura~ binary     0.553     5
 8 basic_svm Preprocesso~ recipe  svm_~ roc_auc binary     0.869     5
 9 basic_svm Preprocesso~ recipe  svm_~ accura~ binary     0.553     5
10 basic_svm Preprocesso~ recipe  svm_~ roc_auc binary     0.870     5
# ... with 50 more rows, and 1 more variable: std_err <dbl>

Result! 총 30(Support Vector Machine : 10, Random Forest : 10, XGBoost : 10)개의 모형들에 대한 평가 척도값들을 나타낸다.

autoplot(
  ml.models.tune,
  rank_metric = "accuracy",                                             # 순위를 정렬할 Metric
  metric = "accuracy",                                                  # 어떤 Metric을 그래프로 나타낼 것인지지
  select_best = TRUE                                                    # 각 workflow에서 최적의 모수 조합들만 그래프로 나타낼 것인지 
) +
  geom_text(aes(label = wflow_id)) +
  theme_bw() +
  theme(legend.position = "none")

Result! Random Forest가 “Accuracy” 측면에서 가장 우수한 성능을 보여준다.

Caution! 함수 last_fit()은 최적의 모수 조합에 대해 Training Data를 이용한 모형 적합과 Test Data에 대한 예측을 한 번에 수행할 수 있지만 seed 고정이 되지 않아 Reproducibility (재생산성)가 만족되지 않는다. 따라서, 모형 적합(함수 fit())과 예측(함수 augment())을 각각 수행하였다.

4-1. Support Vector Machine 결과

# 최적의 모수 조합NAsvm.tune.result <- extract_workflow_set_result(ml.models.tune, id = "basic_svm") %>%
  select_best(metric = "accuracy")                                     # select_best(metric = "roc_auc")
svm.tune.result

# A tibble: 1 x 3
   cost rbf_sigma .config              
  <dbl>     <dbl> <fct>                
1  2.95   0.00460 Preprocessor1_Model06

Result! cost = 2.95, rbf_sigma = 0.00460일 때 “Accuracy” 측면에서 가장 우수한 성능을 보여준다.

# 최적의 모수 조합을 이용한 모형 적합NAset.seed(100)
svm.result <- ml.models.tune %>%
  extract_workflow("basic_svm") %>%
  finalize_workflow(svm.tune.result) %>%
  fit(data = HD.train)

# 최종 모형NAsvm.result %>%
  extract_fit_engine()

Support Vector Machine object of class "ksvm" 

SV type: C-svc  (classification) 
 parameter : cost C = 2.95390218341127 

Gaussian Radial Basis kernel function. 
 Hyperparameter : sigma =  0.00459764397919072 

Number of Support Vectors : 320 

Objective Function Value : -875.3868 
Training error : 0.193146 
Probability model included.

# 예측
augment(svm.result, HD.test)

# A tibble: 276 x 13
     Age Sex   RestingBP Cholesterol FastingBS RestingECG MaxHR Angina
   <dbl> <fct>     <dbl>       <dbl>     <dbl> <fct>      <dbl> <fct> 
 1    54 M           110         208         0 Normal       142 N     
 2    37 M           140         207         0 Normal       130 Y     
 3    37 F           130         211         0 Normal       142 N     
 4    39 M           120         204         0 Normal       145 N     
 5    49 M           140         234         0 Normal       140 Y     
 6    42 F           115         211         0 ST           137 N     
 7    60 M           100         248         0 Normal       125 N     
 8    36 M           120         267         0 Normal       160 N     
 9    43 F           100         223         0 Normal       142 N     
10    36 M           130         209         0 Normal       178 N     
# ... with 266 more rows, and 5 more variables:
#   HeartPeakReading <dbl>, HeartDisease <fct>, .pred_class <fct>,
#   .pred_no <dbl>, .pred_yes <dbl>

4-2. Random Forest 결과

# 최적의 모수 조합NArf.tune.result <- extract_workflow_set_result(ml.models.tune, id = "basic_rf") %>%
  select_best(metric = "accuracy")                                     # select_best(metric = "roc_auc")
rf.tune.result

# A tibble: 1 x 4
   mtry trees min_n .config              
  <int> <int> <int> <fct>                
1    13  1372     9 Preprocessor1_Model04

Result! mtry = 13, trees = 1372, min_n = 9일 때 “Accuracy” 측면에서 가장 우수한 성능을 보여준다.

# 최적의 모수 조합을 이용한 모형 적합NAset.seed(100)
rf.result <- ml.models.tune %>%
  extract_workflow("basic_rf") %>%
  finalize_workflow(rf.tune.result) %>%
  fit(data = HD.train)

# 최종 모형NArf.result %>%
  extract_fit_engine()


Call:
 randomForest(x = maybe_data_frame(x), y = y, ntree = ~1372L,      mtry = min_cols(~13L, x), nodesize = min_rows(~9L, x), importance = ~TRUE) 
               Type of random forest: classification
                     Number of trees: 1372
No. of variables tried at each split: 13

        OOB estimate of  error rate: 17.76%
Confusion matrix:
     no yes class.error
no  231  56   0.1951220
yes  58 297   0.1633803

# 예측
augment(rf.result, HD.test)

# A tibble: 276 x 13
     Age Sex   RestingBP Cholesterol FastingBS RestingECG MaxHR Angina
   <dbl> <fct>     <dbl>       <dbl>     <dbl> <fct>      <dbl> <fct> 
 1    54 M           110         208         0 Normal       142 N     
 2    37 M           140         207         0 Normal       130 Y     
 3    37 F           130         211         0 Normal       142 N     
 4    39 M           120         204         0 Normal       145 N     
 5    49 M           140         234         0 Normal       140 Y     
 6    42 F           115         211         0 ST           137 N     
 7    60 M           100         248         0 Normal       125 N     
 8    36 M           120         267         0 Normal       160 N     
 9    43 F           100         223         0 Normal       142 N     
10    36 M           130         209         0 Normal       178 N     
# ... with 266 more rows, and 5 more variables:
#   HeartPeakReading <dbl>, HeartDisease <fct>, .pred_class <fct>,
#   .pred_no <dbl>, .pred_yes <dbl>

4-3. XGBoost 결과

# 최적의 모수 조합NAxgb.tune.result <- extract_workflow_set_result(ml.models.tune, id = "basic_xgb") %>%
  select_best(metric = "accuracy")                                     # select_best(metric = "roc_auc")
xgb.tune.result

# A tibble: 1 x 9
   mtry trees min_n tree_depth learn_rate loss_reduction sample_size
  <int> <int> <int>      <int>      <dbl>          <dbl>       <dbl>
1    12   881     2          9    0.00250           1.88       0.628
# ... with 2 more variables: stop_iter <int>, .config <fct>

Result! mtry = 12, trees = 881,min_n = 2, tree_depth = 9, learn_rate = 0.0025, loss_reduction = 1.88, sample_size = 0.628, stop_iter = 10일 때 “Accuracy” 측면에서 가장 우수한 성능을 보여준다.

# 최적의 모수 조합을 이용한 모형 적합NAset.seed(100)
xgb.result <- ml.models.tune %>%
  extract_workflow("basic_xgb") %>%
  finalize_workflow(xgb.tune.result) %>%
  fit(data = HD.train)

# 최종 모형NAxgb.result %>%
  extract_fit_engine()

##### xgb.Booster
raw: 20.1 Kb 
call:
  xgboost::xgb.train(params = list(eta = 0.00250467484632837, max_depth = 9L, 
    gamma = 1.87588926862566, colsample_bytree = 1, colsample_bynode = 0.923076923076923, 
    min_child_weight = 2L, subsample = 0.628499997311737, objective = "binary:logistic"), 
    data = x$data, nrounds = 881L, watchlist = x$watchlist, verbose = 0, 
    early_stopping_rounds = 10L, nthread = 1)
params (as set within xgb.train):
  eta = "0.00250467484632837", max_depth = "9", gamma = "1.87588926862566", colsample_bytree = "1", colsample_bynode = "0.923076923076923", min_child_weight = "2", subsample = "0.628499997311737", objective = "binary:logistic", nthread = "1", silent = "1"
xgb.attributes:
  best_iteration, best_msg, best_ntreelimit, best_score, niter
callbacks:
  cb.evaluation.log()
  cb.early.stop(stopping_rounds = early_stopping_rounds, maximize = maximize, 
    verbose = verbose)
# of features: 13 
niter: 15
best_iteration : 5 
best_ntreelimit : 5 
best_score : 0.138629 
nfeatures : 13 
evaluation_log:
    iter training_error
       1       0.190031
       2       0.177570
---                    
      14       0.152648
      15       0.154206

# 예측
augment(xgb.result, HD.test)

# A tibble: 276 x 13
     Age Sex   RestingBP Cholesterol FastingBS RestingECG MaxHR Angina
   <dbl> <fct>     <dbl>       <dbl>     <dbl> <fct>      <dbl> <fct> 
 1    54 M           110         208         0 Normal       142 N     
 2    37 M           140         207         0 Normal       130 Y     
 3    37 F           130         211         0 Normal       142 N     
 4    39 M           120         204         0 Normal       145 N     
 5    49 M           140         234         0 Normal       140 Y     
 6    42 F           115         211         0 ST           137 N     
 7    60 M           100         248         0 Normal       125 N     
 8    36 M           120         267         0 Normal       160 N     
 9    43 F           100         223         0 Normal       142 N     
10    36 M           130         209         0 Normal       178 N     
# ... with 266 more rows, and 5 more variables:
#   HeartPeakReading <dbl>, HeartDisease <fct>, .pred_class <fct>,
#   .pred_no <dbl>, .pred_yes <dbl>

Comparison Various Models based on Tidymodels