Cross Validation and Out-of-Bag

Machine Learning

Cross Validation and Out-of-Bag

Yeongeun Jeon , Jeongwook Lee , Jung In Seo
09-22-2020

예제로 사용될 데이터는 R 패키지 "kernlab"에 내장되어 있는 “SPAM” 데이터이다. 미국 캘리포니아 Palo Alto 지역에서 우체국과 개개인으로부터 spam mail을 수집하였는데, 여기서 spam mail은 수신자의 의사와 상관없이 전송되는 불필요한 광고성 e-mail이다. e-mail에 포함된 단어, 특수문자, 대문자의 빈도로 spam mail을 분류하였으며, 총 4601개의 관측치와 58개의 변수로 이루어져 있다.


1. 데이터 불러오기

# Load every package this tutorial needs in a single call.
# The order is kept as-is because it determines which package masks which.
pacman::p_load(
  char = c(
    "caret",           # train(), trainControl(), createDataPartition()
    "kernlab",         # provides the spam data set
    "dplyr",           # glimpse()
    "microbenchmark",  # microbenchmark()
    "ggplot2"          # ggplot()
  )
)



# Bring the spam data set (shipped with kernlab) into the workspace.
data("spam", package = "kernlab")

# Show one line per column: name, type and the first few values.
dplyr::glimpse(spam)
Rows: 4,601
Columns: 58
$ make              <dbl> 0.00, 0.21, 0.06, 0.00, 0.00, 0.00, 0.00, ~
$ address           <dbl> 0.64, 0.28, 0.00, 0.00, 0.00, 0.00, 0.00, ~
$ all               <dbl> 0.64, 0.50, 0.71, 0.00, 0.00, 0.00, 0.00, ~
$ num3d             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ our               <dbl> 0.32, 0.14, 1.23, 0.63, 0.63, 1.85, 1.92, ~
$ over              <dbl> 0.00, 0.28, 0.19, 0.00, 0.00, 0.00, 0.00, ~
$ remove            <dbl> 0.00, 0.21, 0.19, 0.31, 0.31, 0.00, 0.00, ~
$ internet          <dbl> 0.00, 0.07, 0.12, 0.63, 0.63, 1.85, 0.00, ~
$ order             <dbl> 0.00, 0.00, 0.64, 0.31, 0.31, 0.00, 0.00, ~
$ mail              <dbl> 0.00, 0.94, 0.25, 0.63, 0.63, 0.00, 0.64, ~
$ receive           <dbl> 0.00, 0.21, 0.38, 0.31, 0.31, 0.00, 0.96, ~
$ will              <dbl> 0.64, 0.79, 0.45, 0.31, 0.31, 0.00, 1.28, ~
$ people            <dbl> 0.00, 0.65, 0.12, 0.31, 0.31, 0.00, 0.00, ~
$ report            <dbl> 0.00, 0.21, 0.00, 0.00, 0.00, 0.00, 0.00, ~
$ addresses         <dbl> 0.00, 0.14, 1.75, 0.00, 0.00, 0.00, 0.00, ~
$ free              <dbl> 0.32, 0.14, 0.06, 0.31, 0.31, 0.00, 0.96, ~
$ business          <dbl> 0.00, 0.07, 0.06, 0.00, 0.00, 0.00, 0.00, ~
$ email             <dbl> 1.29, 0.28, 1.03, 0.00, 0.00, 0.00, 0.32, ~
$ you               <dbl> 1.93, 3.47, 1.36, 3.18, 3.18, 0.00, 3.85, ~
$ credit            <dbl> 0.00, 0.00, 0.32, 0.00, 0.00, 0.00, 0.00, ~
$ your              <dbl> 0.96, 1.59, 0.51, 0.31, 0.31, 0.00, 0.64, ~
$ font              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ num000            <dbl> 0.00, 0.43, 1.16, 0.00, 0.00, 0.00, 0.00, ~
$ money             <dbl> 0.00, 0.43, 0.06, 0.00, 0.00, 0.00, 0.00, ~
$ hp                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ hpl               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ george            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ num650            <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, ~
$ lab               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ labs              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ telnet            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ num857            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ data              <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, ~
$ num415            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ num85             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ technology        <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, ~
$ num1999           <dbl> 0.00, 0.07, 0.00, 0.00, 0.00, 0.00, 0.00, ~
$ parts             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ pm                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ direct            <dbl> 0.00, 0.00, 0.06, 0.00, 0.00, 0.00, 0.00, ~
$ cs                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ meeting           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ original          <dbl> 0.00, 0.00, 0.12, 0.00, 0.00, 0.00, 0.00, ~
$ project           <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, ~
$ re                <dbl> 0.00, 0.00, 0.06, 0.00, 0.00, 0.00, 0.00, ~
$ edu               <dbl> 0.00, 0.00, 0.06, 0.00, 0.00, 0.00, 0.00, ~
$ table             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ conference        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ charSemicolon     <dbl> 0.000, 0.000, 0.010, 0.000, 0.000, 0.000, ~
$ charRoundbracket  <dbl> 0.000, 0.132, 0.143, 0.137, 0.135, 0.223, ~
$ charSquarebracket <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, ~
$ charExclamation   <dbl> 0.778, 0.372, 0.276, 0.137, 0.135, 0.000, ~
$ charDollar        <dbl> 0.000, 0.180, 0.184, 0.000, 0.000, 0.000, ~
$ charHash          <dbl> 0.000, 0.048, 0.010, 0.000, 0.000, 0.000, ~
$ capitalAve        <dbl> 3.756, 5.114, 9.821, 3.537, 3.537, 3.000, ~
$ capitalLong       <dbl> 61, 101, 485, 40, 40, 15, 4, 11, 445, 43, ~
$ capitalTotal      <dbl> 278, 1028, 2259, 191, 191, 54, 112, 49, 12~
$ type              <fct> spam, spam, spam, spam, spam, spam, spam, ~

2. 데이터 분할

# Fix the RNG state so the partition below is reproducible.
set.seed(1235)

# Row indices for the training set: 75% of the rows, sampled on the `type`
# outcome. list = FALSE returns the indices as a matrix rather than a list.
DATA <- createDataPartition(y = spam[["type"]], p = 0.75, list = FALSE)

TrD <- spam[DATA, ]    # training rows
TeD <- spam[-DATA, ]   # remaining rows form the test set

# Class proportions (nonspam / spam) in the training set.
prop.table(table(TrD[["type"]]))

  nonspam      spam 
0.6059113 0.3940887 
# Class proportions (nonspam / spam) in the test set.
prop.table(table(TeD[["type"]]))

 nonspam     spam 
0.606087 0.393913 

3. 분할 기법 비교

# Leave-one-out cross-validation: each resample holds out a single row.
ctrl1 <- trainControl(method = "LOOCV")

# Fit the glm classifier on the training data under ctrl1 and time the fit.
system.time({
  modFit_loocv <- train(
    type ~ .,
    data      = TrD,
    method    = "glm",
    metric    = "Accuracy",
    trControl = ctrl1
  )
})
   user  system elapsed 
1243.83    5.08 1261.38 
# Show the resampling summary (Accuracy, Kappa) of the LOOCV fit.
print(modFit_loocv)
Generalized Linear Model 

3451 samples
  57 predictor
   2 classes: 'nonspam', 'spam' 

No pre-processing
Resampling: Leave-One-Out Cross-Validation 
Summary of sample sizes: 3450, 3450, 3450, 3450, 3450, 3450, ... 
Resampling results:

  Accuracy   Kappa    
  0.9223414  0.8365883
# 10-fold cross-validation.
ctrl2 <- trainControl(method = "cv", number = 10)

# Fit the glm classifier on the training data under ctrl2 and time the fit.
system.time({
  modFit_cv <- train(
    type ~ .,
    data      = TrD,
    method    = "glm",
    metric    = "Accuracy",
    trControl = ctrl2
  )
})
   user  system elapsed 
   3.64    0.01    3.67 
# Show the resampling summary (Accuracy, Kappa) of the 10-fold CV fit.
print(modFit_cv)
Generalized Linear Model 

3451 samples
  57 predictor
   2 classes: 'nonspam', 'spam' 

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 3106, 3106, 3106, 3105, 3106, 3106, ... 
Resampling results:

  Accuracy   Kappa    
  0.9237874  0.8396288
# 10-fold cross-validation repeated 5 times.
ctrl3 <- trainControl(method = "repeatedcv", number = 10, repeats = 5)

# Fit the glm classifier on the training data under ctrl3 and time the fit.
# This call must sit on its own line: when it trails the comment above it is
# treated as comment text, never runs, and modFit_repeatedcv stays undefined.
system.time(
  modFit_repeatedcv <- train(type ~ ., data = TrD, method = "glm",
                             metric = "Accuracy", trControl = ctrl3)
)
   user  system elapsed 
  16.33    0.31   16.84 
# Show the resampling summary (Accuracy, Kappa) of the repeated 10-fold CV fit.
print(modFit_repeatedcv)
Generalized Linear Model 

3451 samples
  57 predictor
   2 classes: 'nonspam', 'spam' 

No pre-processing
Resampling: Cross-Validated (10 fold, repeated 5 times) 
Summary of sample sizes: 3106, 3106, 3106, 3105, 3106, 3106, ... 
Resampling results:

  Accuracy   Kappa    
  0.9234401  0.8390102
# Bootstrap resampling with 10 repetitions.
ctrl4 <- trainControl(method = "boot", number = 10)

# Fit the glm classifier on the training data under ctrl4 and time the fit.
system.time({
  modFit_boot <- train(
    type ~ .,
    data      = TrD,
    method    = "glm",
    metric    = "Accuracy",
    trControl = ctrl4
  )
})
   user  system elapsed 
   4.80    0.04    4.97 
# Show the resampling summary (Accuracy, Kappa) of the bootstrap fit.
print(modFit_boot)
Generalized Linear Model 

3451 samples
  57 predictor
   2 classes: 'nonspam', 'spam' 

No pre-processing
Resampling: Bootstrapped (10 reps) 
Summary of sample sizes: 3451, 3451, 3451, 3451, 3451, 3451, ... 
Resampling results:

  Accuracy   Kappa    
  0.9189991  0.8303191

Reuse

Text and figures are licensed under Creative Commons Attribution CC BY 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".