예제로 사용될 데이터는 R의 kernlab 패키지에 내장되어 있는 “SPAM” 데이터이다. 미국 캘리포니아 Palo Alto 지역에서 우체국과 개개인으로부터 spam mail을 수집하였는데, 여기서 spam mail은 수신자의 의사와 상관없이 전송되는 불필요한 광고성 e-mail이다. e-mail에 포함된 단어, 특수문자, 대문자의 빈도로 spam mail을 분류하였으며, 총 4601개의 관측치와 58개의 변수로 이루어져 있다.

1. 데이터 불러오기

pacman::p_load("caret",           # For train
               "kernlab",         # For spam data
               "dplyr",           # For glimpse
               "microbenchmark",  # For microbenchmark 
               "ggplot2"          # For ggplot
               )



# Load the spam data set into the workspace.
data("spam")

# Compact overview: one line per column with its type and leading values.
glimpse(spam)

Rows: 4,601
Columns: 58
$ make              <dbl> 0.00, 0.21, 0.06, 0.00, 0.00, 0.00, 0.00, ~
$ address           <dbl> 0.64, 0.28, 0.00, 0.00, 0.00, 0.00, 0.00, ~
$ all               <dbl> 0.64, 0.50, 0.71, 0.00, 0.00, 0.00, 0.00, ~
$ num3d             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ our               <dbl> 0.32, 0.14, 1.23, 0.63, 0.63, 1.85, 1.92, ~
$ over              <dbl> 0.00, 0.28, 0.19, 0.00, 0.00, 0.00, 0.00, ~
$ remove            <dbl> 0.00, 0.21, 0.19, 0.31, 0.31, 0.00, 0.00, ~
$ internet          <dbl> 0.00, 0.07, 0.12, 0.63, 0.63, 1.85, 0.00, ~
$ order             <dbl> 0.00, 0.00, 0.64, 0.31, 0.31, 0.00, 0.00, ~
$ mail              <dbl> 0.00, 0.94, 0.25, 0.63, 0.63, 0.00, 0.64, ~
$ receive           <dbl> 0.00, 0.21, 0.38, 0.31, 0.31, 0.00, 0.96, ~
$ will              <dbl> 0.64, 0.79, 0.45, 0.31, 0.31, 0.00, 1.28, ~
$ people            <dbl> 0.00, 0.65, 0.12, 0.31, 0.31, 0.00, 0.00, ~
$ report            <dbl> 0.00, 0.21, 0.00, 0.00, 0.00, 0.00, 0.00, ~
$ addresses         <dbl> 0.00, 0.14, 1.75, 0.00, 0.00, 0.00, 0.00, ~
$ free              <dbl> 0.32, 0.14, 0.06, 0.31, 0.31, 0.00, 0.96, ~
$ business          <dbl> 0.00, 0.07, 0.06, 0.00, 0.00, 0.00, 0.00, ~
$ email             <dbl> 1.29, 0.28, 1.03, 0.00, 0.00, 0.00, 0.32, ~
$ you               <dbl> 1.93, 3.47, 1.36, 3.18, 3.18, 0.00, 3.85, ~
$ credit            <dbl> 0.00, 0.00, 0.32, 0.00, 0.00, 0.00, 0.00, ~
$ your              <dbl> 0.96, 1.59, 0.51, 0.31, 0.31, 0.00, 0.64, ~
$ font              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ num000            <dbl> 0.00, 0.43, 1.16, 0.00, 0.00, 0.00, 0.00, ~
$ money             <dbl> 0.00, 0.43, 0.06, 0.00, 0.00, 0.00, 0.00, ~
$ hp                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ hpl               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ george            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ num650            <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, ~
$ lab               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ labs              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ telnet            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ num857            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ data              <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, ~
$ num415            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ num85             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ technology        <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, ~
$ num1999           <dbl> 0.00, 0.07, 0.00, 0.00, 0.00, 0.00, 0.00, ~
$ parts             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ pm                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ direct            <dbl> 0.00, 0.00, 0.06, 0.00, 0.00, 0.00, 0.00, ~
$ cs                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ meeting           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ original          <dbl> 0.00, 0.00, 0.12, 0.00, 0.00, 0.00, 0.00, ~
$ project           <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, ~
$ re                <dbl> 0.00, 0.00, 0.06, 0.00, 0.00, 0.00, 0.00, ~
$ edu               <dbl> 0.00, 0.00, 0.06, 0.00, 0.00, 0.00, 0.00, ~
$ table             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ conference        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ charSemicolon     <dbl> 0.000, 0.000, 0.010, 0.000, 0.000, 0.000, ~
$ charRoundbracket  <dbl> 0.000, 0.132, 0.143, 0.137, 0.135, 0.223, ~
$ charSquarebracket <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, ~
$ charExclamation   <dbl> 0.778, 0.372, 0.276, 0.137, 0.135, 0.000, ~
$ charDollar        <dbl> 0.000, 0.180, 0.184, 0.000, 0.000, 0.000, ~
$ charHash          <dbl> 0.000, 0.048, 0.010, 0.000, 0.000, 0.000, ~
$ capitalAve        <dbl> 3.756, 5.114, 9.821, 3.537, 3.537, 3.000, ~
$ capitalLong       <dbl> 61, 101, 485, 40, 40, 15, 4, 11, 445, 43, ~
$ capitalTotal      <dbl> 278, 1028, 2259, 191, 191, 54, 112, 49, 12~
$ type              <fct> spam, spam, spam, spam, spam, spam, spam, ~

1 ~ 48열 : 이메일에 포함된 단어의 비율
49 ~ 54열 : ‘;’, ‘(’, ‘[’, ‘!’, ‘$’, ‘#’ 의 비율
55 ~ 57열 : 연속된 대문자열 길이의 평균(capitalAve) / 최댓값(capitalLong) / 총합(capitalTotal)
58열 : 스팸 여부(spam, nonspam) 여기서 type 이 Target이다.

2. 데이터 분할

# Fix the RNG state so the random split below is reproducible.
set.seed(1235)

# Row indices for the training set: 75% of the rows, drawn with respect to
# the target `type`; `list = FALSE` returns them as a matrix, not a list.
DATA <- createDataPartition(y = spam$type, p = 0.75, list = FALSE)

# Selected rows form the training data; all remaining rows form the test data.
TrD <- spam[DATA, ]
TeD <- spam[-DATA, ]

# Class proportions (nonspam / spam) within the training data.
prop.table(table(TrD$type))


  nonspam      spam 
0.6059113 0.3940887

# Class proportions (nonspam / spam) within the test data.
prop.table(table(TeD[["type"]]))


 nonspam     spam 
0.606087 0.393913

3. 분할 기법 비교

# Resampling scheme 1: leave-one-out cross-validation (LOOCV).
ctrl1 <- trainControl(method = "LOOCV")

# Fit a GLM on the training data under LOOCV and time the whole fit.
system.time(
  modFit_loocv <- train(
    type ~ .,
    data      = TrD,
    method    = "glm",
    metric    = "Accuracy",
    trControl = ctrl1
  )
)

   user  system elapsed 
1243.83    5.08 1261.38

modFit_loocv                                                       # Print the resampling summary of the LOOCV fit

Generalized Linear Model 

3451 samples
  57 predictor
   2 classes: 'nonspam', 'spam' 

No pre-processing
Resampling: Leave-One-Out Cross-Validation 
Summary of sample sizes: 3450, 3450, 3450, 3450, 3450, 3450, ... 
Resampling results:

  Accuracy   Kappa    
  0.9223414  0.8365883

# Resampling scheme 2: 10-fold cross-validation.
ctrl2 <- trainControl(method = "cv", number = 10)

# Fit the same GLM under 10-fold CV and time the whole fit.
system.time(
  modFit_cv <- train(
    type ~ .,
    data      = TrD,
    method    = "glm",
    metric    = "Accuracy",
    trControl = ctrl2
  )
)

   user  system elapsed 
   3.64    0.01    3.67

modFit_cv                                                          # Print the resampling summary of the 10-fold CV fit

Generalized Linear Model 

3451 samples
  57 predictor
   2 classes: 'nonspam', 'spam' 

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 3106, 3106, 3106, 3105, 3106, 3106, ... 
Resampling results:

  Accuracy   Kappa    
  0.9237874  0.8396288

# Resampling scheme 3: 10-fold cross-validation repeated 5 times.
ctrl3 <- trainControl(method = "repeatedcv", number = 10, repeats = 5)

# Fit the same GLM under repeated 10-fold CV and time the whole fit.
system.time(
  modFit_repeatedcv <- train(
    type ~ .,
    data      = TrD,
    method    = "glm",
    metric    = "Accuracy",
    trControl = ctrl3
  )
)

   user  system elapsed 
  16.33    0.31   16.84

modFit_repeatedcv                                                  # Print the resampling summary of the repeated CV fit

Generalized Linear Model 

3451 samples
  57 predictor
   2 classes: 'nonspam', 'spam' 

No pre-processing
Resampling: Cross-Validated (10 fold, repeated 5 times) 
Summary of sample sizes: 3106, 3106, 3106, 3105, 3106, 3106, ... 
Resampling results:

  Accuracy   Kappa    
  0.9234401  0.8390102

# Resampling scheme 4: bootstrap with 10 resamples.
ctrl4 <- trainControl(method = "boot", number = 10)

# Fit the same GLM under the bootstrap and time the whole fit.
system.time(
  modFit_boot <- train(
    type ~ .,
    data      = TrD,
    method    = "glm",
    metric    = "Accuracy",
    trControl = ctrl4
  )
)

   user  system elapsed 
   4.80    0.04    4.97

modFit_boot                                                        # Print the resampling summary of the bootstrap fit

Generalized Linear Model 

3451 samples
  57 predictor
   2 classes: 'nonspam', 'spam' 

No pre-processing
Resampling: Bootstrapped (10 reps) 
Summary of sample sizes: 3451, 3451, 3451, 3451, 3451, 3451, ... 
Resampling results:

  Accuracy   Kappa    
  0.9189991  0.8303191

Cross Validation and Out-of-Bag

1. 데이터 불러오기

2. 데이터 분할

3. 분할 기법 비교

Reuse