Cross Validation and Out-of-Bag
예제로 사용될 데이터는 R 패키지 `kernlab`에 내장되어 있는 “SPAM” 데이터이다. 미국 캘리포니아 Palo Alto 지역에서 우체국, 개개인으로부터 spam mail을 수집하였는데, 여기서 spam mail은 수신자의 의사와 상관없이 전송되는 불필요한 광고성 e-mail이다. e-mail에 포함된 단어, 특수문자, 대문자 빈도로 spam mail을 분류하였으며, 총 4601개의 관측치와 58개의 변수로 이루어져 있다.
pacman::p_load("caret", # For train
"kernlab", # For spam data
"dplyr", # For glimpse
"microbenchmark", # For microbenchmark
"ggplot2" # For ggplot
)
# Bring the `spam` data set into the workspace.
data("spam")
# Print a compact, one-row-per-column overview of the data.
dplyr::glimpse(spam)
Rows: 4,601
Columns: 58
$ make <dbl> 0.00, 0.21, 0.06, 0.00, 0.00, 0.00, 0.00, ~
$ address <dbl> 0.64, 0.28, 0.00, 0.00, 0.00, 0.00, 0.00, ~
$ all <dbl> 0.64, 0.50, 0.71, 0.00, 0.00, 0.00, 0.00, ~
$ num3d <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ our <dbl> 0.32, 0.14, 1.23, 0.63, 0.63, 1.85, 1.92, ~
$ over <dbl> 0.00, 0.28, 0.19, 0.00, 0.00, 0.00, 0.00, ~
$ remove <dbl> 0.00, 0.21, 0.19, 0.31, 0.31, 0.00, 0.00, ~
$ internet <dbl> 0.00, 0.07, 0.12, 0.63, 0.63, 1.85, 0.00, ~
$ order <dbl> 0.00, 0.00, 0.64, 0.31, 0.31, 0.00, 0.00, ~
$ mail <dbl> 0.00, 0.94, 0.25, 0.63, 0.63, 0.00, 0.64, ~
$ receive <dbl> 0.00, 0.21, 0.38, 0.31, 0.31, 0.00, 0.96, ~
$ will <dbl> 0.64, 0.79, 0.45, 0.31, 0.31, 0.00, 1.28, ~
$ people <dbl> 0.00, 0.65, 0.12, 0.31, 0.31, 0.00, 0.00, ~
$ report <dbl> 0.00, 0.21, 0.00, 0.00, 0.00, 0.00, 0.00, ~
$ addresses <dbl> 0.00, 0.14, 1.75, 0.00, 0.00, 0.00, 0.00, ~
$ free <dbl> 0.32, 0.14, 0.06, 0.31, 0.31, 0.00, 0.96, ~
$ business <dbl> 0.00, 0.07, 0.06, 0.00, 0.00, 0.00, 0.00, ~
$ email <dbl> 1.29, 0.28, 1.03, 0.00, 0.00, 0.00, 0.32, ~
$ you <dbl> 1.93, 3.47, 1.36, 3.18, 3.18, 0.00, 3.85, ~
$ credit <dbl> 0.00, 0.00, 0.32, 0.00, 0.00, 0.00, 0.00, ~
$ your <dbl> 0.96, 1.59, 0.51, 0.31, 0.31, 0.00, 0.64, ~
$ font <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ num000 <dbl> 0.00, 0.43, 1.16, 0.00, 0.00, 0.00, 0.00, ~
$ money <dbl> 0.00, 0.43, 0.06, 0.00, 0.00, 0.00, 0.00, ~
$ hp <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ hpl <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ george <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ num650 <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, ~
$ lab <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ labs <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ telnet <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ num857 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ data <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, ~
$ num415 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ num85 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ technology <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, ~
$ num1999 <dbl> 0.00, 0.07, 0.00, 0.00, 0.00, 0.00, 0.00, ~
$ parts <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ pm <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ direct <dbl> 0.00, 0.00, 0.06, 0.00, 0.00, 0.00, 0.00, ~
$ cs <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ meeting <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ original <dbl> 0.00, 0.00, 0.12, 0.00, 0.00, 0.00, 0.00, ~
$ project <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, ~
$ re <dbl> 0.00, 0.00, 0.06, 0.00, 0.00, 0.00, 0.00, ~
$ edu <dbl> 0.00, 0.00, 0.06, 0.00, 0.00, 0.00, 0.00, ~
$ table <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ conference <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ charSemicolon <dbl> 0.000, 0.000, 0.010, 0.000, 0.000, 0.000, ~
$ charRoundbracket <dbl> 0.000, 0.132, 0.143, 0.137, 0.135, 0.223, ~
$ charSquarebracket <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, ~
$ charExclamation <dbl> 0.778, 0.372, 0.276, 0.137, 0.135, 0.000, ~
$ charDollar <dbl> 0.000, 0.180, 0.184, 0.000, 0.000, 0.000, ~
$ charHash <dbl> 0.000, 0.048, 0.010, 0.000, 0.000, 0.000, ~
$ capitalAve <dbl> 3.756, 5.114, 9.821, 3.537, 3.537, 3.000, ~
$ capitalLong <dbl> 61, 101, 485, 40, 40, 15, 4, 11, 445, 43, ~
$ capitalTotal <dbl> 278, 1028, 2259, 191, 191, 54, 112, 49, 12~
$ type <fct> spam, spam, spam, spam, spam, spam, spam, ~
`type`이 Target이다.
set.seed(1235)
# Draw the row indices of the training set: 75% of the rows, sampled with
# respect to the class label `type`. `list = FALSE` returns the indices as a
# matrix rather than a list.
DATA <- createDataPartition(
  y = spam[["type"]],
  p = 0.75,
  list = FALSE
)

# Rows selected by the partition form the training data ...
TrD <- spam[DATA, ]
# ... and every remaining row forms the test data.
TeD <- spam[-DATA, ]

# Class proportions (nonspam / spam) in the training data.
prop.table(table(TrD[["type"]]))
nonspam spam
0.6059113 0.3940887
# Class proportions (nonspam / spam) in the test data, for comparison with
# the training split.
prop.table(table(TeD[["type"]]))
nonspam spam
0.606087 0.393913
# Resampling scheme: leave-one-out cross-validation.
ctrl1 <- trainControl(method = "LOOCV")

# Fit a GLM of `type` on all other columns of the training data, evaluated by
# accuracy under LOOCV. The fit is wrapped in system.time() so the run time is
# reported alongside it.
system.time(
  modFit_loocv <- train(
    type ~ .,
    data = TrD,
    method = "glm",
    metric = "Accuracy",
    trControl = ctrl1
  )
)
user system elapsed
1243.83 5.08 1261.38
# Show the fitted model summary with its LOOCV resampling results.
print(modFit_loocv)
Generalized Linear Model
3451 samples
57 predictor
2 classes: 'nonspam', 'spam'
No pre-processing
Resampling: Leave-One-Out Cross-Validation
Summary of sample sizes: 3450, 3450, 3450, 3450, 3450, 3450, ...
Resampling results:
Accuracy Kappa
0.9223414 0.8365883
# Resampling scheme: 10-fold cross-validation.
ctrl2 <- trainControl(method = "cv", number = 10)

# Same GLM specification as before, now evaluated with 10-fold CV; the call is
# timed with system.time().
system.time(
  modFit_cv <- train(
    type ~ .,
    data = TrD,
    method = "glm",
    metric = "Accuracy",
    trControl = ctrl2
  )
)
user system elapsed
3.64 0.01 3.67
# Show the fitted model summary with its 10-fold CV resampling results.
print(modFit_cv)
Generalized Linear Model
3451 samples
57 predictor
2 classes: 'nonspam', 'spam'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 3106, 3106, 3106, 3105, 3106, 3106, ...
Resampling results:
Accuracy Kappa
0.9237874 0.8396288
# Resampling scheme: 10-fold cross-validation repeated 5 times.
ctrl3 <- trainControl(method = "repeatedcv", number = 10, repeats = 5)

# Same GLM specification, evaluated with repeated 10-fold CV; the call is
# timed with system.time().
system.time(
  modFit_repeatedcv <- train(
    type ~ .,
    data = TrD,
    method = "glm",
    metric = "Accuracy",
    trControl = ctrl3
  )
)
user system elapsed
16.33 0.31 16.84
# Show the fitted model summary with its repeated-CV resampling results.
print(modFit_repeatedcv)
Generalized Linear Model
3451 samples
57 predictor
2 classes: 'nonspam', 'spam'
No pre-processing
Resampling: Cross-Validated (10 fold, repeated 5 times)
Summary of sample sizes: 3106, 3106, 3106, 3105, 3106, 3106, ...
Resampling results:
Accuracy Kappa
0.9234401 0.8390102
# Resampling scheme: bootstrap with 10 resamples.
ctrl4 <- trainControl(method = "boot", number = 10)

# Same GLM specification, evaluated on the bootstrap resamples; the call is
# timed with system.time().
system.time(
  modFit_boot <- train(
    type ~ .,
    data = TrD,
    method = "glm",
    metric = "Accuracy",
    trControl = ctrl4
  )
)
user system elapsed
4.80 0.04 4.97
# Show the fitted model summary with its bootstrap resampling results.
print(modFit_boot)
Generalized Linear Model
3451 samples
57 predictor
2 classes: 'nonspam', 'spam'
No pre-processing
Resampling: Bootstrapped (10 reps)
Summary of sample sizes: 3451, 3451, 3451, 3451, 3451, 3451, ...
Resampling results:
Accuracy Kappa
0.9189991 0.8303191
Text and figures are licensed under Creative Commons Attribution CC BY 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".