Model Evaluation Methods for a Categorical Target
# Predicted probability of belonging to class 1, one value per observation.
pp <- c(
  0.9959, 0.9875, 0.9844, 0.9804, 0.9481, 0.8893,
  0.8476, 0.7628, 0.7070, 0.6807, 0.6563, 0.6224,
  0.5055, 0.4713, 0.3371, 0.2180, 0.1992, 0.1495,
  0.0480, 0.0383, 0.0248, 0.0218, 0.0161, 0.0036
)
# Actual class of each observation, in the same order as pp.
ac <- c(
  1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
  1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0
)
# Cutoff value (classification threshold).
cv <- 0.5
# Predict "1" when the predicted probability is >= cv, otherwise "0".
pred <- ifelse(pp >= cv, 1, 0)
# Misclassification table: actual class in rows, predicted class in columns.
table(ac, pred)
pred
ac 0 1
0 10 2
1 1 11
# Build the misclassification (confusion) table for a given cutoff value.
#
# Args:
#   cv: Cutoff value. An observation is predicted as class 1 when its
#       predicted probability is >= cv, and as class 0 otherwise.
#
# Returns:
#   A table with the actual class (ac) in rows and the predicted class
#   (pred) in columns.
CM <- function(cv) {
  # Predicted probability of belonging to class 1.
  pp <- c(0.9959, 0.9875, 0.9844, 0.9804, 0.9481, 0.8893,
          0.8476, 0.7628, 0.7070, 0.6807, 0.6563, 0.6224,
          0.5055, 0.4713, 0.3371, 0.2180, 0.1992, 0.1495,
          0.0480, 0.0383, 0.0248, 0.0218, 0.0161, 0.0036)
  # Actual class of each observation.
  ac <- c(1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
          1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0)
  # Fixed levels keep both predicted-class columns in the table even when
  # every observation falls on the same side of the cutoff.
  pred <- factor(as.integer(pp >= cv), levels = c(0, 1))
  table(ac, pred)
}
CM(0.75)
pred
ac 0 1
0 11 1
1 5 7
pacman::p_load("e1071", "caret")                         # For confusionMatrix
# Predicted probability of belonging to class 1.
pp <- c(0.9959, 0.9875, 0.9844, 0.9804, 0.9481, 0.8893,
        0.8476, 0.7628, 0.7070, 0.6807, 0.6563, 0.6224,
        0.5055, 0.4713, 0.3371, 0.2180, 0.1992, 0.1495,
        0.0480, 0.0383, 0.0248, 0.0218, 0.0161, 0.0036)
# Actual class of each observation.
ac <- c(1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
        1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0)
pred <- rep(0, length(pp))                               # Start with every prediction set to 0
cv <- 0.5                                                # Cutoff value (classification threshold)
pred[pp >= cv] <- 1                                      # Predict "1" when probability >= cv, otherwise "0"
# confusionMatrix() is given factors for both the prediction and the
# reference, so both vectors are converted to categorical.
pred <- as.factor(pred)
ac <- as.factor(ac)
# confusionMatrix(predicted class, actual class, positive = class of interest)
confusionMatrix(pred, ac, positive = "1")
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 10 1
1 2 11
Accuracy : 0.875
95% CI : (0.6764, 0.9734)
No Information Rate : 0.5
P-Value [Acc > NIR] : 0.0001386
Kappa : 0.75
Mcnemar's Test P-Value : 1.0000000
Sensitivity : 0.9167
Specificity : 0.8333
Pos Pred Value : 0.8462
Neg Pred Value : 0.9091
Prevalence : 0.5000
Detection Rate : 0.4583
Detection Prevalence : 0.5417
Balanced Accuracy : 0.8750
'Positive' Class : 1
# Compute the caret confusion matrix and its statistics for a cutoff value.
#
# Args:
#   cv: Cutoff value. An observation is predicted as class 1 when its
#       predicted probability is >= cv, and as class 0 otherwise.
#
# Returns:
#   The object returned by confusionMatrix(), with "1" as the positive class.
CM <- function(cv) {
  # Predicted probability of belonging to class 1.
  pp <- c(0.9959, 0.9875, 0.9844, 0.9804, 0.9481, 0.8893,
          0.8476, 0.7628, 0.7070, 0.6807, 0.6563, 0.6224,
          0.5055, 0.4713, 0.3371, 0.2180, 0.1992, 0.1495,
          0.0480, 0.0383, 0.0248, 0.0218, 0.0161, 0.0036)
  # Actual class of each observation.
  ac <- c(1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
          1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0)
  pred <- rep(0, length(pp))
  pred[pp >= cv] <- 1                                    # Predict "1" when probability >= cv, otherwise "0"
  # Both inputs are converted to factors with the same explicit levels, so
  # the prediction keeps both classes even at extreme cutoffs.
  pred <- factor(pred, levels = c(0, 1))
  ac <- factor(ac, levels = c(0, 1))
  # confusionMatrix(predicted class, actual class, positive = class of interest)
  confusionMatrix(pred, ac, positive = "1")
}
CM(0.5)
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 10 1
1 2 11
Accuracy : 0.875
95% CI : (0.6764, 0.9734)
No Information Rate : 0.5
P-Value [Acc > NIR] : 0.0001386
Kappa : 0.75
Mcnemar's Test P-Value : 1.0000000
Sensitivity : 0.9167
Specificity : 0.8333
Pos Pred Value : 0.8462
Neg Pred Value : 0.9091
Prevalence : 0.5000
Detection Rate : 0.4583
Detection Prevalence : 0.5417
Balanced Accuracy : 0.8750
'Positive' Class : 1
pacman::p_load("ROCR")
# Predicted probability of belonging to class 1.
pp <- c(
  0.9959, 0.9875, 0.9844, 0.9804, 0.9481, 0.8893,
  0.8476, 0.7628, 0.7070, 0.6807, 0.6563, 0.6224,
  0.5055, 0.4713, 0.3371, 0.2180, 0.1992, 0.1495,
  0.0480, 0.0383, 0.0248, 0.0218, 0.0161, 0.0036
)
# Actual class of each observation.
ac <- c(
  1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
  1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0
)
# prediction(predicted probabilities, actual classes)
pred <- prediction(pp, ac)
# tpr: sensitivity; fpr: 1 - specificity.
# NOTE(review): colorize is usually a plot() option; confirm that
# performance() is meant to receive it here.
perf <- performance(pred, "tpr", "fpr", colorize = TRUE)
plot(perf, col = "blue", lwd = 3)
# Dashed line with intercept 0 and slope 1 as a reference.
abline(a = 0, b = 1, lty = 2)
# Area under the ROC curve.
perf.auc <- performance(pred, "auc")
attr(perf.auc, "y.values")
[[1]]
[1] 0.9375
detach(package:ROCR)
pacman::p_load("Epi", "devtools")                        # For ROC
# install_version("etm", version="1.1", repos = "http://cran.us.r-project.org")
# Predicted probability of belonging to class 1.
pp <- c(
  0.9959, 0.9875, 0.9844, 0.9804, 0.9481, 0.8893,
  0.8476, 0.7628, 0.7070, 0.6807, 0.6563, 0.6224,
  0.5055, 0.4713, 0.3371, 0.2180, 0.1992, 0.1495,
  0.0480, 0.0383, 0.0248, 0.0218, 0.0161, 0.0036
)
# Actual class of each observation.
ac <- c(
  1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
  1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0
)
# ROC(predicted probabilities, actual classes)
ROC(test = pp, stat = ac, plot = "ROC")
pacman::p_load("moonBook")                               # For data "radial"
data(radial)
# ROC(model formula): separate male from female using height.
ROC(form = male ~ height, data = radial, plot = "ROC")
detach(package:Epi)
pacman::p_load("pROC")                                   # For roc and roc.test
# Predicted probability of belonging to class 1.
pp <- c(
  0.9959, 0.9875, 0.9844, 0.9804, 0.9481, 0.8893,
  0.8476, 0.7628, 0.7070, 0.6807, 0.6563, 0.6224,
  0.5055, 0.4713, 0.3371, 0.2180, 0.1992, 0.1495,
  0.0480, 0.0383, 0.0248, 0.0218, 0.0161, 0.0036
)
# Actual class of each observation.
ac <- c(
  1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
  1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0
)
# roc(actual classes, predicted probabilities)
perf.roc <- roc(
  response = ac, predictor = pp,
  ci = TRUE, percent = FALSE, plot = TRUE, col = "blue"
)
auc(perf.roc)                                            # AUC
Area under the curve: 0.9375
pacman::p_load("pROC", "moonBook")                       # For roc and roc.test
# ROC curve separating male from female using height.
b1 <- roc(male ~ height, data = radial,
          ci = TRUE, percent = FALSE, plot = TRUE)
# ROC curve separating male from female using weight, added in red to the
# existing plot.
b2 <- roc(male ~ weight, data = radial,
          ci = TRUE, percent = FALSE, plot = TRUE, add = TRUE, col = "red")
# Test whether the AUCs of the two ROC curves are equal.
roc.test(b1, b2, plot = TRUE)
DeLong's test for two correlated ROC curves
data: b1 and b2
Z = 3.9231, p-value = 8.743e-05
alternative hypothesis: true difference in AUC is not equal to 0
sample estimates:
AUC of roc1 AUC of roc2
0.9510468 0.8075739
pacman::p_load("gains")                                  # For gains
# Predicted probability of belonging to class 1.
pp <- c(
  0.9959, 0.9875, 0.9844, 0.9804, 0.9481, 0.8893,
  0.8476, 0.7628, 0.7070, 0.6807, 0.6563, 0.6224,
  0.5055, 0.4713, 0.3371, 0.2180, 0.1992, 0.1495,
  0.0480, 0.0383, 0.0248, 0.0218, 0.0161, 0.0036
)
# Actual class of each observation.
ac <- c(
  1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
  1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0
)
# One group per observation.
gain <- gains(actual = ac, predicted = pp, groups = length(ac))
# Cumulative share of the total scaled by sum(ac), against the cumulative
# number of observations; both series start at the origin.
plot(
  x = c(0, gain$cume.obs),
  y = c(0, gain$cume.pct.of.total * sum(ac)),
  xlab = "데이터 개수", ylab = "누적", type = "l"
)
# Gray dashed reference line from (0, 0) to (length(ac), sum(ac)).
lines(x = c(0, length(ac)), y = c(0, sum(ac)), col = "gray", lty = 2)
pacman::p_load("gains")                                  # For gains
# Predicted probability of belonging to class 1.
pp <- c(
  0.9959, 0.9875, 0.9844, 0.9804, 0.9481, 0.8893,
  0.8476, 0.7628, 0.7070, 0.6807, 0.6563, 0.6224,
  0.5055, 0.4713, 0.3371, 0.2180, 0.1992, 0.1495,
  0.0480, 0.0383, 0.0248, 0.0218, 0.0161, 0.0036
)
# Actual class of each observation.
ac <- c(
  1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
  1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0
)
gain <- gains(actual = ac, predicted = pp)
# Bar height is each group's mean response divided by the overall mean of
# ac; bars are labelled with the group's depth.
barplot(
  gain$mean.resp / mean(ac),
  names.arg = gain$depth,
  xlab = "Percentile",
  ylab = "Mean Response",
  main = "Decile-wise lift chart"
)
pacman::p_load("ROCR")
# Predicted probability of belonging to class 1.
pp <- c(
  0.9959, 0.9875, 0.9844, 0.9804, 0.9481, 0.8893,
  0.8476, 0.7628, 0.7070, 0.6807, 0.6563, 0.6224,
  0.5055, 0.4713, 0.3371, 0.2180, 0.1992, 0.1495,
  0.0480, 0.0383, 0.0248, 0.0218, 0.0161, 0.0036
)
# Actual class of each observation.
ac <- c(
  1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
  1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0
)
# prediction(predicted probabilities, actual classes)
pred <- prediction(pp, ac)
# lift: lift value; rpp: rate of positive predictions.
perf <- performance(pred, "lift", "rpp")
# Lift chart.
plot(perf, main = "lift curve", colorize = TRUE, lwd = 2)
detach(package:ROCR)
pacman::p_load("lift")
# Predicted probability of belonging to class 1.
pp <- c(
  0.9959, 0.9875, 0.9844, 0.9804, 0.9481, 0.8893,
  0.8476, 0.7628, 0.7070, 0.6807, 0.6563, 0.6224,
  0.5055, 0.4713, 0.3371, 0.2180, 0.1992, 0.1495,
  0.0480, 0.0383, 0.0248, 0.0218, 0.0161, 0.0036
)
# Actual class of each observation.
ac <- c(
  1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
  1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0
)
# plotLift(predicted probabilities, actual classes, n.buckets = number of cases)
plotLift(predicted = pp, labels = ac, cumulative = TRUE, n.buckets = 24)
# Print the lift of the top 10%.
TopDecileLift(predicted = pp, labels = ac)
[1] 2
detach(package:lift)
pacman::p_load("caret")
# confusionMatrix of the oversampled result obtained with the proportional formula.
# The 2 x 2 matrix is filled column by column: (390, 110) then (80, 420).
OS.table <- as.table(matrix(c(390, 110, 80, 420), 2, 2))
confusionMatrix(OS.table)
Confusion Matrix and Statistics
A B
A 390 80
B 110 420
Accuracy : 0.81
95% CI : (0.7843, 0.8339)
No Information Rate : 0.5
P-Value [Acc > NIR] : < 2e-16
Kappa : 0.62
Mcnemar's Test P-Value : 0.03539
Sensitivity : 0.7800
Specificity : 0.8400
Pos Pred Value : 0.8298
Neg Pred Value : 0.7925
Prevalence : 0.5000
Detection Rate : 0.3900
Detection Prevalence : 0.4700
Balanced Accuracy : 0.8100
'Positive' Class : A
# confusionMatrix of the result obtained with the oversampling weights.
# The 2 x 2 matrix is filled column by column: (764.4, 215.6) then (3.2, 16.8).
Ori.table <- as.table(matrix(c(764.4, 215.6, 3.2, 16.8), 2, 2))
confusionMatrix(Ori.table)
Confusion Matrix and Statistics
A B
A 764.4 3.2
B 215.6 16.8
Accuracy : 0.7812
95% CI : (NA, NA)
No Information Rate : NA
P-Value [Acc > NIR] : NA
Kappa : 0.1
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.78000
Specificity : 0.84000
Pos Pred Value : 0.99583
Neg Pred Value : 0.07229
Prevalence : 0.98000
Detection Rate : 0.76440
Detection Prevalence : 0.76760
Balanced Accuracy : 0.81000
'Positive' Class : A
예제로 사용될 데이터는 R에 내장되어 있는 “iris” 데이터이다.
pacman::p_load("rpart", "randomForest", "caret", "kernlab")
set.seed(12345)                                          # Fix the seed so results are reproducible
# Row indices for the training data: 70% partition based on Species.
DATA <- createDataPartition(y = iris$Species, p = 0.7, list = FALSE)
TrD <- iris[DATA, ]                                      # Training Data
TeD <- iris[-DATA, ]                                     # Test Data
set.seed(12345)                                          # Fix the seed so results are reproducible
control <- trainControl(method = "cv", number = 10)      # 10-fold cross-validation
# Decision tree
tree <- train(Species ~ ., data = TrD, method = "rpart",
              metric = "Accuracy", trControl = control)
# Random forest
rf <- train(Species ~ ., data = TrD, method = "rf",
            metric = "Accuracy", trControl = control)
# Support vector machine
svm <- train(Species ~ ., data = TrD, method = "svmRadial",
             metric = "Accuracy", trControl = control)
# K-nearest neighbours
knn <- train(Species ~ ., data = TrD, method = "knn",
             metric = "Accuracy", trControl = control)
# Collect the resampling results of the four models; the list names are the
# model labels used in the summaries.
resamp <- resamples(list("의사결정나무" = tree, "랜덤포레스트" = rf,
                         SVM = svm, kNN = knn))
summary(resamp)
Call:
summary.resamples(object = resamp)
Models: 의사결정나무, 랜덤포레스트, SVM, kNN
Number of resamples: 10
Accuracy
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
의사결정나무 0.8181818 1.0000 1 0.9707071 1 1 0
랜덤포레스트 0.7777778 0.9250 1 0.9577778 1 1 0
SVM 0.9000000 0.9375 1 0.9716667 1 1 0
kNN 0.9000000 1.0000 1 0.9809091 1 1 0
Kappa
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
의사결정나무 0.7250000 1.0000000 1 0.9558333 1 1 0
랜덤포레스트 0.6666667 0.8880597 1 0.9365898 1 1 0
SVM 0.8461538 0.9062500 1 0.9571900 1 1 0
kNN 0.8507463 1.0000000 1 0.9714944 1 1 0
sort(resamp, decreasing = TRUE) # Sort the resampling results in descending order
[1] "kNN" "SVM" "의사결정나무" "랜덤포레스트"
dotplot(resamp) # Dot plot of the resampling results
Text and figures are licensed under Creative Commons Attribution CC BY 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".