Description for Decision Tree using Package rpart
Tree-based Algorithm
Practice data: records on 2,500 customers of Universal Bank (source: Data Mining for Business Intelligence, Shmueli et al. 2010), containing 13 variables in total. The Target in this dataset is Personal Loan.
pacman::p_load("data.table",
               "tidyverse",
               "dplyr", "magrittr",       # magrittr : For the %<>% pipe used below
               "ggplot2", "GGally",
               "caret",
               "rpart",                   # For Decision Tree
               "rattle", "rpart.plot",    # For fancyRpartPlot
               "visNetwork", "sparkline") # For visTree
UB <- fread("../Universal Bank_Main.csv") # Load the data
UB %>%
as_tibble
# A tibble: 2,500 × 14
ID Age Experience Income `ZIP Code` Family CCAvg Education
<int> <int> <int> <int> <int> <int> <dbl> <int>
1 1 25 1 49 91107 4 1.6 1
2 2 45 19 34 90089 3 1.5 1
3 3 39 15 11 94720 1 1 1
4 4 35 9 100 94112 1 2.7 2
5 5 35 8 45 91330 4 1 2
6 6 37 13 29 92121 4 0.4 2
7 7 53 27 72 91711 2 1.5 2
8 8 50 24 22 93943 1 0.3 3
9 9 35 10 81 90089 3 0.6 2
10 10 34 9 180 93023 1 8.9 3
# ℹ 2,490 more rows
# ℹ 6 more variables: Mortgage <int>, `Personal Loan` <int>,
# `Securities Account` <int>, `CD Account` <int>, Online <int>,
# CreditCard <int>
UB %<>%
  data.frame() %>% # Convert to a data frame
  select(-1)       # Remove the ID variable
# Convert to Factor
fac.col <- c("Family", "Education", "Securities.Account",
"CD.Account", "Online", "CreditCard",
# Target
"Personal.Loan")
UB <- UB %>%
  mutate_at(fac.col, as.factor) # Convert to factor
glimpse(UB) # Check the data structure
Rows: 2,500
Columns: 13
$ Age <int> 25, 45, 39, 35, 35, 37, 53, 50, 35, 34, 6…
$ Experience <int> 1, 19, 15, 9, 8, 13, 27, 24, 10, 9, 39, 5…
$ Income <int> 49, 34, 11, 100, 45, 29, 72, 22, 81, 180,…
$ ZIP.Code <int> 91107, 90089, 94720, 94112, 91330, 92121,…
$ Family <fct> 4, 3, 1, 1, 4, 4, 2, 1, 3, 1, 4, 3, 2, 4,…
$ CCAvg <dbl> 1.6, 1.5, 1.0, 2.7, 1.0, 0.4, 1.5, 0.3, 0…
$ Education <fct> 1, 1, 1, 2, 2, 2, 2, 3, 2, 3, 3, 2, 3, 2,…
$ Mortgage <int> 0, 0, 0, 0, 0, 155, 0, 0, 104, 0, 0, 0, 0…
$ Personal.Loan <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,…
$ Securities.Account <fct> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,…
$ CD.Account <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ Online <fct> 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1,…
$ CreditCard <fct> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,…
ggpairs(UB,
        columns = c("Age", "Experience", "Income", # Numeric predictors
                    "ZIP.Code", "CCAvg", "Mortgage"),
        aes(colour = Personal.Loan)) +             # Color by Target class
  theme_bw()
ggpairs(UB,
        columns = c("Age", "Experience", "Income",  # Numeric predictors
                    "ZIP.Code", "CCAvg", "Mortgage"),
        aes(colour = Personal.Loan, alpha = 0.8)) + # Color by Target class
  scale_colour_manual(values = c("#00798c", "#d1495b")) + # Custom colors
  scale_fill_manual(values = c("#00798c", "#d1495b")) +   # Custom colors
  theme_bw()
ggpairs(UB,
        columns = c("Age", "Income",                # Numeric predictors
                    "Family", "Education"),         # Categorical predictors
        aes(colour = Personal.Loan, alpha = 0.8)) + # Color by Target class
  scale_colour_manual(values = c("#E69F00", "#56B4E9")) + # Custom colors
  scale_fill_manual(values = c("#E69F00", "#56B4E9")) +   # Custom colors
  theme_bw()
# Partition (Training Dataset : Test Dataset = 7:3)
y <- UB$Personal.Loan # Target
set.seed(200)
ind <- createDataPartition(y, p = 0.7, list = T) # Split 7:3 using the index
UB.trd <- UB[ind$Resample1,] # Training Dataset
UB.ted <- UB[-ind$Resample1,] # Test Dataset
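A quick sanity check (a minimal sketch using the objects above): createDataPartition() samples within each level of y, so both datasets should preserve the Target's class ratio.
prop.table(table(UB.trd$Personal.Loan)) # Class proportions in the Training Dataset
prop.table(table(UB.ted$Personal.Loan)) # Class proportions in the Test Dataset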
Package "rpart"
는 수정된 CART를 알고리듬으로 사용하며, CP
(Complexity Parameter)를 이용하여 최적의 모형을 찾아낸다. CP
는 최적의 나무 크기를 찾기 위한 모수로써, 노드를 분할할 때 분할 전과 비교하여 오분류율이 CP
값 이상으로 향상되지 않으면 분할을 멈춘다. 최적의 모형을 얻기 위해 필요한 CP
는 Cross Validation (CV) 기법을 이용하여 얻을 수 있으며, 해당 Package에서는 기본값으로 10-Fold CV를 이용한다. 마지막으로, Package "rpart"
는 가독성 좋은 그래프로 결과를 표현할 수 있어 의사결정나무를 시각화하기에 좋은 Package이다.
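For reference, the defaults mentioned above can be inspected with rpart.control(); a minimal sketch showing the two values relevant here:
ctrl <- rpart.control() # Default control parameters of Package "rpart"
ctrl$xval               # 10   : 10-Fold CV by default
ctrl$cp                 # 0.01 : default Complexity Parameter threshold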
rpart(formula, data, method, ...)
formula : a formula describing the relationship between the Target and the predictors, typically written as Target ~ predictors.
data : the dataset (a data frame) containing the variables used in formula.
method : enter "class" if the Target is categorical, and "anova" otherwise.
set.seed(200) # For CV
rContol <- rpart.control(xval = 5) # xval : xval-Fold CV
UB.trd.rtree <- rpart(Personal.Loan ~ ., data = UB.trd,
                      method = "class",
                      control = rContol)
summary(UB.trd.rtree)
Call:
rpart(formula = Personal.Loan ~ ., data = UB.trd, method = "class",
control = rContol)
n= 1751
CP nsplit rel error xerror xstd
1 0.32500000 0 1.0000000 1.0000000 0.07060066
2 0.15000000 2 0.3500000 0.3666667 0.04427477
3 0.01944444 3 0.2000000 0.2222222 0.03473277
4 0.01111111 5 0.1611111 0.2055556 0.03343418
5 0.01000000 7 0.1388889 0.2055556 0.03343418
Variable importance
Income Education Family CCAvg CD.Account Mortgage
29 28 20 13 5 3
ZIP.Code Age Experience
1 1 1
Node number 1: 1751 observations, complexity param=0.325
predicted class=0 expected loss=0.1027984 P(node) =1
class counts: 1571 180
probabilities: 0.897 0.103
left son=2 (1366 obs) right son=3 (385 obs)
Primary splits:
Income < 106.5 to the left, improve=94.968130, (0 missing)
CCAvg < 2.95 to the left, improve=69.046700, (0 missing)
CD.Account splits as LR, improve=41.226300, (0 missing)
Mortgage < 293.5 to the left, improve=14.467750, (0 missing)
Education splits as LRR, improve= 7.681315, (0 missing)
Surrogate splits:
CCAvg < 3.25 to the left, agree=0.868, adj=0.400, (0 split)
Mortgage < 339.5 to the left, agree=0.802, adj=0.101, (0 split)
CD.Account splits as LR, agree=0.786, adj=0.026, (0 split)
Node number 2: 1366 observations, complexity param=0.01111111
predicted class=0 expected loss=0.01537335 P(node) =0.7801256
class counts: 1345 21
probabilities: 0.985 0.015
left son=4 (1277 obs) right son=5 (89 obs)
Primary splits:
CCAvg < 2.95 to the left, improve=9.2644320, (0 missing)
Income < 98.5 to the left, improve=3.5382100, (0 missing)
CD.Account splits as LR, improve=1.1082890, (0 missing)
Mortgage < 220.5 to the left, improve=0.6886484, (0 missing)
Experience < 38.5 to the left, improve=0.2000360, (0 missing)
Node number 3: 385 observations, complexity param=0.325
predicted class=0 expected loss=0.412987 P(node) =0.2198744
class counts: 226 159
probabilities: 0.587 0.413
left son=6 (242 obs) right son=7 (143 obs)
Primary splits:
Education splits as LRR, improve=111.984200, (0 missing)
Family splits as LLRR, improve= 73.753990, (0 missing)
CD.Account splits as LR, improve= 27.998710, (0 missing)
Income < 156.5 to the left, improve= 11.193100, (0 missing)
CCAvg < 6.635 to the right, improve= 4.353303, (0 missing)
Surrogate splits:
Family splits as LLRR, agree=0.743, adj=0.308, (0 split)
CD.Account splits as LR, agree=0.683, adj=0.147, (0 split)
Income < 173.5 to the left, agree=0.642, adj=0.035, (0 split)
CCAvg < 8.85 to the left, agree=0.634, adj=0.014, (0 split)
ZIP.Code < 90021.5 to the right, agree=0.631, adj=0.007, (0 split)
Node number 4: 1277 observations
predicted class=0 expected loss=0 P(node) =0.7292975
class counts: 1277 0
probabilities: 1.000 0.000
Node number 5: 89 observations, complexity param=0.01111111
predicted class=0 expected loss=0.2359551 P(node) =0.0508281
class counts: 68 21
probabilities: 0.764 0.236
left son=10 (73 obs) right son=11 (16 obs)
Primary splits:
Income < 98.5 to the left, improve=5.904956, (0 missing)
CD.Account splits as LR, improve=4.645443, (0 missing)
CCAvg < 3.15 to the right, improve=3.739411, (0 missing)
Experience < 31.5 to the left, improve=2.313744, (0 missing)
Mortgage < 86.5 to the left, improve=2.146298, (0 missing)
Surrogate splits:
Age < 62.5 to the left, agree=0.843, adj=0.125, (0 split)
Experience < 38.5 to the left, agree=0.843, adj=0.125, (0 split)
ZIP.Code < 90061.5 to the right, agree=0.843, adj=0.125, (0 split)
Node number 6: 242 observations, complexity param=0.15
predicted class=0 expected loss=0.1198347 P(node) =0.1382067
class counts: 213 29
probabilities: 0.880 0.120
left son=12 (211 obs) right son=13 (31 obs)
Primary splits:
Family splits as LLRR, improve=47.3076500, (0 missing)
CD.Account splits as LR, improve= 8.0685060, (0 missing)
Mortgage < 279.5 to the left, improve= 1.5428930, (0 missing)
CCAvg < 6.635 to the right, improve= 1.2562530, (0 missing)
ZIP.Code < 95057 to the left, improve= 0.9027978, (0 missing)
Surrogate splits:
CD.Account splits as LR, agree=0.880, adj=0.065, (0 split)
Mortgage < 566 to the left, agree=0.876, adj=0.032, (0 split)
Node number 7: 143 observations, complexity param=0.01944444
predicted class=1 expected loss=0.09090909 P(node) =0.08166762
class counts: 13 130
probabilities: 0.091 0.909
left son=14 (25 obs) right son=15 (118 obs)
Primary splits:
Income < 116.5 to the left, improve=11.1563600, (0 missing)
CCAvg < 2.4 to the left, improve= 2.6908830, (0 missing)
Experience < 2.5 to the left, improve= 2.4008740, (0 missing)
Age < 29.5 to the left, improve= 2.2161600, (0 missing)
CD.Account splits as LR, improve= 0.9500891, (0 missing)
Node number 10: 73 observations
predicted class=0 expected loss=0.1506849 P(node) =0.04169046
class counts: 62 11
probabilities: 0.849 0.151
Node number 11: 16 observations
predicted class=1 expected loss=0.375 P(node) =0.009137636
class counts: 6 10
probabilities: 0.375 0.625
Node number 12: 211 observations
predicted class=0 expected loss=0 P(node) =0.1205026
class counts: 211 0
probabilities: 1.000 0.000
Node number 13: 31 observations
predicted class=1 expected loss=0.06451613 P(node) =0.01770417
class counts: 2 29
probabilities: 0.065 0.935
Node number 14: 25 observations, complexity param=0.01944444
predicted class=0 expected loss=0.48 P(node) =0.01427756
class counts: 13 12
probabilities: 0.520 0.480
left son=28 (13 obs) right son=29 (12 obs)
Primary splits:
CCAvg < 2.4 to the left, improve=3.3646150, (0 missing)
ZIP.Code < 93060 to the right, improve=2.4938890, (0 missing)
Age < 29.5 to the left, improve=1.2447060, (0 missing)
Experience < 7 to the left, improve=1.0800000, (0 missing)
Education splits as -RL, improve=0.9605195, (0 missing)
Surrogate splits:
ZIP.Code < 94014.5 to the right, agree=0.88, adj=0.750, (0 split)
Age < 38.5 to the left, agree=0.72, adj=0.417, (0 split)
Experience < 18.5 to the left, agree=0.72, adj=0.417, (0 split)
Education splits as -RL, agree=0.72, adj=0.417, (0 split)
Income < 113.5 to the left, agree=0.68, adj=0.333, (0 split)
Node number 15: 118 observations
predicted class=1 expected loss=0 P(node) =0.06739006
class counts: 0 118
probabilities: 0.000 1.000
Node number 28: 13 observations
predicted class=0 expected loss=0.2307692 P(node) =0.007424329
class counts: 10 3
probabilities: 0.769 0.231
Node number 29: 12 observations
predicted class=1 expected loss=0.25 P(node) =0.006853227
class counts: 3 9
probabilities: 0.250 0.750
Result!
In the first table:
CP : the Complexity Parameter, computed from the misclassification rate on the Training Dataset and a penalty on the tree size as
\[
\begin{align*}
cp = \frac{p(\text{incorrect}_{l}) - p(\text{incorrect}_{l+1})}{n(\text{splits}_{l+1}) - n(\text{splits}_{l})}.
\end{align*}
\]
For example, the first CP value is \[ cp = \frac{1.00-0.35}{2-0} = 0.325. \]
nsplit : the number of splits.
rel error : the proportion of misclassified cases at the current depth (the misclassification rate).
xerror : the CV error.
xstd : the standard error of xerror.
The second table, Variable importance, reports the importance of each variable; higher values indicate more important variables.
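Both tables can be reproduced from the fitted object; a minimal sketch using UB.trd.rtree from above:
# Recompute the first CP value from the CP table: (1.00 - 0.35)/(2 - 0) = 0.325
tab <- UB.trd.rtree$cptable
(tab[1, "rel error"] - tab[2, "rel error"]) / (tab[2, "nsplit"] - tab[1, "nsplit"])
# Variable importance, scaled to sum to 100 as printed by summary()
round(100 * UB.trd.rtree$variable.importance / sum(UB.trd.rtree$variable.importance))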
fancyRpartPlot(UB.trd.rtree) # Plot
visTree(UB.trd.rtree) # Network-based Plot
Pruning is the process of simplifying a model by cutting back branches that have been grown. In decision-tree learning, the node-splitting process on the Training Dataset is repeated to classify as accurately as possible. Excessive splitting, however, produces many branches and a complex model, which can lead to overfitting. Overfitting here means the model classifies the Training Dataset accurately, but its predictive performance on new data, the Test Dataset, drops markedly. Decision trees therefore need a pruning step to simplify the model and prevent overfitting.
Package "rpart"
에서는 CP
의 최적값을 이용하여 가지치기를 수행할 수 있다. 함수 rpart()
를 이용하여 얻은 위의 결과를 기반으로 xerror
가 최소가 되는 CP
를 가지는 트리 모형을 생성한다.
table <- UB.trd.rtree$cptable                           # CP Table
low.error <- which.min(table[ , "xerror"])              # Index of min("xerror")
cp.best <- table[low.error, "CP"]                       # CP value at min("xerror") (the optimal CP)
# Perform pruning
UB.trd.prune.rtree <- prune(UB.trd.rtree, cp = cp.best) # prune(tree model, optimal CP)
UB.trd.prune.rtree$cptable                              # CP Table of the best model
CP nsplit rel error xerror xstd
1 0.32500000 0 1.0000000 1.0000000 0.07060066
2 0.15000000 2 0.3500000 0.3666667 0.04427477
3 0.01944444 3 0.2000000 0.2222222 0.03473277
4 0.01111111 5 0.1611111 0.2055556 0.03343418
fancyRpartPlot(UB.trd.prune.rtree) # Plot
visTree(UB.trd.prune.rtree) # Network-based Plot
Caution!
Model evaluation requires the predicted classes/probabilities for the Test Dataset, which are generated with the function predict().
# Generate the predicted classes
test.rtree.class <- predict(UB.trd.prune.rtree,
                            newdata = UB.ted[,-9], # Test Dataset with predictors only
                            type = "class")        # Generate predicted classes
test.rtree.class %>%
as_tibble
# A tibble: 749 × 1
value
<fct>
1 0
2 0
3 0
4 0
5 0
6 0
7 0
8 0
9 0
10 0
# ℹ 739 more rows
CM <- caret::confusionMatrix(test.rtree.class, UB.ted$Personal.Loan,
                             positive = "1") # confusionMatrix(predicted class, actual class, positive = "class of interest")
CM
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 671 14
1 2 62
Accuracy : 0.9786
95% CI : (0.9655, 0.9877)
No Information Rate : 0.8985
P-Value [Acc > NIR] : < 2e-16
Kappa : 0.874
Mcnemar's Test P-Value : 0.00596
Sensitivity : 0.81579
Specificity : 0.99703
Pos Pred Value : 0.96875
Neg Pred Value : 0.97956
Prevalence : 0.10147
Detection Rate : 0.08278
Detection Prevalence : 0.08545
Balanced Accuracy : 0.90641
'Positive' Class : 1
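As a quick hand check (a sketch based on the confusion matrix above: 62 true positives, 14 false negatives, 2 false positives, and 671 true negatives), the key statistics follow directly:
62 / (62 + 14)   # Sensitivity = TP/(TP + FN) = 0.81579
671 / (671 + 2)  # Specificity = TN/(TN + FP) = 0.99703
(671 + 62) / 749 # Accuracy = (TN + TP)/Total = 0.9786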
# Generate the predicted probabilities
test.rtree.prob <- predict(UB.trd.prune.rtree,
                           newdata = UB.ted[,-9], # Test Dataset with predictors only
                           type = "prob")         # Generate predicted probabilities
test.rtree.prob %>%
as_tibble
# A tibble: 749 × 2
`0` `1`
<dbl> <dbl>
1 0.985 0.0154
2 0.985 0.0154
3 0.985 0.0154
4 0.985 0.0154
5 0.985 0.0154
6 0.985 0.0154
7 0.985 0.0154
8 0.985 0.0154
9 0.985 0.0154
10 0.985 0.0154
# ℹ 739 more rows
test.rtree.prob <- test.rtree.prob[,2] # Predicted probability of "Personal.Loan = 1"
ac <- UB.ted$Personal.Loan             # Actual classes of the Test Dataset
pp <- as.numeric(test.rtree.prob)      # Convert the predicted probabilities to numeric
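The objects rtree.roc and auc used below are not created anywhere above; a minimal sketch, assuming they are built with Package "pROC" (roc() takes the actual classes and the predicted probabilities):
pacman::p_load("pROC")
rtree.roc <- roc(ac, pp)        # roc(actual class, predicted probability)
auc <- round(auc(rtree.roc), 3) # Area Under the ROC Curve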
Caution!
The ROC curve produced with Package "pROC" can be customized with a variety of functions.
# Using the function plot.roc()
plot.roc(rtree.roc,
         col = "gray",               # Line color
         print.auc = TRUE,           # Whether to print the AUC
         print.auc.col = "red",      # Color of the AUC text
         print.thres = TRUE,         # Whether to print the cutoff value
         print.thres.pch = 19,       # Shape of the point marking the cutoff value
         print.thres.col = "red",    # Color of the point marking the cutoff value
         auc.polygon = TRUE,         # Whether to shade the area under the curve
         auc.polygon.col = "gray90") # Color of the area under the curve
# Using the function ggroc()
ggroc(rtree.roc) +
annotate(geom = "text", x = 0.9, y = 1.0,
label = paste("AUC = ", auc),
size = 5,
color="red") +
theme_bw()
pacman::p_load("Epi")
# install_version("etm", version = "1.1", repos = "http://cran.us.r-project.org")
ROC(pp, ac, plot = "ROC") # ROC(predicted probability, actual class)
pacman::p_load("ROCR")
rtree.pred <- prediction(pp, ac)                    # prediction(predicted probability, actual class)
rtree.perf <- performance(rtree.pred, "tpr", "fpr") # performance(prediction object, "sensitivity", "1-specificity")
plot(rtree.perf, col = "gray") # ROC Curve
perf.auc <- performance(rtree.pred, "auc") # AUC
auc <- attributes(perf.auc)$y.values
legend("bottomright", legend = auc, bty = "n")
rtree.perf <- performance(rtree.pred, "lift", "rpp") # Lift Chart
plot(rtree.perf, main = "lift curve",
colorize = T, # Coloring according to cutoff
lwd = 2)