Logistic Regression의 장점

연속형 예측 변수와 범주형 예측 변수 모두 다룰 수 있다.
해석 가능한 모형이다.
예측 변수에 대해 정규분포 가정이 필요없다.

Logistic Regression의 단점

클래스가 완전히 분리되어 있는 경우에는 작동하지 않는다.
클래스에 대해 선형 분리를 가정하기 때문에 선형 분리가 불가능한 클래스 문제에는 성능이 좋지 않다.
각 예측 변수와 로그 오즈 간에 선형 관계를 가정하므로 어떤 예측 변수의 낮은 값과 높은 값이 동일한 클래스에 속한다면 중간 정도에 있는 값도 동일한 클래스에 속해야 한다.

실습 자료 : 유니버셜 은행의 고객 2,500명에 대한 자료(출처 : Data Mining for Business Intelligence, Shmueli et al. 2010)이며, 총 13개의 변수를 포함하고 있다. 이 자료에서 Target은 Personal Loan이다.

1. 데이터 불러오기

pacman::p_load("data.table", 
               "tidyverse", 
               "dplyr",
               "ggplot2", "GGally",
               "caret")

UB <- fread("../Universal Bank_Main.csv")                               # 데이터 불러오기

UB %>%
  as_tibble

# A tibble: 2,500 × 14
      ID   Age Experience Income `ZIP Code` Family CCAvg Education
   <int> <int>      <int>  <int>      <int>  <int> <dbl>     <int>
 1     1    25          1     49      91107      4   1.6         1
 2     2    45         19     34      90089      3   1.5         1
 3     3    39         15     11      94720      1   1           1
 4     4    35          9    100      94112      1   2.7         2
 5     5    35          8     45      91330      4   1           2
 6     6    37         13     29      92121      4   0.4         2
 7     7    53         27     72      91711      2   1.5         2
 8     8    50         24     22      93943      1   0.3         3
 9     9    35         10     81      90089      3   0.6         2
10    10    34          9    180      93023      1   8.9         3
# ℹ 2,490 more rows
# ℹ 6 more variables: Mortgage <int>, `Personal Loan` <int>,
#   `Securities Account` <int>, `CD Account` <int>, Online <int>,
#   CreditCard <int>

2. 데이터 전처리 I

UB %<>%
  data.frame() %>%                                                      # Data Frame 형태로 변환 
  mutate(Personal.Loan = ifelse(Personal.Loan == 1, "yes", "no")) %>%   # Target을 문자형 변수로 변환
  select(-1)                                                            # ID 변수 제거

# Convert to Factor
fac.col <- c("Family", "Education", "Securities.Account", 
             "CD.Account", "Online", "CreditCard",
             # Target
             "Personal.Loan")

UB <- UB %>% 
  mutate_at(fac.col, as.factor)                                         # 범주형으로 변환

glimpse(UB)                                                             # 데이터 구조 확인

Rows: 2,500
Columns: 13
$ Age                <int> 25, 45, 39, 35, 35, 37, 53, 50, 35, 34, 6…
$ Experience         <int> 1, 19, 15, 9, 8, 13, 27, 24, 10, 9, 39, 5…
$ Income             <int> 49, 34, 11, 100, 45, 29, 72, 22, 81, 180,…
$ ZIP.Code           <int> 91107, 90089, 94720, 94112, 91330, 92121,…
$ Family             <fct> 4, 3, 1, 1, 4, 4, 2, 1, 3, 1, 4, 3, 2, 4,…
$ CCAvg              <dbl> 1.6, 1.5, 1.0, 2.7, 1.0, 0.4, 1.5, 0.3, 0…
$ Education          <fct> 1, 1, 1, 2, 2, 2, 2, 3, 2, 3, 3, 2, 3, 2,…
$ Mortgage           <int> 0, 0, 0, 0, 0, 155, 0, 0, 104, 0, 0, 0, 0…
$ Personal.Loan      <fct> no, no, no, no, no, no, no, no, no, yes, …
$ Securities.Account <fct> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,…
$ CD.Account         <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ Online             <fct> 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1,…
$ CreditCard         <fct> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,…

3. 데이터 탐색

ggpairs(UB,                                             
        columns = c("Age", "Experience", "Income",        # 수치형 예측 변수
                    "ZIP.Code", "CCAvg", "Mortgage"),                            
        aes(colour = Personal.Loan)) +                    # Target의 범주에 따라 색깔을 다르게 표현
  theme_bw()

ggpairs(UB,                                               
        columns = c("Age", "Experience", "Income",        # 수치형 예측 변수
                    "ZIP.Code", "CCAvg", "Mortgage"), 
        aes(colour = Personal.Loan)) +                    # Target의 범주에 따라 색깔을 다르게 표현
  scale_colour_manual(values = c("#00798c", "#d1495b")) + # 특정 색깔 지정
  scale_fill_manual(values = c("#00798c", "#d1495b")) +   # 특정 색깔 지정
  theme_bw()

ggpairs(UB,                                               
        columns = c("Age", "Income",                      # 수치형 예측 변수
                    "Family", "Education"),               # 범주형 예측 변수
        aes(colour = Personal.Loan, alpha = 0.8)) +       # Target의 범주에 따라 색깔을 다르게 표현
  scale_colour_manual(values = c("#E69F00", "#56B4E9")) + # 특정 색깔 지정
  scale_fill_manual(values = c("#E69F00", "#56B4E9")) +   # 특정 색깔 지정
  theme_bw()

4. 데이터 분할

# Partition (Training Dataset : Test Dataset = 7:3)
y      <- UB$Personal.Loan                               # Target

set.seed(200)
ind    <- createDataPartition(y, p = 0.7, list = T)      # Index를 이용하여 7:3으로 분할
UB.trd <- UB[ind$Resample1,]                             # Training Dataset
UB.ted <- UB[-ind$Resample1,]                            # Test Dataset

5. 데이터 전처리 II

# Standardization
preProcValues <- preProcess(UB.trd, 
                            method = c("center", "scale"))  # Standardization 정의 -> Training Dataset에 대한 평균과 표준편차 계산 

UB.trd <- predict(preProcValues, UB.trd)                    # Standardization for Training Dataset
UB.ted <- predict(preProcValues, UB.ted)                    # Standardization for Test Dataset

glimpse(UB.trd)                                             # 데이터 구조 확인

Rows: 1,751
Columns: 13
$ Age                <dbl> -0.05431273, -0.57446728, -0.92123699, -0…
$ Experience         <dbl> -0.12175295, -0.46882565, -0.98943471, -1…
$ Income             <dbl> -0.85867297, -1.35649686, 0.56986515, -0.…
$ ZIP.Code           <dbl> -1.75250883, 0.88354520, 0.53745994, -1.0…
$ Family             <fct> 3, 1, 1, 4, 2, 1, 3, 1, 4, 3, 2, 4, 1, 4,…
$ CCAvg              <dbl> -0.25119120, -0.53150921, 0.42157204, -0.…
$ Education          <fct> 1, 1, 2, 2, 2, 3, 2, 3, 3, 2, 3, 2, 1, 3,…
$ Mortgage           <dbl> -0.5664192, -0.5664192, -0.5664192, -0.56…
$ Personal.Loan      <fct> no, no, no, no, no, no, no, yes, no, no, …
$ Securities.Account <fct> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,…
$ CD.Account         <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ Online             <fct> 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0,…
$ CreditCard         <fct> 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,…

glimpse(UB.ted)                                             # 데이터 구조 확인

Rows: 749
Columns: 13
$ Age                <dbl> -1.7881612, -0.7478521, 1.2460737, 0.8126…
$ Experience         <dbl> -1.68358012, -0.64236200, 0.83269699, 0.6…
$ Income             <dbl> -0.53400522, -0.96689556, -1.11840718, -1…
$ ZIP.Code           <dbl> -1.17304370, -0.59585545, 1.07366441, 0.8…
$ Family             <fct> 4, 4, 1, 1, 4, 1, 4, 4, 3, 3, 1, 3, 1, 4,…
$ CCAvg              <dbl> -0.19512759, -0.86789083, -0.25119120, -0…
$ Education          <fct> 1, 2, 3, 2, 2, 3, 3, 3, 3, 2, 1, 2, 1, 2,…
$ Mortgage           <dbl> -0.5664192, 0.9609885, -0.5664192, -0.566…
$ Personal.Loan      <fct> no, no, no, no, no, no, no, no, no, no, n…
$ Securities.Account <fct> 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0,…
$ CD.Account         <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,…
$ Online             <fct> 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,…
$ CreditCard         <fct> 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,…

6. 모형 훈련

Caution! 함수 glm에서 Logistic Regression은 Target이 2개의 클래스를 가질 때 “두 번째 클래스”에 속할 확률을 모델링하며, 범주형 예측 변수의 경우 더미 변환을 자동적으로 수행한다. 여기서, “두 번째 클래스”란 “Factor” 변환하였을 때 두 번째 수준(Level)을 의미한다. 예를 들어, “a”와 “b” 2개의 클래스를 가진 Target을 “Factor” 변환하였을 때 수준이 “a” “b”라면, 첫 번째 클래스는 “a”, 두 번째 클래스는 “b”가 된다.

logis.fit <- glm(Personal.Loan ~ . , data = UB.trd,
    family = "binomial")                               # For Logit Transformation

logis.fit                                              # Fitted Logistic Regression


Call:  glm(formula = Personal.Loan ~ ., family = "binomial", data = UB.trd)

Coefficients:
        (Intercept)                  Age           Experience  
           -7.09112              0.49203             -0.34968  
             Income             ZIP.Code              Family2  
            2.63841              0.07646             -0.36199  
            Family3              Family4                CCAvg  
            2.02072              1.68889              0.41075  
         Education2           Education3             Mortgage  
            3.40185              3.46841              0.05690  
Securities.Account1          CD.Account1              Online1  
           -1.39295              4.42317             -0.36693  
        CreditCard1  
           -1.32649  

Degrees of Freedom: 1750 Total (i.e. Null);  1735 Residual
Null Deviance:      1160 
Residual Deviance: 400.8    AIC: 432.8

summary(logis.fit)                                     # Summary for Fitted Logistic Regression


Call:
glm(formula = Personal.Loan ~ ., family = "binomial", data = UB.trd)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.6854  -0.1834  -0.0739  -0.0227   4.0543  

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)         -7.09112    0.57141 -12.410  < 2e-16 ***
Age                  0.49203    1.29516   0.380 0.704018    
Experience          -0.34968    1.28719  -0.272 0.785884    
Income               2.63841    0.22575  11.687  < 2e-16 ***
ZIP.Code             0.07646    0.13144   0.582 0.560766    
Family2             -0.36199    0.40989  -0.883 0.377160    
Family3              2.02072    0.43698   4.624 3.76e-06 ***
Family4              1.68889    0.39404   4.286 1.82e-05 ***
CCAvg                0.41075    0.14253   2.882 0.003955 ** 
Education2           3.40185    0.43218   7.871 3.51e-15 ***
Education3           3.46841    0.44956   7.715 1.21e-14 ***
Mortgage             0.05690    0.10230   0.556 0.578032    
Securities.Account1 -1.39295    0.54179  -2.571 0.010140 *  
CD.Account1          4.42317    0.60476   7.314 2.60e-13 ***
Online1             -0.36693    0.28752  -1.276 0.201883    
CreditCard1         -1.32649    0.38234  -3.469 0.000522 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1159.8  on 1750  degrees of freedom
Residual deviance:  400.8  on 1735  degrees of freedom
AIC: 432.8

Number of Fisher Scoring iterations: 8

Result! 데이터 “UB.trd”의 Target “Personal.Loan”은 “no”와 “yes” 2개의 클래스를 가지며, “Factor” 변환하면 알파벳순으로 수준을 부여하기 때문에 “yes”가 두 번째 클래스가 된다. 즉, “yes”에 속할 확률(= 개인 대출 제의를 수락할 확률)을 \(p\)라고 할 때, 추정된 회귀계수를 이용하여 다음과 같은 모형식을 얻을 수 있다. \[ \begin{align*} \log{\frac{p}{1-p}} = &-7.091 + 0.492 Z_{\text{Age}} - 0.350 Z_{\text{Experience}} + 2.638 Z_{\text{Income}} \\ &+0.076 Z_{\text{ZIP.Code}} - 0.362 X_{\text{Family2}} + 2.021 X_{\text{Family3}} + 1.689 X_{\text{Family4}} \\ &+ 0.411 Z_{\text{CCAvg}} + 3.402 X_{\text{Education2}} + 3.468 X_{\text{Education3}} + 0.057 Z_{\text{Mortgage}} \\ &- 1.393 X_{\text{Securities.Account1}} + 4.423 X_{\text{CD.Account1}} - 0.367 X_{\text{Online1}} - 1.326 X_{\text{CreditCard1}} \end{align*} \] 여기서, \(Z_{\text{예측 변수}}\)는 표준화한 예측 변수, \(X_{\text{예측 변수}}\)는 더미 변수를 의미한다.
범주형 예측 변수(“Family”, “Education”, “Securities.Account”, “CD.Account”, “Online”, “CreditCard”)는 더미 변환이 수행되었는데, 예를 들어, \(X_{\text{Family2}}\)는 가족 수가 2명인 경우 “1”값을 가지고 2명이 아니면 “0”값을 가진다.

OR <- exp(coef(logis.fit))                             # Odds Ratio
CI <- exp(confint(logis.fit))                          # 95% Confidence Interval

cbind("Odds Ratio" = round(OR, 3),                     # round : 반올림
      round(CI, 3))

                    Odds Ratio  2.5 %  97.5 %
(Intercept)              0.001  0.000   0.002
Age                      1.636  0.117  19.060
Experience               0.705  0.061   9.732
Income                  13.991  9.208  22.367
ZIP.Code                 1.079  0.836   1.401
Family2                  0.696  0.308   1.547
Family3                  7.544  3.264  18.223
Family4                  5.413  2.548  12.001
CCAvg                    1.508  1.144   2.004
Education2              30.019 13.329  72.877
Education3              32.086 13.763  80.599
Mortgage                 1.059  0.865   1.292
Securities.Account1      0.248  0.080   0.673
CD.Account1             83.360 26.681 287.459
Online1                  0.693  0.394   1.219
CreditCard1              0.265  0.121   0.544

Result! 오즈비를 살펴보면, 나이(“Age”)를 표준화한 값이 1 증가할 경우, 개인 대출 제의를 수락할 가능성이 1.636배 증가한다. 반면, 경력(“Experience”)을 표준화한 값이 1 증가할 경우, 개인 대출 제의를 수락할 가능성이 1.418(=1/0.705)배 감소한다.

7. 모형 평가

Caution! 모형 평가를 위해 Test Dataset에 대한 예측 class/확률 이 필요하며, 함수 predict()를 이용하여 생성한다.

# 예측 확률 생성
test.logis.prob <- predict(logis.fit, 
                           newdata = UB.ted[,-9],         # Test Dataset including Only 예측 변수       
                           type = "response")             # 예측 확률 생성 

test.logis.prob %>%                                       # "Personal.Loan = yes"에 대한 예측 확률
  as_tibble

# A tibble: 749 × 1
       value
       <dbl>
 1 0.000167 
 2 0.00446  
 3 0.000336 
 4 0.0000715
 5 0.00548  
 6 0.00143  
 7 0.0211   
 8 0.104    
 9 0.0827   
10 0.0175   
# ℹ 739 more rows

# 예측 class 생성
logis.pred <- ifelse(test.logis.prob > 0.5, "yes", "no") %>%  # "Personal.Loan = yes"에 대한 예측 확률이 0.5 초과하면 "yes", 0.5를 넘기지 못하면 "no"로 분류
  factor                                                      # 범주형으로 변환
          
logis.pred %>%                                      
  as_tibble

# A tibble: 749 × 1
   value
   <fct>
 1 no   
 2 no   
 3 no   
 4 no   
 5 no   
 6 no   
 7 no   
 8 no   
 9 no   
10 no   
# ℹ 739 more rows

7-1. ConfusionMatrix

CM   <- caret::confusionMatrix(logis.pred, UB.ted$Personal.Loan, 
                               positive = "yes")        # confusionMatrix(예측 class, 실제 class, positive = "관심 class")
CM

Confusion Matrix and Statistics

          Reference
Prediction  no yes
       no  667  21
       yes   6  55
                                         
               Accuracy : 0.964          
                 95% CI : (0.948, 0.9761)
    No Information Rate : 0.8985         
    P-Value [Acc > NIR] : 1.447e-11      
                                         
                  Kappa : 0.7833         
                                         
 Mcnemar's Test P-Value : 0.007054       
                                         
            Sensitivity : 0.72368        
            Specificity : 0.99108        
         Pos Pred Value : 0.90164        
         Neg Pred Value : 0.96948        
             Prevalence : 0.10147        
         Detection Rate : 0.07343        
   Detection Prevalence : 0.08144        
      Balanced Accuracy : 0.85738        
                                         
       'Positive' Class : yes

7-2. ROC 곡선

ac  <- UB.ted$Personal.Loan                               # Test Dataset의 실제 class 
pp  <- as.numeric(test.logis.prob)                        # 예측 확률을 수치형으로 변환

1) Package “pROC”

pacman::p_load("pROC")

logis.roc  <- roc(ac, pp, plot = T, col = "gray")         # roc(실제 class, 예측 확률)
auc        <- round(auc(logis.roc), 3)
legend("bottomright", legend = auc, bty = "n")

Caution! Package "pROC"를 통해 출력한 ROC 곡선은 다양한 함수를 이용해서 그래프를 수정할 수 있다.

# 함수 plot.roc() 이용
plot.roc(logis.roc,   
         col="gray",                                      # Line Color
         print.auc = TRUE,                                # AUC 출력 여부
         print.auc.col = "red",                           # AUC 글씨 색깔
         print.thres = TRUE,                              # Cutoff Value 출력 여부
         print.thres.pch = 19,                            # Cutoff Value를 표시하는 도형 모양
         print.thres.col = "red",                         # Cutoff Value를 표시하는 도형의 색깔
         auc.polygon = TRUE,                              # 곡선 아래 면적에 대한 여부
         auc.polygon.col = "gray90")                      # 곡선 아래 면적의 색깔

# 함수 ggroc() 이용
ggroc(logis.roc) +
annotate(geom = "text", x = 0.9, y = 1.0,
label = paste("AUC = ", auc),
size = 5,
color="red") +
theme_bw()

2) Package “Epi”

pacman::p_load("Epi")       
# install_version("etm", version = "1.1", repos = "http://cran.us.r-project.org")

ROC(pp, ac, plot = "ROC")                                  # ROC(예측 확률, 실제 class)

3) Package “ROCR”

pacman::p_load("ROCR")

logis.pred <- prediction(pp, ac)                          # prediction(예측 확률, 실제 class)    

logis.perf <- performance(logis.pred, "tpr", "fpr")       # performance(, "민감도", "1-특이도")                      
plot(logis.perf, col = "gray")                            # ROC Curve

perf.auc   <- performance(logis.pred, "auc")              # AUC
auc        <- attributes(perf.auc)$y.values 
legend("bottomright", legend = auc, bty = "n")

7-3. 향상 차트

1) Package “ROCR”

logis.pred <- performance(logis.pred, "lift", "rpp")      # Lift Chart
plot(logis.pred, main = "lift curve", 
     colorize = T,                                        # Coloring according to cutoff
     lwd = 2)

Logistic Regression using glm