Naive Bayes Classification

Data Mining

Description for Naive Bayes Classification

Yeongeun Jeon , Jung In Seo
2023-03-23

Naive Bayes의 장점


Naive Bayes의 단점


실습 자료 : 2004년 1월 한 달간 워싱턴 D.C.에서 출발하여 뉴욕에 도착한 2,201대의 항공편에 대한 자료(출처 : www.transtats.bts.gov)이며, 총 13개의 변수를 포함하고 있다. 13개의 변수 중 분석을 위해 사용할 변수는 CRS_DEP_TIME, CARRIER, DEST, ORIGIN, DAY_WEEK, Flight Status이며, TargetFlight Status이다.




1. 데이터 불러오기

pacman::p_load("data.table",
               "dplyr",  "tidyr", "magrittr",
               "ggplot2",
               "e1071",                                         # For naiveBayes
               "caret")                                         # For confusionMatrix    
                                              

delays.df <- fread("../FlightDelays.csv")                       # 데이터 불러오기


delays.df %>%
  as_tibble
# A tibble: 2,201 × 13
   CRS_DEP_TIME CARRIER DEP_TIME DEST  DISTANCE FL_DATE  FL_NUM ORIGIN
          <int> <chr>      <int> <chr>    <int> <chr>     <int> <chr> 
 1         1455 OH          1455 JFK        184 01/01/2…   5935 BWI   
 2         1640 DH          1640 JFK        213 01/01/2…   6155 DCA   
 3         1245 DH          1245 LGA        229 01/01/2…   7208 IAD   
 4         1715 DH          1709 LGA        229 01/01/2…   7215 IAD   
 5         1039 DH          1035 LGA        229 01/01/2…   7792 IAD   
 6          840 DH           839 JFK        228 01/01/2…   7800 IAD   
 7         1240 DH          1243 JFK        228 01/01/2…   7806 IAD   
 8         1645 DH          1644 JFK        228 01/01/2…   7810 IAD   
 9         1715 DH          1710 JFK        228 01/01/2…   7812 IAD   
10         2120 DH          2129 JFK        228 01/01/2…   7814 IAD   
# ℹ 2,191 more rows
# ℹ 5 more variables: Weather <int>, DAY_WEEK <int>,
#   DAY_OF_MONTH <int>, TAIL_NUM <chr>, `Flight Status` <chr>

2. 데이터 전처리

# 1. 출발시간(CRS_DEP_TIME)을 시간 단위(hourly)로 변환
# 예) 출발시간이 오전 6시와 오전 7시 사이라면, CRS_DEP_TIME = 6
delays.df %<>%
  data.frame() %>%                                                      # Data Frame 형태로 변환
  mutate(CRS_DEP_TIME = floor( CRS_DEP_TIME/100 ))                      

# 2. Convert to Factor
selected.var <- c("CRS_DEP_TIME", "CARRIER", "DEST",                    # 범주형 변수 선택
                  "ORIGIN", "DAY_WEEK", "Flight.Status")

delays.df %<>%
  mutate_at(selected.var, as.factor)                                    # 범주형으로 변환

# 3. Select Variables used for Analysis
delays.df1 <- delays.df %>%
  select(selected.var)                                                  # 분석에 사용할 변수만 선택

delays.df1 %>%
  as_tibble
# A tibble: 2,201 × 6
   CRS_DEP_TIME CARRIER DEST  ORIGIN DAY_WEEK Flight.Status
   <fct>        <fct>   <fct> <fct>  <fct>    <fct>        
 1 14           OH      JFK   BWI    4        ontime       
 2 16           DH      JFK   DCA    4        ontime       
 3 12           DH      LGA   IAD    4        ontime       
 4 17           DH      LGA   IAD    4        ontime       
 5 10           DH      LGA   IAD    4        ontime       
 6 8            DH      JFK   IAD    4        ontime       
 7 12           DH      JFK   IAD    4        ontime       
 8 16           DH      JFK   IAD    4        ontime       
 9 17           DH      JFK   IAD    4        ontime       
10 21           DH      JFK   IAD    4        ontime       
# ℹ 2,191 more rows
glimpse(delays.df1)                                                     # 데이터 구조 확인  
Rows: 2,201
Columns: 6
$ CRS_DEP_TIME  <fct> 14, 16, 12, 17, 10, 8, 12, 16, 17, 21, 21, 14,…
$ CARRIER       <fct> OH, DH, DH, DH, DH, DH, DH, DH, DH, DH, DH, DL…
$ DEST          <fct> JFK, JFK, LGA, LGA, LGA, JFK, JFK, JFK, JFK, J…
$ ORIGIN        <fct> BWI, DCA, IAD, IAD, IAD, IAD, IAD, IAD, IAD, I…
$ DAY_WEEK      <fct> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4…
$ Flight.Status <fct> ontime, ontime, ontime, ontime, ontime, ontime…

3. 데이터 탐색

# 데이터 구조 변환
visual.df <- pivot_longer(delays.df1,
                          cols = -Flight.Status,          # Target 제외
                          names_to = "Variables",         # 변수 이름이 입력될 새로운 열 이름
                          values_to = "values"            # 변수에 입력된 값에 대한 새로운 열 이름
                          )

visual.df
# A tibble: 11,005 × 3
   Flight.Status Variables    values
   <fct>         <chr>        <fct> 
 1 ontime        CRS_DEP_TIME 14    
 2 ontime        CARRIER      OH    
 3 ontime        DEST         JFK   
 4 ontime        ORIGIN       BWI   
 5 ontime        DAY_WEEK     4     
 6 ontime        CRS_DEP_TIME 16    
 7 ontime        CARRIER      DH    
 8 ontime        DEST         JFK   
 9 ontime        ORIGIN       DCA   
10 ontime        DAY_WEEK     4     
# ℹ 10,995 more rows
ggplot(visual.df, aes(values, fill = Flight.Status)) +
  facet_wrap(~ Variables, scales = "free") +
  geom_bar(position = "dodge") +
  scale_fill_manual(values=c("#E69F00", "#56B4E9")) +     # 막대 색깔 지정
  theme_bw()
ggplot(visual.df, aes(values, fill = Flight.Status)) +
  facet_wrap(~ Variables, scales = "free") +
  geom_bar(position = "fill") +
  scale_fill_manual(values=c("#E69F00", "#56B4E9")) +     # 막대 색깔 지정
  theme_bw()
ggplot(visual.df, aes(values, fill = Flight.Status)) +
  facet_wrap(~ Variables, scales = "free") +
  geom_bar(position = "stack") +
  scale_fill_manual(values=c("#E69F00", "#56B4E9")) +     # 막대 색깔 지정
  theme_bw()


4. 데이터 분할

# Partition (Training Dataset : Test Dataset = 7:3)
y <- delays.df1$Flight.Status                                           # Target

set.seed(200)
ind         <- createDataPartition(y, p = 0.7, list = T)                # Index를 이용하여 7:3으로 분할
delays.trd  <- delays.df1[ind$Resample1, ]                              # Training Dataset
delays.ted  <- delays.df1[-ind$Resample1, ]                             # Test Dataset

5. 모형 훈련

함수 naiveBayes를 이용하여 Naive Bayes를 수행할 수 있다. 함수에서 사용할 수 있는 자세한 옵션은 여기를 참고한다.

naiveBayes(formula, data, laplace = 0, ...)

delays.nb <- naiveBayes(Flight.Status ~ .,                              # Target ~ 예측 변수
                        data = delays.trd)
delays.nb

Naive Bayes Classifier for Discrete Predictors

Call:
naiveBayes.default(x = X, y = Y, laplace = laplace)

A-priori probabilities:
Y
  delayed    ontime 
0.1945525 0.8054475 

Conditional probabilities:
         CRS_DEP_TIME
Y                  6          7          8          9         10
  delayed 0.02666667 0.04666667 0.07000000 0.02666667 0.01666667
  ontime  0.06682770 0.06521739 0.06924316 0.06199678 0.04991948
         CRS_DEP_TIME
Y                 11         12         13         14         15
  delayed 0.01666667 0.04333333 0.05333333 0.17000000 0.07000000
  ontime  0.03623188 0.06280193 0.07568438 0.09581320 0.06682770
         CRS_DEP_TIME
Y                 16         17         18         19         20
  delayed 0.08333333 0.15666667 0.03000000 0.07666667 0.02333333
  ontime  0.08132045 0.10064412 0.04508857 0.04186795 0.02415459
         CRS_DEP_TIME
Y                 21
  delayed 0.09000000
  ontime  0.05636071

         CARRIER
Y                 CO         DH         DL         MQ         OH
  delayed 0.07666667 0.32000000 0.11666667 0.18000000 0.01333333
  ontime  0.03542673 0.23349436 0.17632850 0.12962963 0.01529791
         CARRIER
Y                 RU         UA         US
  delayed 0.19333333 0.01333333 0.08666667
  ontime  0.18115942 0.01368760 0.21497585

         DEST
Y               EWR       JFK       LGA
  delayed 0.3766667 0.2100000 0.4133333
  ontime  0.2882448 0.1723027 0.5394525

         ORIGIN
Y                BWI        DCA        IAD
  delayed 0.07666667 0.52666667 0.39666667
  ontime  0.06280193 0.64492754 0.29227053

         DAY_WEEK
Y                 1         2         3         4         5         6
  delayed 0.1900000 0.1500000 0.1166667 0.1466667 0.1766667 0.0600000
  ontime  0.1264090 0.1336554 0.1545894 0.1876006 0.1626409 0.1336554
         DAY_WEEK
Y                 7
  delayed 0.1600000
  ontime  0.1014493

Result! 첫 번째 Table A-priori probabilities는 Target \(Y\)의 각 class에 대한 사전확률 \(P(Y=y)\)이고, 두 번째 Table Conditional probabilities는 예측 변수 \(X\)가 범주형일 때는 \(P(X|Y)\)를 나타내며, 연속형일 때는 평균과 표준편차를 나타낸다.


6. 모형 평가

Caution! 모형 평가를 위해 Test Dataset에 대한 예측 class/확률 이 필요하며, 함수 predict()를 이용하여 생성한다.

# 예측 class 생성 
pred.class <- predict(delays.nb,
                      newdata = delays.ted[,-6])         # 함수 predict에서는 Target 제외

# 예측 확률( 사후확률 P(Y|X) ) 생성
pred.prob  <- predict(delays.nb, 
                      newdata = delays.ted[,-6],         # 함수 predict에서는 Target 제외
                      type = "raw")                      
 
# Test Dataset의 실제값과 예측 결과
df <- data.frame(actual = delays.ted$Flight.Status, 
                 pred.class = pred.class, pred.prob)
df %>%
  as_tibble
# A tibble: 659 × 4
   actual pred.class delayed ontime
   <fct>  <fct>        <dbl>  <dbl>
 1 ontime ontime      0.303   0.697
 2 ontime ontime      0.302   0.698
 3 ontime ontime      0.0325  0.967
 4 ontime ontime      0.122   0.878
 5 ontime ontime      0.0702  0.930
 6 ontime ontime      0.162   0.838
 7 ontime ontime      0.0690  0.931
 8 ontime ontime      0.0969  0.903
 9 ontime ontime      0.187   0.813
10 ontime ontime      0.241   0.759
# ℹ 649 more rows
# 일요일 오전 10시와 오전 11시 사이에 DCA (레이건 국립공항)에서 LGA (라 과디아 공항)로 가는 DL (델타 항공)의 연착 여부 예측 결과
df[delays.ted$DAY_WEEK == 7 & delays.ted$CRS_DEP_TIME == 10 & delays.ted$ORIGIN == "DCA" &   
   delays.ted$DEST == "LGA" & delays.ted$CARRIER == "DL",]
    actual pred.class    delayed    ontime
215 ontime     ontime 0.05002181 0.9499782

6-1. ConfusionMatrix

CM <- caret::confusionMatrix(pred.class, delays.ted$Flight.Status,
                             positive = "delayed")            # confusionMatrix(예측 class, 실제 class, positive = "관심 class")
CM
Confusion Matrix and Statistics

          Reference
Prediction delayed ontime
   delayed      12     21
   ontime      116    510
                                          
               Accuracy : 0.7921          
                 95% CI : (0.7591, 0.8225)
    No Information Rate : 0.8058          
    P-Value [Acc > NIR] : 0.8256          
                                          
                  Kappa : 0.0755          
                                          
 Mcnemar's Test P-Value : 9.671e-16       
                                          
            Sensitivity : 0.09375         
            Specificity : 0.96045         
         Pos Pred Value : 0.36364         
         Neg Pred Value : 0.81470         
             Prevalence : 0.19423         
         Detection Rate : 0.01821         
   Detection Prevalence : 0.05008         
      Balanced Accuracy : 0.52710         
                                          
       'Positive' Class : delayed         
                                          



6-2. ROC 곡선

# 예측 확률( 사후확률 P(Y|X) ) 
pred.prob %>%
  as_tibble
# A tibble: 659 × 2
   delayed ontime
     <dbl>  <dbl>
 1  0.303   0.697
 2  0.302   0.698
 3  0.0325  0.967
 4  0.122   0.878
 5  0.0702  0.930
 6  0.162   0.838
 7  0.0690  0.931
 8  0.0969  0.903
 9  0.187   0.813
10  0.241   0.759
# ℹ 649 more rows
delayed.pred.prob <- pred.prob[,1]                            # "Flight.Status = delayed"에 대한 예측 확률

ac <- ifelse(delays.ted$Flight.Status == "delayed", 1, 0)     # 실제 class를 수치형으로 변환               
pp <- as.numeric(delayed.pred.prob)                           # 예측 확률을 수치형으로 변환

1) Package “pROC”

pacman::p_load("pROC")

nb.roc  <- roc(ac, pp, plot = T, col = "gray")                # roc(실제 class, 예측 확률)
auc     <- round(auc(nb.roc), 3)
legend("bottomright", legend = auc, bty = "n")

Caution! Package "pROC"를 통해 출력한 ROC 곡선은 다양한 함수를 이용해서 그래프를 수정할 수 있다.

# 함수 plot.roc() 이용
plot.roc(nb.roc,   
         col="gray",                                          # Line Color
         print.auc = TRUE,                                    # AUC 출력 여부
         print.auc.col = "red",                               # AUC 글씨 색깔
         print.thres = TRUE,                                  # Cutoff Value 출력 여부
         print.thres.pch = 19,                                # Cutoff Value를 표시하는 도형 모양
         print.thres.col = "red",                             # Cutoff Value를 표시하는 도형의 색깔
         auc.polygon = TRUE,                                  # 곡선 아래 면적에 대한 여부
         auc.polygon.col = "gray90")                          # 곡선 아래 면적의 색깔

# 함수 ggroc() 이용
ggroc(nb.roc) +
annotate(geom = "text", x = 0.9, y = 1.0,
label = paste("AUC = ", auc),
size = 5,
color="red") +
theme_bw()

2) Package “Epi”

pacman::p_load("Epi")       
# install_version("etm", version = "1.1", repos = "http://cran.us.r-project.org")

ROC(pp, ac, plot = "ROC")                                     # ROC(예측 확률, 실제 class)  

3) Package “ROCR”

pacman::p_load("ROCR")

pred <- prediction(pp, ac)                                    # prediction(예측 확률, 실제 class)  

perf <- performance(pred, "tpr", "fpr")                       # performance(, "민감도", "1-특이도")                      
plot(perf, col = "gray")                                      # ROC Curve

perf.auc   <- performance(pred, "auc")                        # AUC
auc        <- attributes(perf.auc)$y.values 
legend("bottomright", legend = auc, bty = "n")


6-3. 향상 차트

1) Package “ROCR”

# 향상 차트(Curve)
perf <- performance(pred, "lift", "rpp")       # Lift Chart
plot(perf, main = "lift curve",
     colorize = T,                             # Coloring according to cutoff
     lwd = 2)

2) Package “gains”

# 향상 차트(십분위)
pacman::p_load("gains")

gain <- gains(ac, pp)                           # gains(실제 class, 예측 확률)

barplot(gain$mean.resp / mean(ac), 
        names.arg = gain$depth, 
        xlab = "Percentile", 
        ylab = "Mean Response", 
        main = "Decile-wise lift chart")

Reuse

Text and figures are licensed under Creative Commons Attribution CC BY 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".