Unbalanced Target

Machine Learning

Solution for Unlanced Categorical Target

Yeongeun Jeon , Jeongwook Lee , Jung In Seo
09-15-2020

1. 전 구간에서 랜덤하게 분할

1-1. R function “sample”

sample(x, size, replace = FALSE, prob = NULL)

1-2. 예제

# Partition (Traning Data : Test Data = 7:3)

set.seed(100) 

x <- rnorm(100,0,1)                        # Randomize 100 from normal probability variables with zero mean and one variance
set.seed(100)                              # Fixed seed for the same result


ind <- sample(2, size=length(x), 
              replace=T, prob=c(0.7, 0.3)) # The probability of number 1 being extracted is 0.7, and number 2 being extracted is 0.3 
                                           


trd <- x[ind==1]                           # Extract "x" for location 1 in the ind / Training Data


trd
 [1] -0.50219235  0.13153117 -0.07891709  0.88678481  0.11697127
 [6]  0.31863009  0.71453271 -0.82525943 -0.35986213  0.08988614
[11] -0.20163395  0.73984050 -0.02931671 -0.38885425  0.51085626
[16] -0.91381419  2.31029682 -0.43808998  0.26196129 -0.81437912
[21] -0.43845057 -1.15772946  0.24707599 -0.09111356 -0.13792961
[26] -0.69001432  0.18290768  0.41732329  0.97020202 -0.10162924
[31] -0.52228335  1.32223096  0.04377907 -1.87865588 -0.44706218
[36] -1.73859795  0.17886485  1.89746570 -2.27192549  0.98046414
[41] -1.39882562  1.82487242  1.38129873 -0.83885188 -0.26199577
[46] -0.06884403  2.58195893  0.12983414 -0.71302498  0.63799424
[51]  0.20169159 -0.06991695 -0.09248988  0.44890327 -1.06435567
[56] -1.16241932 -2.06209602  0.01274972 -2.07440475  0.89682227
[61] -0.04999577  0.70958158 -0.15790503  0.81736208  1.72717575
[66]  1.42830143 -0.89295740 -1.15757124  2.44568276 -0.83249580
[71]  0.41351985 -1.17868314
ted <- x[ind==2]                           # Extract "x" for location 2 in the ind출 / Test Data

ted 
 [1] -0.58179068  0.09627446  0.12337950  0.76406062  0.77340460
 [6] -0.72022155  0.23094453  1.75737562 -0.11119350 -0.22179423
[11]  1.06540233  1.40320349 -1.77677563  0.62286739 -0.36344033
[16]  1.31906574 -0.37888356  1.64852175 -1.08752835  0.27053949
[21]  1.00845187 -1.34534931 -1.93121153  0.21636787 -0.10377029
[26] -0.55712229 -0.53029645 -1.17403476

2. 각 구간 안에서 표본 추출

2-1. 예제

set.seed(100) 


x <- rnorm(100,0,1)               


set.seed(100)     


ind <- c(sample(1:50,40), 
         sample(51:length(x),35))  # 40 out of 1 to 50 and 35 out of 51 to 100  randomly selected
trd <- x[ind]                     


trd
 [1] -0.35986213  0.41732329  1.31906574 -0.81437912  0.73984050
 [6]  0.62286739  0.26196129  0.76406062  0.31863009  0.88678481
[11]  1.40320349 -0.11119350 -0.58179068  0.04377907  0.51085626
[16]  0.09627446  0.71453271 -0.22179423 -0.36344033  0.13153117
[21] -0.91381419 -0.10162924 -1.15772946  2.31029682 -0.02931671
[26] -0.43808998  0.18290768 -0.43845057  0.08988614 -0.72022155
[31] -0.09111356  0.97020202 -0.20163395 -0.50219235  0.12337950
[36] -0.13792961 -0.69001432 -1.77677563  0.24707599 -0.82525943
[41] -0.55712229 -1.17403476  1.64852175 -0.89295740 -1.39882562
[46] -0.44706218  1.38129873  0.81736208 -0.09248988  2.58195893
[51] -1.17868314  0.41351985  0.17886485 -0.15790503 -1.08752835
[56] -2.27192549  1.89746570 -0.06884403 -0.06991695 -2.06209602
[61] -0.71302498  1.72717575 -1.06435567  0.20169159  0.44890327
[66] -0.83249580  0.70958158 -1.34534931  0.01274972  1.42830143
[71] -0.83885188  0.12983414  0.27053949 -1.16241932  1.00845187
ted <- x[-ind]                    


ted
 [1] -0.07891709  0.11697127 -0.38885425  0.77340460  0.23094453
 [6]  1.75737562  1.06540233 -0.52228335  1.32223096 -1.87865588
[11] -1.73859795  0.98046414  1.82487242 -0.26199577 -0.37888356
[16]  0.63799424 -2.07440475  0.89682227 -0.04999577 -1.93121153
[21]  0.21636787 -0.10377029 -1.15757124 -0.53029645  2.44568276

3. Original Target 클래스의 비율에 맞게 분할

3-1. R function “createDataPartition”

createDataPartition(y, p = 0.5, list = TRUE, ...)

3-2. 예제

사용될 예제 데이터는 “Universal Bank_Main”로 유니버셜 은행의 고객들에 대한 데이터(출처 : Data Mining for Business Intelligence, Shmueli et al. 2010)이다. 데이터는 총 2500개이며, 변수의 갯수는 13개이다. 여기서 TargetPerson.Loan이다.



3-2-1. 데이터 불러오기

pacman::p_load("data.table", "dplyr",                               # Data processing
               "caret")                                             # For createDataPartition


UB   <- fread(paste(getwd(), "Universal Bank_Main.csv", sep="/"))   # Load Data


glimpse(UB)                                                          # Structure of data
Rows: 2,500
Columns: 14
$ ID                   <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, ~
$ Age                  <int> 25, 45, 39, 35, 35, 37, 53, 50, 35, 34,~
$ Experience           <int> 1, 19, 15, 9, 8, 13, 27, 24, 10, 9, 39,~
$ Income               <int> 49, 34, 11, 100, 45, 29, 72, 22, 81, 18~
$ `ZIP Code`           <int> 91107, 90089, 94720, 94112, 91330, 9212~
$ Family               <int> 4, 3, 1, 1, 4, 4, 2, 1, 3, 1, 4, 3, 2, ~
$ CCAvg                <dbl> 1.6, 1.5, 1.0, 2.7, 1.0, 0.4, 1.5, 0.3,~
$ Education            <int> 1, 1, 1, 2, 2, 2, 2, 3, 2, 3, 3, 2, 3, ~
$ Mortgage             <int> 0, 0, 0, 0, 0, 155, 0, 0, 104, 0, 0, 0,~
$ `Personal Loan`      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ~
$ `Securities Account` <int> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ~
$ `CD Account`         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
$ Online               <int> 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, ~
$ CreditCard           <int> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, ~

3-2-2. Data 전처리

UB <- UB %>%
  data.frame()%>%                                                    # Convert into Data Frame
  mutate(Personal.Loan = ifelse(Personal.Loan==1,"yes","no") ) %>%   # If Personal Loan is 1, then "yes" otherwise "no"
  mutate_at(vars(Personal.Loan), as.factor) %>%                      # Convert Personal Loan into Factor 
  select(-1)                                                         # Remove first column

3-2-3. Original 클래스 비율

y   <- UB$Personal.Loan      # Target   
t.y <- table(y)              # Frequency of Target of Original Data 
t.y  
y
  no  yes 
2244  256 
p.y <- prop.table(t.y)       # Proportion of Target of Original Data 
p.y  
y
    no    yes 
0.8976 0.1024 

3-2-4. Original Target 클래스의 비율에 맞게 분할

set.seed(100)
train_set <- createDataPartition(y, p=0.8, list=T) # p=0.8 : Partition 80% into Tranning data

trd       <- UB[train_set$Resample1,]              # Traning Data


ted       <- UB[-train_set$Resample1,]             # Test Data

t.trd.y <- table(trd$Personal.Loan)                # Frequency of Target of Training Data    


t.trd.y 

  no  yes 
1796  205 
p.trd.y <- prop.table(t.trd.y)                     # Proportion of Target of Training Data 


p.trd.y      

       no       yes 
0.8975512 0.1024488 
detach(package:caret)


4. Unbalanced Target

4-1. upSample

R function “upSample”

upSample(x, y, ...)

예제

pacman::p_load("caret")     # For upSample and downSample

y  <- UB$Personal.Loan      # Target


xs <- UB %>%                # Prediction Variable     
  select(-Personal.Loan)

UB.up <- upSample(xs,y)     # upSample : 적은 쪽의 데이터를 중복 추출하여 균형을 맞춤NAtable(UB.up$Class)

  no  yes 
2244 2244 
# 불균형을 해결한 후 데이터 분할NAy <- UB.up$Class                                   # Target 


train_set <- createDataPartition(y, p=0.8, list=T) # Original Target 비율에 맞게 Training Data를 80% 추출


trd <- UB.up[train_set$Resample1,]                 # Training Data       


ted <- UB.up[-train_set$Resample1,]                # Test Data


t.trd.y <- table(trd$Class)                        
t.trd.y

  no  yes 
1796 1796 
p.trd.y <- prop.table(t.trd.y)
p.trd.y

 no yes 
0.5 0.5 

4-2. downSample

R function “downSample”

downSample(x, y, ...)

예제

y  <- UB$Personal.Loan      # Target


xs <- UB %>%                # 예측변수      
  select(-Personal.Loan)


UB.down <- downSample(xs,y) # downSample : 많은 쪽의 데이터를 적게 추출하여 균형을 맞춤NAtable(UB.down$Class)

 no yes 
256 256 
# 불균형을 해결한 후 데이터 분할NAy <- UB.down$Class                                 # Target


train_set <- createDataPartition(y, p=0.8, list=T) # Original Target 비율에 맞게 Training Data를 80% 추출



trd <- UB.down[train_set$Resample1,]               # Traning Data


ted <- UB.down[-train_set$Resample1,]              # Test Data

t.trd.y <- table(trd$Class)
t.trd.y

 no yes 
205 205 
p.trd.y <- prop.table(t.trd.y)
p.trd.y

 no yes 
0.5 0.5 
detach(package:caret)

4-3. SMOTE

R function “SMOTE”

SMOTE(form, data, k = 5, perc.over = 200, perc.under = 200, ...)

예제

pacman::p_load("DMwR")                            # DMwR for SMOTE

UB.SMOTE <- SMOTE(Personal.Loan~. , data=UB, k=5,
                  perc.over =300, perc.under=200) # SMOTE는 Target의 형태가 문자형만NAtable(UB.SMOTE$Personal.Loan) 

  no  yes 
1536 1024 

4-4. SMOTE + Tomek

R function “ubTomek”

ubTomek(x, y, ...)

예제

pacman::p_load("unbalanced")                            # For ubTomek

input.ST <- UB.SMOTE %>%
  select(-Personal.Loan)                                # Target을 제외한 예측변수들


output.ST <- ifelse(UB.SMOTE$Personal.Loan=="yes",1,0)   # Tomek은 예측변수와 Target의 형태가 수치형!NAset.seed(100)                                           # Tomek은 seed값 지정해줘야 동일한 결과를 얻음  NAUB.ST <- ubTomek(input.ST, output.ST)    
Instances removed 139 : 9.05 % of 0 class ; 5.43 % of training ; Time needed 0.02 
table(UB.ST$Y)

   0    1 
1397 1024 
detach(package:DMwR)
detach(package:unbalanced)

4-5. SMOTE와 SMOTE + Tomek 비교

# SMOTE + Tomek 에 대한 데이터 프레임NAUB.ST1 <- UB.ST$X %>%
  mutate(Personal.Loan=UB.ST$Y) %>%              # Personal.Loan 변수 추가mutate_at(vars(Personal.Loan), as.factor) %>%  # Personal.Loan 변수를 범주형 변수로 변환변환
  data.frame()                                   # data.frame 으로 변환
pacman::p_load("ggplot2", "gridExtra")

## Original data
p1 <- ggplot(UB, aes(x=Income, y=Age, color=Personal.Loan, shape=Personal.Loan))+
  geom_point(show.legend = FALSE) + 
  scale_shape_manual(values=c("N", "Y")) + 
  theme(axis.title = element_text(face = "bold", size = 12)) +
  labs(x="N 2244 : 256 Y", y= "Original Data") +
  theme_bw() 
  

## SMOTE
p2 <- ggplot(UB.SMOTE, aes(x=Income, y=Age, color=Personal.Loan, shape=Personal.Loan))+
  geom_point(show.legend = FALSE) + 
  scale_shape_manual(values=c("N", "Y")) +
  theme(axis.title = element_text(face = "bold", size = 12)) +
  labs(x="N 1536 : 1024 Y", y= "UB.SMOTE") +
  theme_bw()


## SMOTE + Tomek 
p3 <- ggplot(UB.ST1, aes(x=Income, y=Age, color=Personal.Loan, shape=Personal.Loan))+
  geom_point(show.legend = FALSE) + 
  scale_shape_manual(values=c("N", "Y")) + 
  theme(axis.title = element_text(face = "bold", size = 12)) + 
  labs(x="N 1397 : 1024 Y", y= "UB.ST") +
  theme_bw()

grid.arrange(p1, p2, p3, ncol=3)        # p1, p2, p3 그래프를 한꺼번에 보기 보기
detach(package:ggplot2)
detach(package:gridExtra)

Reuse

Text and figures are licensed under Creative Commons Attribution CC BY 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".