Comparison Analysis

Text Mining

Description for Comparison Analysis

Yeongeun Jeon , Jung In Seo
2023-10-20

패키지 설치

pacman::p_load("readr",
               "dplyr", "tidyr",
               "stringr",
               "tidytext",
               "KoNLP",
               "ggplot2")

1. 단어 빈도 비교


# ๋ฐ์ดํ„ฐ ๋ถˆ๋Ÿฌ์˜ค๊ธฐ

# 1. Load former President Moon Jae-in's speech, one element per line of text
raw_moon <- readLines(".../speech_moon.txt", encoding = "UTF-8")

# as_tibble() stores the lines in a column called `value`;
# `president` labels every row with the speaker.
moon <- mutate(as_tibble(raw_moon), president = "moon")

moon
# A tibble: 117 ร— 2
   value                                                     president
   <chr>                                                     <chr>    
 1 "์ •๊ถŒ๊ต์ฒด ํ•˜๊ฒ ์Šต๋‹ˆ๋‹ค!"                                    moon     
 2 "  ์ •์น˜๊ต์ฒด ํ•˜๊ฒ ์Šต๋‹ˆ๋‹ค!"                                  moon     
 3 "  ์‹œ๋Œ€๊ต์ฒด ํ•˜๊ฒ ์Šต๋‹ˆ๋‹ค!"                                  moon     
 4 "  "                                                      moon     
 5 "  โ€˜๋ถˆ๋น„๋ถˆ๋ช…(ไธ้ฃ›ไธ้ณด)โ€™์ด๋ผ๋Š” ๊ณ ์‚ฌ๊ฐ€ ์žˆ์Šต๋‹ˆ๋‹ค. ๋‚จ์ชฝ ์–ธโ€ฆ   moon     
 6 ""                                                        moon     
 7 "๊ทธ ๋™์•ˆ ์ •์น˜์™€ ๊ฑฐ๋ฆฌ๋ฅผ ๋‘ฌ ์™”์Šต๋‹ˆ๋‹ค. ๊ทธ๋Ÿฌ๋‚˜ ์•”์šธํ•œ ์‹œ๋Œ€๊ฐ€โ€ฆ moon     
 8 ""                                                        moon     
 9 ""                                                        moon     
10 "โ€˜์šฐ๋ฆฌ๋‚˜๋ผ ๋Œ€ํ†ต๋ นโ€™์ด ๋˜๊ฒ ์Šต๋‹ˆ๋‹ค."                         moon     
# โ„น 107 more rows
# 2. Load former President Park Geun-hye's speech, one element per line of text
raw_park <- readLines(".../speech_park.txt", encoding = "UTF-8")

# as_tibble() stores the lines in a column called `value`;
# `president` labels every row with the speaker.
park <- mutate(as_tibble(raw_park), president = "park")

park
# A tibble: 96 ร— 2
   value                                                     president
   <chr>                                                     <chr>    
 1 "์กด๊ฒฝํ•˜๋Š” ๊ตญ๋ฏผ ์—ฌ๋Ÿฌ๋ถ„! ์ €๋Š” ์˜ค๋Š˜, ๊ตญ๋ฏผ ํ•œ ๋ถ„ ํ•œ ๋ถ„์˜ ๊ฟˆโ€ฆ  park     
 2 ""                                                        park     
 3 "๊ตญ๋ฏผ ์—ฌ๋Ÿฌ๋ถ„! ์ €์˜ ์‚ถ์€ ๋Œ€ํ•œ๋ฏผ๊ตญ๊ณผ ํ•จ๊ป˜ ํ•ด์˜จ ์‹œ๊ฐ„์ด์—ˆ์Šตโ€ฆ  park     
 4 ""                                                        park     
 5 "์–ด๋จธ๋‹ˆ๊ฐ€ ํ‰ํƒ„์— ๋Œ์•„๊ฐ€์‹  ํ›„, ๊ฒฌ๋”œ ์ˆ˜ ์—†๋Š” ๊ณ ํ†ต๊ณผ ์–ด๋ ค์›€โ€ฆ park     
 6 ""                                                        park     
 7 "๊ทธ๋•Œ๋ถ€ํ„ฐ ์ œ ์‚ถ์€ ์™„์ „ํžˆ ๋‹ค๋ฅธ ๊ธธ์„ ๊ฐ€์•ผํ–ˆ์Šต๋‹ˆ๋‹ค. ๊ฐœ์ธ์˜ โ€ฆ park     
 8 ""                                                        park     
 9 "์•„๋ฒ„์ง€๋ฅผ ์žƒ๋Š” ๋˜ ๋‹ค๋ฅธ ๊ณ ํ†ต๊ณผ ์•„ํ””์„ ๊ฒช๊ณ , ์ €๋Š” ํ‰๋ฒ”ํ•œ โ€ฆ  park     
10 ""                                                        park     
# โ„น 86 more rows
# Stack the two speeches into one dataset (Moon's rows first, then Park's)
# and put the speaker column ahead of the text column.
bind_speeches <- select(bind_rows(moon, park), president, value)

head(bind_speeches)                          # print the first rows
# A tibble: 6 ร— 2
  president value                                                  
  <chr>     <chr>                                                  
1 moon      "์ •๊ถŒ๊ต์ฒด ํ•˜๊ฒ ์Šต๋‹ˆ๋‹ค!"                                 
2 moon      "  ์ •์น˜๊ต์ฒด ํ•˜๊ฒ ์Šต๋‹ˆ๋‹ค!"                               
3 moon      "  ์‹œ๋Œ€๊ต์ฒด ํ•˜๊ฒ ์Šต๋‹ˆ๋‹ค!"                               
4 moon      "  "                                                   
5 moon      "  โ€˜๋ถˆ๋น„๋ถˆ๋ช…(ไธ้ฃ›ไธ้ณด)โ€™์ด๋ผ๋Š” ๊ณ ์‚ฌ๊ฐ€ ์žˆ์Šต๋‹ˆ๋‹ค. ๋‚จ์ชฝ ์–ธโ€ฆ
6 moon      ""                                                     
tail(bind_speeches)                          # print the last rows of the combined dataset
# A tibble: 6 ร— 2
  president value                                                     
  <chr>     <chr>                                                     
1 park      "๊ตญ๋ฏผ๋“ค์ด ๊ฟˆ์œผ๋กœ๋งŒ ๊ฐ€์กŒ๋˜ ํ–‰๋ณตํ•œ ์‚ถ์„ ์‹ค์ œ๋กœ ์ด๋ฃฐ ์ˆ˜ ์žˆ๋„โ€ฆ
2 park      ""                                                        
3 park      "๊ฐ์‚ฌํ•ฉ๋‹ˆ๋‹ค."                                             
4 park      ""                                                        
5 park      "2012๋…„ 7์›” 10์ผ"                                         
6 park      "์ƒˆ๋ˆ„๋ฆฌ๋‹น ์˜ˆ๋น„ํ›„๋ณด ๋ฐ•๊ทผํ˜œ"                                
# Preprocessing: keep Hangul syllables only
speeches <- bind_speeches %>%
  mutate(
    # Replace every character outside U+AC00-U+D7A3 (the precomposed Hangul
    # syllable block) with a space. The range is spelled with Unicode escapes
    # so the pattern does not depend on how this source file is encoded.
    value = str_replace_all(value, "[^\uAC00-\uD7A3]", " "),
    # Trim both ends and collapse runs of whitespace into single spaces.
    value = str_squish(value)
  )

speeches
# A tibble: 213 ร— 2
   president value                                                    
   <chr>     <chr>                                                    
 1 moon      "์ •๊ถŒ๊ต์ฒด ํ•˜๊ฒ ์Šต๋‹ˆ๋‹ค"                                    
 2 moon      "์ •์น˜๊ต์ฒด ํ•˜๊ฒ ์Šต๋‹ˆ๋‹ค"                                    
 3 moon      "์‹œ๋Œ€๊ต์ฒด ํ•˜๊ฒ ์Šต๋‹ˆ๋‹ค"                                    
 4 moon      ""                                                       
 5 moon      "๋ถˆ๋น„๋ถˆ๋ช… ์ด๋ผ๋Š” ๊ณ ์‚ฌ๊ฐ€ ์žˆ์Šต๋‹ˆ๋‹ค ๋‚จ์ชฝ ์–ธ๋• ๋‚˜๋ญ‡๊ฐ€์ง€์— ์•‰โ€ฆ
 6 moon      ""                                                       
 7 moon      "๊ทธ ๋™์•ˆ ์ •์น˜์™€ ๊ฑฐ๋ฆฌ๋ฅผ ๋‘ฌ ์™”์Šต๋‹ˆ๋‹ค ๊ทธ๋Ÿฌ๋‚˜ ์•”์šธํ•œ ์‹œ๋Œ€๊ฐ€ โ€ฆ
 8 moon      ""                                                       
 9 moon      ""                                                       
10 moon      "์šฐ๋ฆฌ๋‚˜๋ผ ๋Œ€ํ†ต๋ น ์ด ๋˜๊ฒ ์Šต๋‹ˆ๋‹ค"                          
# โ„น 203 more rows
# Tokenization: split every line into nouns, one token per row
speeches <- unnest_tokens(
  speeches,
  output = word,         # name of the column that receives the tokens
  input = value,         # column holding the text to tokenize
  token = extractNoun    # tokenize by nouns
)

speeches
# A tibble: 2,997 ร— 2
   president word      
   <chr>     <chr>     
 1 moon      "์ •๊ถŒ๊ต์ฒด"
 2 moon      "ํ•˜๊ฒ ์Šต๋‹ˆ"
 3 moon      "์ •์น˜"    
 4 moon      "๊ต์ฒด"    
 5 moon      "ํ•˜๊ฒ ์Šต๋‹ˆ"
 6 moon      "์‹œ๋Œ€"    
 7 moon      "๊ต์ฒด"    
 8 moon      "ํ•˜๊ฒ ์Šต๋‹ˆ"
 9 moon      ""        
10 moon      "๋ถˆ๋น„๋ถˆ๋ช…"
# โ„น 2,987 more rows

1-1. 단어 빈도 계산

# Word frequencies per speech, computed from the tokenized `speeches`
frequency <- filter(
  count(speeches, president, word),   # occurrences of each word in each speech
  str_count(word) > 1                 # keep words of two or more characters
)

frequency
# A tibble: 1,131 ร— 3
   president word         n
   <chr>     <chr>    <int>
 1 moon      ๊ฐ€๋™         1
 2 moon      ๊ฐ€์‚ฌ         1
 3 moon      ๊ฐ€์Šด         2
 4 moon      ๊ฐ€์กฑ         1
 5 moon      ๊ฐ€์กฑ๊ตฌ์กฐ     1
 6 moon      ๊ฐ€์ง€         4
 7 moon      ๊ฐ€์น˜         3
 8 moon      ๊ฐ์ข…         1
 9 moon      ๊ฐ๋‹น         1
10 moon      ๊ฐ•๋ ฅ         3
# โ„น 1,121 more rows

1-2. 자주 사용한 단어 추출

# Ten most frequent words of each speech. Grouping by president makes the
# ranking run separately per speech; words tied at the cutoff are all kept,
# so a group can return more than ten rows.
top10 <- slice_max(
  group_by(frequency, president),
  order_by = n,    # frequency column produced by count()
  n = 10
)

top10
# A tibble: 22 ร— 3
# Groups:   president [2]
   president word       n
   <chr>     <chr>  <int>
 1 moon      ๊ตญ๋ฏผ      21
 2 moon      ์ผ์ž๋ฆฌ    21
 3 moon      ๋‚˜๋ผ      19
 4 moon      ์šฐ๋ฆฌ      17
 5 moon      ๊ฒฝ์ œ      15
 6 moon      ์‚ฌํšŒ      14
 7 moon      ์„ฑ์žฅ      13
 8 moon      ๋Œ€ํ†ต๋ น    12
 9 moon      ์ •์น˜      12
10 moon      ํ•˜๊ฒŒ      12
# โ„น 12 more rows
# Show only the rows for Moon Jae-in's speech
filter(top10, president == "moon")
# A tibble: 10 ร— 3
# Groups:   president [1]
   president word       n
   <chr>     <chr>  <int>
 1 moon      ๊ตญ๋ฏผ      21
 2 moon      ์ผ์ž๋ฆฌ    21
 3 moon      ๋‚˜๋ผ      19
 4 moon      ์šฐ๋ฆฌ      17
 5 moon      ๊ฒฝ์ œ      15
 6 moon      ์‚ฌํšŒ      14
 7 moon      ์„ฑ์žฅ      13
 8 moon      ๋Œ€ํ†ต๋ น    12
 9 moon      ์ •์น˜      12
10 moon      ํ•˜๊ฒŒ      12
# Show only the rows for Park Geun-hye's speech
filter(top10, president == "park")
# A tibble: 12 ร— 3
# Groups:   president [1]
   president word       n
   <chr>     <chr>  <int>
 1 park      ๊ตญ๋ฏผ      72
 2 park      ํ–‰๋ณต      23
 3 park      ์—ฌ๋Ÿฌ๋ถ„    20
 4 park      ์ •๋ถ€      17
 5 park      ๊ฒฝ์ œ      15
 6 park      ์‹ ๋ขฐ      11
 7 park      ๊ตญ๊ฐ€      10
 8 park      ์šฐ๋ฆฌ      10
 9 park      ๊ต์œก       9
10 park      ์‚ฌ๋žŒ       9
11 park      ์‚ฌํšŒ       9
12 park      ์ผ์ž๋ฆฌ     9

Result! ๋ฐ•๊ทผํ˜œ ์ „ ๋Œ€ํ†ต๋ น ์—ฐ์„ค๋ฌธ์—์„œ ๋นˆ๋„๊ฐ€ ๋†’์€ ๋‹จ์–ด 12๊ฐœ๊ฐ€ ์ถœ๋ ฅ๋˜์—ˆ๋‹ค. 10๊ฐœ๊ฐ€ ์•„๋‹Œ 12๊ฐœ์ธ ์ด์œ ๋Š” โ€œ๊ต์œกโ€, โ€œ์‚ฌ๋žŒโ€, โ€œ์‚ฌํšŒโ€, โ€œ์ผ์ž๋ฆฌโ€๊ฐ€ ๋˜‘๊ฐ™์ด 9๋ฒˆ์”ฉ ์‚ฌ์šฉ๋˜์–ด ๋นˆ๋„๊ฐ€ ๋™์ผํ•œ ๋‹จ์–ด๋ฅผ ๋ชจ๋‘ ์ถ”์ถœํ•˜์˜€๊ธฐ ๋•Œ๋ฌธ์ด๋‹ค. ๋นˆ๋„๊ฐ€ ๋™์ผํ•˜๋”๋ผ๋„ ์›ํ•˜๋Š” ๊ฐœ์ˆ˜๋งŒํผ๋งŒ ๋‹จ์–ด๋ฅผ ์ถ”์ถœํ•˜๊ธฐ ์œ„ํ•ด์„œ๋Š” ์˜ต์…˜ with_ties = F๋ฅผ ์ง€์ •ํ•˜๋ฉด ๋œ๋‹ค.

# Take exactly ten words per speech, even when frequencies tie at the cutoff
top10 <- frequency %>%                # word frequencies from section 1-1
  group_by(president) %>%             # rank the words of each speech separately
  slice_max(n,                        # order by the frequency column
            n = 10,                   # keep the ten highest frequencies
            with_ties = FALSE)        # spelled out: `F` is an ordinary variable
                                      # that can be reassigned

top10
# A tibble: 20 ร— 3
# Groups:   president [2]
   president word       n
   <chr>     <chr>  <int>
 1 moon      ๊ตญ๋ฏผ      21
 2 moon      ์ผ์ž๋ฆฌ    21
 3 moon      ๋‚˜๋ผ      19
 4 moon      ์šฐ๋ฆฌ      17
 5 moon      ๊ฒฝ์ œ      15
 6 moon      ์‚ฌํšŒ      14
 7 moon      ์„ฑ์žฅ      13
 8 moon      ๋Œ€ํ†ต๋ น    12
 9 moon      ์ •์น˜      12
10 moon      ํ•˜๊ฒŒ      12
11 park      ๊ตญ๋ฏผ      72
12 park      ํ–‰๋ณต      23
13 park      ์—ฌ๋Ÿฌ๋ถ„    20
14 park      ์ •๋ถ€      17
15 park      ๊ฒฝ์ œ      15
16 park      ์‹ ๋ขฐ      11
17 park      ๊ตญ๊ฐ€      10
18 park      ์šฐ๋ฆฌ      10
19 park      ๊ต์œก       9
20 park      ์‚ฌ๋žŒ       9
# Show only the rows for Moon Jae-in's speech
filter(top10, president == "moon")
# A tibble: 10 ร— 3
# Groups:   president [1]
   president word       n
   <chr>     <chr>  <int>
 1 moon      ๊ตญ๋ฏผ      21
 2 moon      ์ผ์ž๋ฆฌ    21
 3 moon      ๋‚˜๋ผ      19
 4 moon      ์šฐ๋ฆฌ      17
 5 moon      ๊ฒฝ์ œ      15
 6 moon      ์‚ฌํšŒ      14
 7 moon      ์„ฑ์žฅ      13
 8 moon      ๋Œ€ํ†ต๋ น    12
 9 moon      ์ •์น˜      12
10 moon      ํ•˜๊ฒŒ      12
# Show only the rows for Park Geun-hye's speech
filter(top10, president == "park")
# A tibble: 10 ร— 3
# Groups:   president [1]
   president word       n
   <chr>     <chr>  <int>
 1 park      ๊ตญ๋ฏผ      72
 2 park      ํ–‰๋ณต      23
 3 park      ์—ฌ๋Ÿฌ๋ถ„    20
 4 park      ์ •๋ถ€      17
 5 park      ๊ฒฝ์ œ      15
 6 park      ์‹ ๋ขฐ      11
 7 park      ๊ตญ๊ฐ€      10
 8 park      ์šฐ๋ฆฌ      10
 9 park      ๊ต์œก       9
10 park      ์‚ฌ๋žŒ       9

Result! ์˜ต์…˜ with_ties = F๋ฅผ ์ง€์ •ํ•˜์—ฌ ๋ฐ•๊ทผํ˜œ ์ „ ๋Œ€ํ†ต๋ น ์—ฐ์„ค๋ฌธ์—์„œ ๋นˆ๋„๊ฐ€ ๋†’์€ ๋‹จ์–ด 10๊ฐœ๊ฐ€ ์ถœ๋ ฅ๋˜์—ˆ๋‹ค. ๋˜‘๊ฐ™์ด 9๋ฒˆ์”ฉ ์‚ฌ์šฉํ•œ ๋‹จ์–ด โ€œ๊ต์œกโ€, โ€œ์‚ฌ๋žŒโ€, โ€œ์‚ฌํšŒโ€, โ€œ์ผ์ž๋ฆฌโ€ ์ค‘ โ€œ๊ต์œกโ€๊ณผ โ€œ์‚ฌ๋žŒโ€์ด ํฌํ•จ๋œ ์ด์œ ๋Š” ๋นˆ๋„๊ฐ€ ๋™์ผํ•œ ๋‹จ์–ด์˜ ๊ฒฝ์šฐ ์›๋ณธ ๋ฐ์ดํ„ฐ์˜ ์ •๋ ฌ ์ˆœ์„œ์— ๋”ฐ๋ผ ์ถœ๋ ฅํ•˜๊ธฐ ๋•Œ๋ฌธ์ด๋‹ค.


1-3. 시각화

# Basic bar chart of the top-10 words from section 1-2
ggplot(
  data = top10,
  mapping = aes(
    x = reorder(word, n),   # order words by their mean n over all rows of top10
    y = n,
    fill = president        # one bar colour per president
  )
) +
  geom_col() +              # bar height taken directly from n
  coord_flip() +            # turn the bars horizontal
  facet_wrap(~president)    # one panel per president

Result! ๊ทธ๋ž˜ํ”„๋ฅผ ๋ณด๋ฉด ๋ง‰๋Œ€๊ฐ€ ์—†๋Š” ๋‹จ์–ด๊ฐ€ ์กด์žฌํ•œ๋‹ค. ์ด๋Š” ์ถ•์„ ๊ตฌ์„ฑํ•˜๋Š” ๋‹จ์–ด๊ฐ€ ํ•œ ๋Œ€ํ†ต๋ น์˜ ์—ฐ์„ค๋ฌธ์—๋งŒ ํฌํ•จ๋˜์–ด ์žˆ๊ธฐ ๋•Œ๋ฌธ์ด๋‹ค. ์˜ˆ๋ฅผ ๋“ค์–ด, โ€œํ–‰๋ณตโ€์€ ๋ฐ•๊ทผํ˜œ ์ „ ๋Œ€ํ†ต๋ น์˜ ์—ฐ์„ค๋ฌธ์—๋Š” ์กด์žฌํ•˜์ง€๋งŒ ๋ฌธ์žฌ์ธ ์ „ ๋Œ€ํ†ต๋ น์˜ ์—ฐ์„ค๋ฌธ์—๋Š” ์กด์žฌํ•˜์ง€ ์•Š๋Š”๋‹ค. ์ด๋Ÿฌํ•œ ๋ฌธ์ œ๋ฅผ ๋ฐฉ์ง€ํ•˜๊ธฐ ์œ„ํ•ด ํ•จ์ˆ˜ facet_wrap์˜ ์˜ต์…˜ scales = "free_y"์„ ์ง€์ •ํ•œ๋‹ค.

# Bar chart in which each panel gets its own word axis
ggplot(
  data = top10,
  mapping = aes(
    x = reorder(word, n),   # order words by their mean n over all rows of top10
    y = n,
    fill = president        # one bar colour per president
  )
) +
  geom_col() +              # bar height taken directly from n
  coord_flip() +            # turn the bars horizontal
  facet_wrap(
    ~president,             # one panel per president
    scales = "free_y"       # do not share the axis, so a panel lists only its own words
  )

Result! 박근혜 전 대통령의 막대 그래프를 보면, “국민”의 빈도가 너무 높아 다른 단어의 빈도 차이가 잘 드러나지 않는다.

# Drop the dominant word "gungmin" ("the people") so that the frequency
# differences among the remaining words become visible.
top10 <- frequency %>%                      # word frequencies from section 1-1
  # "\uAD6D\uBBFC" is the Korean word gungmin, written with Unicode escapes so
  # the comparison does not depend on how this source file is encoded.
  filter(word != "\uAD6D\uBBFC") %>%
  group_by(president) %>%                   # rank the words of each speech separately
  slice_max(n,                              # order by the frequency column
            n = 10,                         # keep the ten highest frequencies
            with_ties = FALSE)              # exactly ten rows per group; `F` is
                                            # reassignable, so FALSE is spelled out

top10
# A tibble: 20 ร— 3
# Groups:   president [2]
   president word         n
   <chr>     <chr>    <int>
 1 moon      ์ผ์ž๋ฆฌ      21
 2 moon      ๋‚˜๋ผ        19
 3 moon      ์šฐ๋ฆฌ        17
 4 moon      ๊ฒฝ์ œ        15
 5 moon      ์‚ฌํšŒ        14
 6 moon      ์„ฑ์žฅ        13
 7 moon      ๋Œ€ํ†ต๋ น      12
 8 moon      ์ •์น˜        12
 9 moon      ํ•˜๊ฒŒ        12
10 moon      ๋Œ€ํ•œ๋ฏผ๊ตญ    11
11 park      ํ–‰๋ณต        23
12 park      ์—ฌ๋Ÿฌ๋ถ„      20
13 park      ์ •๋ถ€        17
14 park      ๊ฒฝ์ œ        15
15 park      ์‹ ๋ขฐ        11
16 park      ๊ตญ๊ฐ€        10
17 park      ์šฐ๋ฆฌ        10
18 park      ๊ต์œก         9
19 park      ์‚ฌ๋žŒ         9
20 park      ์‚ฌํšŒ         9
# Same chart, drawn from the top-10 table that excludes the dominant word
ggplot(
  data = top10,
  mapping = aes(
    x = reorder(word, n),   # order words by their mean n over all rows of top10
    y = n,
    fill = president        # one bar colour per president
  )
) +
  geom_col() +              # bar height taken directly from n
  coord_flip() +            # turn the bars horizontal
  facet_wrap(
    ~president,             # one panel per president
    scales = "free_y"       # do not share the axis, so a panel lists only its own words
  )

Result! ๊ทธ๋ž˜ํ”„๋ฅผ ๋ณด๋ฉด x์ถ•์„ ์ง€์ •ํ•  ๋•Œ ํ•จ์ˆ˜ reorder๋ฅผ ์‚ฌ์šฉํ•˜์—ฌ๋„ ๋ง‰๋Œ€๊ฐ€ ๋นˆ๋„ ๊ธฐ์ค€์œผ๋กœ ์™„๋ฒฝํ•˜๊ฒŒ ์ •๋ ฌ๋˜์ง€ ์•Š์•˜๋‹ค. ์ด๋Š” ๊ทธ๋ž˜ํ”„๋ฅผ ์ž‘์„ฑํ•  ๋•Œ ๊ฐ์ฒด โ€œtop10โ€์—์„œ ๋‹จ์–ด์— ๋”ฐ๋ฅธ ํ‰๊ท  ์‚ฌ์šฉ ๋นˆ๋„๋ฅผ ๊ธฐ์ค€์œผ๋กœ x์ถ• ์ˆœ์„œ๋ฅผ ์ •ํ–ˆ๊ธฐ ๋•Œ๋ฌธ์ด๋‹ค. ์ด๋Ÿฌํ•œ ๋ฌธ์ œ๋ฅผ ๋ฐฉ์ง€ํ•˜๊ธฐ ์œ„ํ•ด Package "tidytext"์˜ ํ•จ์ˆ˜ reorder_within(x, by, within)๋ฅผ ์‚ฌ์šฉํ•œ๋‹ค.

# Bar chart with the words sorted by frequency inside each panel
ggplot(
  data = top10,
  mapping = aes(
    # reorder_within() orders the words by n separately for each president
    x = reorder_within(x = word, by = n, within = president),
    y = n,
    fill = president        # one bar colour per president
  )
) +
  geom_col() +              # bar height taken directly from n
  coord_flip() +            # turn the bars horizontal
  facet_wrap(
    ~president,             # one panel per president
    scales = "free_y"       # do not share the axis, so a panel lists only its own words
  )

# Same chart, with the president name removed from the end of each word label
ggplot(
  data = top10,
  mapping = aes(
    # reorder_within() orders the words by n separately for each president
    x = reorder_within(x = word, by = n, within = president),
    y = n,
    fill = president        # one bar colour per president
  )
) +
  geom_col() +              # bar height taken directly from n
  coord_flip() +            # turn the bars horizontal
  facet_wrap(
    ~president,             # one panel per president
    scales = "free_y"       # do not share the axis, so a panel lists only its own words
  ) +
  scale_x_reordered()       # drop the president name that follows each word


2. 오즈비


2-1. 데이터 형태 변환

출처 : https://tavareshugo.github.io/r-intro-tidyverse-gapminder/09-reshaping/index.html


# Long-form example dataset: four illustrative words taken from each
# speech's top-10 list (ties at the cutoff are kept).
df_long <- frequency %>%                   # word frequencies from section 1-1
  group_by(president) %>%                  # rank the words of each speech separately
  slice_max(n,                             # order by the frequency column
            n = 10) %>%                    # ten highest frequencies per speech
  # The four Korean words are written with Unicode escapes so the match does
  # not depend on how this source file is encoded.
  filter(word %in% c("\uAD6D\uBBFC",       # gungmin  - "the people"
                     "\uC6B0\uB9AC",       # uri      - "we / our"
                     "\uC815\uCE58",       # jeongchi - "politics"
                     "\uD589\uBCF5"))      # haengbok - "happiness"

df_long
# A tibble: 6 ร— 3
# Groups:   president [2]
  president word      n
  <chr>     <chr> <int>
1 moon      ๊ตญ๋ฏผ     21
2 moon      ์šฐ๋ฆฌ     17
3 moon      ์ •์น˜     12
4 park      ๊ตญ๋ฏผ     72
5 park      ํ–‰๋ณต     23
6 park      ์šฐ๋ฆฌ     10

Result! ์˜ˆ๋ฅผ ์œ„ํ•ด โ€œ๊ตญ๋ฏผโ€, โ€œ์šฐ๋ฆฌโ€, โ€œ์ •์น˜โ€, โ€œํ–‰๋ณตโ€ ๋‹จ์–ด๋งŒ ์ถ”์ถœํ•˜์—ฌ Long Form ๋ฐ์ดํ„ฐ์…‹์œผ๋กœ ๋‚˜ํƒ€๋‚ด์—ˆ๋‹ค. โ€œ๊ตญ๋ฏผโ€์€ ์ฒซ ๋ฒˆ์งธ ํ–‰์„ ํ†ตํ•ด ๋ฌธ์žฌ์ธ ์ „ ๋Œ€ํ†ต๋ น์˜ ์—ฐ์„ค๋ฌธ์—์„œ 21๋ฒˆ ์‚ฌ์šฉํ•˜์˜€์œผ๋ฉฐ, ๋„ค ๋ฒˆ์งธ ํ–‰์„ ํ†ตํ•ด ๋ฐ•๊ทผํ˜œ ์ „ ๋Œ€ํ†ต๋ น์˜ ์—ฐ์„ค๋ฌธ์—์„œ 72๋ฒˆ ์‚ฌ์šฉํ•˜์˜€์Œ์„ ์•Œ ์ˆ˜ ์žˆ๋‹ค. โ€œ์šฐ๋ฆฌโ€๋Š” ๋‘ ๋ฒˆ์งธ ํ–‰๊ณผ ์—ฌ์„ฏ ๋ฒˆ์งธ ํ–‰์„ ํ†ตํ•ด ๊ฐ๊ฐ์˜ ์—ฐ์„ค๋ฌธ์—์„œ ๋ช‡ ๋ฒˆ ์‚ฌ์šฉํ•˜์˜€๋Š”์ง€ ์•Œ ์ˆ˜ ์žˆ๋‹ค. ์ด์ฒ˜๋Ÿผ Long Form ๋ฐ์ดํ„ฐ์…‹์€ ๊ฐ™์€ ๋‹จ์–ด๊ฐ€ ํ•ญ๋ชฉ๋ณ„๋กœ ๋‹ค๋ฅธ ํ–‰์„ ๊ตฌ์„ฑํ•˜๊ธฐ ๋•Œ๋ฌธ์— ๊ฐ ํ•ญ๋ชฉ์—์„œ ํ•ด๋‹น ๋‹จ์–ด๋ฅผ ๋ช‡ ๋ฒˆ์”ฉ ์‚ฌ์šฉํ•˜์˜€๋Š”์ง€ ํ•œ ๋ฒˆ์— ๋น„๊ตํ•˜๊ธฐ ์–ด๋ ต๊ณ , ๋‹จ์–ด ๋นˆ๋„๋ฅผ ํ™œ์šฉํ•ด ์—ฐ์‚ฐํ•˜๊ธฐ๋„ ๋ถˆํŽธํ•˜๋‹ค.

# Reshape the long-form example into wide form: one row per word and one
# count column per president.
df_wide <- pivot_wider(
  df_long,
  names_from = president,   # values of this column become the new column names
  values_from = n           # counts that fill the new columns
)

df_wide
# A tibble: 4 ร— 3
  word   moon  park
  <chr> <int> <int>
1 ๊ตญ๋ฏผ     21    72
2 ์šฐ๋ฆฌ     17    10
3 ์ •์น˜     12    NA
4 ํ–‰๋ณต     NA    23

Result! Wide Form ๋ฐ์ดํ„ฐ์…‹์œผ๋กœ ๋ณ€ํ™˜ํ•˜๊ธฐ ์œ„ํ•ด Package "tidyr"์—์„œ ์ œ๊ณตํ•˜๋Š” ํ•จ์ˆ˜ pivot_wider๋ฅผ ์‚ฌ์šฉํ•œ๋‹ค. Wide Form ๋ฐ์ดํ„ฐ์…‹์€ ํ•œ ๋‹จ์–ด๊ฐ€ ํ•œ ํ–‰์œผ๋กœ ๊ตฌ์„ฑ๋˜์–ด์žˆ์œผ๋ฉฐ, ๋‘ ์—ฐ์„ค๋ฌธ์—์„œ โ€œ๊ตญ๋ฏผโ€, โ€œ์šฐ๋ฆฌโ€, โ€œ์ •์น˜โ€, โ€œํ–‰๋ณตโ€์„ ๋ช‡ ๋ฒˆ์”ฉ ์‚ฌ์šฉํ•˜์˜€๋Š”์ง€ ์‰ฝ๊ฒŒ ๋น„๊ตํ•  ์ˆ˜ ์žˆ๋‹ค.


Caution! ์•ž์—์„œ ์‹คํ–‰ํ•œ ์ฝ”๋“œ์˜ ๊ฒฐ๊ณผ๋ฅผ ๋ณด๋ฉด, ๊ฒฐ์ธก์น˜ NA๊ฐ€ ์กด์žฌํ•œ๋‹ค. ์–ด๋–ค ๋‹จ์–ด๊ฐ€ ๋‘ ์—ฐ์„ค๋ฌธ ์ค‘ ํ•œ ์—ฐ์„ค๋ฌธ์— ์กด์žฌํ•˜์ง€ ์•Š์œผ๋ฉด ๋‹จ์–ด์˜ ๋นˆ๋„๊ฐ€ ๊ณ„์‚ฐ๋˜์ง€ ์•Š์œผ๋ฏ€๋กœ ๋ณ€์ˆ˜ n์˜ ๊ฐ’์ด ์—†์–ด์„œ NA๊ฐ€ ๋œ๋‹ค. ๊ฒฐ์ธก์น˜ NA๋Š” ์—ฐ์‚ฐํ•  ์ˆ˜ ์—†์œผ๋ฏ€๋กœ 0์œผ๋กœ ๋ณ€ํ™˜ํ•ด์•ผ ํ•œ๋‹ค. ์ด๋Ÿฌํ•œ ์ž‘์—…์€ ํ•จ์ˆ˜ pivot_wider์˜ ์˜ต์…˜ values_fill = list(n = 0)์„ ์ง€์ •ํ•˜๋ฉด ๋œ๋‹ค.

# Same reshape, but a word missing from one speech gets a count of 0 instead
# of NA, so the counts can be used in arithmetic.
df_wide <- pivot_wider(
  df_long,
  names_from = president,      # values of this column become the new column names
  values_from = n,             # counts that fill the new columns
  values_fill = list(n = 0)    # replace missing counts with 0
)

df_wide
# A tibble: 4 ร— 3
  word   moon  park
  <chr> <int> <int>
1 ๊ตญ๋ฏผ     21    72
2 ์šฐ๋ฆฌ     17    10
3 ์ •์น˜     12     0
4 ํ–‰๋ณต      0    23

# Wide-form table of every word used in either speech
frequency_wide <- pivot_wider(
  frequency,                   # word frequencies from section 1-1
  names_from = president,      # values of this column become the new column names
  values_from = n,             # counts that fill the new columns
  values_fill = list(n = 0)    # replace missing counts with 0
)

frequency_wide
# A tibble: 955 ร— 3
   word      moon  park
   <chr>    <int> <int>
 1 ๊ฐ€๋™         1     0
 2 ๊ฐ€์‚ฌ         1     0
 3 ๊ฐ€์Šด         2     0
 4 ๊ฐ€์กฑ         1     1
 5 ๊ฐ€์กฑ๊ตฌ์กฐ     1     0
 6 ๊ฐ€์ง€         4     0
 7 ๊ฐ€์น˜         3     1
 8 ๊ฐ์ข…         1     0
 9 ๊ฐ๋‹น         1     0
10 ๊ฐ•๋ ฅ         3     0
# โ„น 945 more rows

Result! 두 연설문에서 사용한 모든 단어에 대한 Wide Form 데이터셋을 생성하였다.


2-2. 오즈비 계산

\[ \begin{align*} \text{Odds ratio} = \frac{\left( \frac{n+1}{\text{total}+1} \right)_{\text{Text A}} }{ \left( \frac{n+1}{\text{total}+1} \right)_{\text{Text B}} } \end{align*} \]


2-2-1. 단어의 비중을 나타내는 변수 추가

\[ \begin{align*} \text{해당 단어의 비중} = \frac{\text{해당 단어의 빈도} }{ \text{모든 단어의 빈도 합} } \end{align*} \]

# Share of each word within each speech, with 1 added to every count so that
# a word absent from one speech still gets a non-zero share.
# The denominator is the sum of the incremented counts, i.e. the total
# frequency plus the number of rows in frequency_wide.
odds_df <- mutate(
  frequency_wide,                            # wide-form table from section 2-1
  ratio_moon = (moon + 1) / sum(moon + 1),   # share in Moon's speech
  ratio_park = (park + 1) / sum(park + 1)    # share in Park's speech
)

odds_df
# A tibble: 955 ร— 5
   word      moon  park ratio_moon ratio_park
   <chr>    <int> <int>      <dbl>      <dbl>
 1 ๊ฐ€๋™         1     0   0.000873   0.000552
 2 ๊ฐ€์‚ฌ         1     0   0.000873   0.000552
 3 ๊ฐ€์Šด         2     0   0.00131    0.000552
 4 ๊ฐ€์กฑ         1     1   0.000873   0.00110 
 5 ๊ฐ€์กฑ๊ตฌ์กฐ     1     0   0.000873   0.000552
 6 ๊ฐ€์ง€         4     0   0.00218    0.000552
 7 ๊ฐ€์น˜         3     1   0.00175    0.00110 
 8 ๊ฐ์ข…         1     0   0.000873   0.000552
 9 ๊ฐ๋‹น         1     0   0.000873   0.000552
10 ๊ฐ•๋ ฅ         3     0   0.00175    0.000552
# โ„น 945 more rows

Result! “가동”은 박근혜 전 대통령의 연설문에서 사용되지 않아 빈도가 0이지만 값에 1을 더하여 계산함으로써 단어의 비중이 0.000552로 계산되었다.


2-2-2. 오즈비 변수 추가

# Odds ratio: how much larger a word's share is in Moon's speech than in
# Park's speech.
odds_df <- mutate(odds_df, odds_ratio = ratio_moon / ratio_park)

# Words with a relatively larger share in Moon's speech come first
arrange(odds_df, desc(odds_ratio))
# A tibble: 955 ร— 6
   word      moon  park ratio_moon ratio_park odds_ratio
   <chr>    <int> <int>      <dbl>      <dbl>      <dbl>
 1 ๋ณต์ง€๊ตญ๊ฐ€     8     0    0.00393   0.000552       7.12
 2 ์„ธ์ƒ         6     0    0.00306   0.000552       5.54
 3 ์—ฌ์„ฑ         6     0    0.00306   0.000552       5.54
 4 ์ •์˜         6     0    0.00306   0.000552       5.54
 5 ๊ฐ•์ž         5     0    0.00262   0.000552       4.75
 6 ๊ณตํ‰         5     0    0.00262   0.000552       4.75
 7 ๋Œ€ํ†ต๋ น์˜     5     0    0.00262   0.000552       4.75
 8 ๋ณดํ†ต         5     0    0.00262   0.000552       4.75
 9 ์ƒ์ƒ         5     0    0.00262   0.000552       4.75
10 ์ง€๋ฐฉ         5     0    0.00262   0.000552       4.75
# โ„น 945 more rows

Result! ๋ฌธ์žฌ์ธ ์ „ ๋Œ€ํ†ต๋ น์˜ ์—ฐ์„ค๋ฌธ์—์„œ ์ƒ๋Œ€์ ์œผ๋กœ ๋งŽ์ด ์‚ฌ์šฉํ•œ ๋‹จ์–ด์ผ์ˆ˜๋ก ์˜ค์ฆˆ๋น„๊ฐ€ ํฌ๋‹ค. ๊ฒฐ๊ณผ๋ฅผ ๋ณด๋ฉด, โ€œ๋ณต์ง€๊ตญ๊ฐ€โ€๊ฐ€ 7.12๋กœ ์˜ค์ฆˆ๋น„๊ฐ€ ์ œ์ผ ํฌ๋ฉฐ, ์ด๋Š” ๋ฐ•๊ทผํ˜œ ์ „ ๋Œ€ํ†ต๋ น์˜ ์—ฐ์„ค๋ฌธ์— ๋น„ํ•ด ๋ฌธ์žฌ์ธ ์ „ ๋Œ€ํ†ต๋ น์˜ ์—ฐ์„ค๋ฌธ์—์„œ ์ƒ๋Œ€์ ์œผ๋กœ ๋งŽ์ด ์‚ฌ์šฉํ–ˆ๋‹ค๋Š” ๊ฒƒ์„ ์˜๋ฏธํ•œ๋‹ค.

# Words with a relatively larger share in Park's speech come first
# (ascending odds ratio)
arrange(odds_df, odds_ratio)
# A tibble: 955 ร— 6
   word      moon  park ratio_moon ratio_park odds_ratio
   <chr>    <int> <int>      <dbl>      <dbl>      <dbl>
 1 ๋ฐ•๊ทผํ˜œ       0     8   0.000436    0.00496     0.0879
 2 ์—ฌ๋Ÿฌ๋ถ„       2    20   0.00131     0.0116      0.113 
 3 ํ–‰๋ณต         3    23   0.00175     0.0132      0.132 
 4 ์‹ค์ฒœ         0     5   0.000436    0.00331     0.132 
 5 ์ •๋ณด         0     5   0.000436    0.00331     0.132 
 6 ํˆฌ๋ช…         0     5   0.000436    0.00331     0.132 
 7 ๊ณผ์ œ         0     4   0.000436    0.00276     0.158 
 8 ๊ตญ์ •์šด์˜     0     4   0.000436    0.00276     0.158 
 9 ์‹œ์ž‘         0     4   0.000436    0.00276     0.158 
10 ์ง€์‹         0     4   0.000436    0.00276     0.158 
# โ„น 945 more rows

Result! ๋ฐ•๊ทผํ˜œ ์ „ ๋Œ€ํ†ต๋ น์˜ ์—ฐ์„ค๋ฌธ์—์„œ ์ƒ๋Œ€์ ์œผ๋กœ ๋งŽ์ด ์‚ฌ์šฉํ•œ ๋‹จ์–ด์ผ์ˆ˜๋ก ์˜ค์ฆˆ๋น„๊ฐ€ ์ž‘๋‹ค. ๊ฒฐ๊ณผ๋ฅผ ๋ณด๋ฉด, โ€œ๋ฐ•๊ทผํ˜œโ€๊ฐ€ 0.0879๋กœ ์˜ค์ฆˆ๋น„๊ฐ€ ์ œ์ผ ์ž‘์œผ๋ฉฐ, ์ด๋Š” ๋ฌธ์žฌ์ธ ์ „ ๋Œ€ํ†ต๋ น์˜ ์—ฐ์„ค๋ฌธ์— ๋น„ํ•ด ๋ฐ•๊ทผํ˜œ ์ „ ๋Œ€ํ†ต๋ น์˜ ์—ฐ์„ค๋ฌธ์—์„œ ์ƒ๋Œ€์ ์œผ๋กœ ๋งŽ์ด ์‚ฌ์šฉํ–ˆ๋‹ค๋Š” ๊ฒƒ์„ ์˜๋ฏธํ•œ๋‹ค.

## Add the shares and the odds ratio in a single step
# Each count is incremented by 1 before the share is taken; the denominator
# is the sum of the incremented counts.
odds_df <- mutate(
  frequency_wide,                            # wide-form table from section 2-1
  ratio_moon = (moon + 1) / sum(moon + 1),   # share in Moon's speech
  ratio_park = (park + 1) / sum(park + 1),   # share in Park's speech
  odds_ratio = ratio_moon / ratio_park       # > 1: relatively more used by Moon
)

odds_df
# A tibble: 955 ร— 6
   word      moon  park ratio_moon ratio_park odds_ratio
   <chr>    <int> <int>      <dbl>      <dbl>      <dbl>
 1 ๊ฐ€๋™         1     0   0.000873   0.000552      1.58 
 2 ๊ฐ€์‚ฌ         1     0   0.000873   0.000552      1.58 
 3 ๊ฐ€์Šด         2     0   0.00131    0.000552      2.37 
 4 ๊ฐ€์กฑ         1     1   0.000873   0.00110       0.791
 5 ๊ฐ€์กฑ๊ตฌ์กฐ     1     0   0.000873   0.000552      1.58 
 6 ๊ฐ€์ง€         4     0   0.00218    0.000552      3.96 
 7 ๊ฐ€์น˜         3     1   0.00175    0.00110       1.58 
 8 ๊ฐ์ข…         1     0   0.000873   0.000552      1.58 
 9 ๊ฐ๋‹น         1     0   0.000873   0.000552      1.58 
10 ๊ฐ•๋ ฅ         3     0   0.00175    0.000552      3.17 
# โ„น 945 more rows

2-3. 상대적으로 중요한 단어 추출

# Keep the words whose odds ratio ranks among the ten lowest or ten highest.
# NOTE(review): rank() averages tied values by default, so ties at the cutoff
# can make the result hold more or fewer than 20 rows.
top10 <- filter(
  odds_df,                        # odds ratios from section 2-2-2
  rank(odds_ratio) <= 10 |        # ten lowest odds ratios
    rank(-odds_ratio) <= 10       # ten highest odds ratios
)

# Print every row, highest odds ratio first
print(arrange(top10, desc(odds_ratio)), n = Inf)
# A tibble: 20 ร— 6
   word      moon  park ratio_moon ratio_park odds_ratio
   <chr>    <int> <int>      <dbl>      <dbl>      <dbl>
 1 ๋ณต์ง€๊ตญ๊ฐ€     8     0   0.00393    0.000552     7.12  
 2 ์„ธ์ƒ         6     0   0.00306    0.000552     5.54  
 3 ์—ฌ์„ฑ         6     0   0.00306    0.000552     5.54  
 4 ์ •์˜         6     0   0.00306    0.000552     5.54  
 5 ๊ฐ•์ž         5     0   0.00262    0.000552     4.75  
 6 ๊ณตํ‰         5     0   0.00262    0.000552     4.75  
 7 ๋Œ€ํ†ต๋ น์˜     5     0   0.00262    0.000552     4.75  
 8 ๋ณดํ†ต         5     0   0.00262    0.000552     4.75  
 9 ์ƒ์ƒ         5     0   0.00262    0.000552     4.75  
10 ์ง€๋ฐฉ         5     0   0.00262    0.000552     4.75  
11 ๊ณผ์ œ         0     4   0.000436   0.00276      0.158 
12 ๊ตญ์ •์šด์˜     0     4   0.000436   0.00276      0.158 
13 ์‹œ์ž‘         0     4   0.000436   0.00276      0.158 
14 ์ง€์‹         0     4   0.000436   0.00276      0.158 
15 ํ–‰๋ณต         3    23   0.00175    0.0132       0.132 
16 ์‹ค์ฒœ         0     5   0.000436   0.00331      0.132 
17 ์ •๋ณด         0     5   0.000436   0.00331      0.132 
18 ํˆฌ๋ช…         0     5   0.000436   0.00331      0.132 
19 ์—ฌ๋Ÿฌ๋ถ„       2    20   0.00131    0.0116       0.113 
20 ๋ฐ•๊ทผํ˜œ       0     8   0.000436   0.00496      0.0879

Result! ํ•จ์ˆ˜ filter๋Š” ์กฐ๊ฑด์— ๋งž๋Š” ํ–‰๋งŒ ์ถ”์ถœํ•˜๋Š” ํ•จ์ˆ˜์ด๋ฉฐ, ํ•จ์ˆ˜ rank๋Š” ๊ฐ’์˜ ์ˆœ์œ„๋ฅผ ๊ตฌํ•˜๋Š” ํ•จ์ˆ˜์ด๋‹ค. ์ด ๋‘ ํ•จ์ˆ˜๋ฅผ ์ด์šฉํ•˜์—ฌ ์˜ค์ฆˆ๋น„๊ฐ€ ๋†’์€ ์ƒ์œ„ 10๊ฐœ์™€ ํ•˜์œ„ 10๊ฐœ์˜ ๋‹จ์–ด๋ฅผ ์ถ”์ถœํ•  ์ˆ˜ ์žˆ๋‹ค.
๊ฒฐ๊ณผ๋ฅผ ๋ณด๋ฉด, ์ถ”์ถœํ•œ ๋‹จ์–ด ์ƒ์œ„ 10๊ฐœ๋Š” ๋ฌธ์žฌ์ธ ์ „ ๋Œ€ํ†ต๋ น์˜ ์—ฐ์„ค๋ฌธ์—์„œ ๋” ์ž์ฃผ ์‚ฌ์šฉํ•˜์—ฌ ์˜ค์ฆˆ๋น„๊ฐ€ ๋†’์€ ๋‹จ์–ด์ด๋‹ค. โ€œ๋ณต์ง€๊ตญ๊ฐ€โ€, โ€œ์—ฌ์„ฑโ€, โ€œ๊ณตํ‰โ€ ๊ฐ™์€ ๋‹จ์–ด๋ฅผ ์ž์ฃผ ์‚ฌ์šฉํ•จ์œผ๋กœ์จ ๋ฌธ์žฌ์ธ ์ „ ๋Œ€ํ†ต๋ น์ด ๋ฐ•๊ทผํ˜œ ์ „ ๋Œ€ํ†ต๋ น๋ณด๋‹ค ๋ณต์ง€์™€ ํ‰๋“ฑ์„ ๋” ๊ฐ•์กฐํ–ˆ๋‹ค๋Š” ๊ฒƒ์„ ์•Œ ์ˆ˜ ์žˆ๋‹ค.
๋ฐ˜๋Œ€๋กœ, ํ•˜์œ„ 10๊ฐœ๋Š” ๋ฐ•๊ทผํ˜œ ์ „ ๋Œ€ํ†ต๋ น์˜ ์—ฐ์„ค๋ฌธ์—์„œ ๋” ์ž์ฃผ ์‚ฌ์šฉํ•˜์—ฌ ์˜ค์ฆˆ๋น„๊ฐ€ ๋‚ฎ์€ ๋‹จ์–ด์ด๋‹ค. โ€œ๋ฐ•๊ทผํ˜œโ€, โ€œ์—ฌ๋Ÿฌ๋ถ„โ€ ๊ฐ™์€ ๋‹จ์–ด๋ฅผ ๋ณด๋ฉด ๋ฐ•๊ทผํ˜œ ์ „ ๋Œ€ํ†ต๋ น์ด ๋ฌธ์žฌ์ธ ์ „ ๋Œ€ํ†ต๋ น๋ณด๋‹ค ๊ฐœ์ธ์˜ ์ •์ฒด์„ฑ๊ณผ ๊ตญ๋ฏผ๊ณผ์˜ ์œ ๋Œ€๊ฐ์„ ๋” ๊ฐ•์กฐํ–ˆ๋‹ค๋Š” ๊ฒƒ์„ ์•Œ ์ˆ˜ ์žˆ๋‹ค.
Caution! ๋‹จ์–ด ๋นˆ๋„ ๋น„๊ต์—์„œ ๋‹จ์ˆœํžˆ ์‚ฌ์šฉ ๋นˆ๋„๊ฐ€ ๋†’์€ ๋‹จ์–ด๋Š” โ€œ๊ตญ๋ฏผโ€, โ€œ์šฐ๋ฆฌโ€, โ€œ์‚ฌํšŒโ€ ๊ฐ™์€ ๋ณดํŽธ์ ์ธ ๋‹จ์–ด๋ผ ์—ฐ์„ค๋ฌธ์˜ ์ฐจ์ด๊ฐ€ ์ž˜ ๋“œ๋Ÿฌ๋‚˜์ง€ ์•Š์•˜๋‹ค. ๋ฐ˜๋ฉด, ์˜ค์ฆˆ๋น„ ๊ธฐ์ค€์œผ๋กœ ์ถ”์ถœํ•œ ๋‹จ์–ด๋Š” ๋‘ ์—ฐ์„ค๋ฌธ ์ค‘ ํ•œ์ชฝ์—์„œ ๋น„์ค‘์ด ๋” ํฐ ๋‹จ์–ด์ด๋ฏ€๋กœ ์ด๋ฅผ ํ†ตํ•ด ์—ฐ์„ค๋ฌธ์˜ ์ฐจ์ด๋ฅผ ๋ถ„๋ช…ํ•˜๊ฒŒ ์•Œ ์ˆ˜ ์žˆ๋‹ค.


2-4. 시각화

# Add the variables needed for the plot.
# `top10` (built in section 2-3) holds the 10 words with the highest and the
# 10 words with the lowest odds ratio. An odds ratio above 1 means the word is
# relatively more prominent in Moon's speech, so such rows are tagged "moon"
# and take their count from column `moon`; all other rows are tagged "park"
# and take their count from column `park`.
top10 <- mutate(
  top10,
  president = ifelse(odds_ratio > 1, "moon", "park"),
  n = ifelse(odds_ratio > 1, moon, park)
)

top10
# A tibble: 20 ร— 8
   word    moon  park ratio_moon ratio_park odds_ratio president     n
   <chr>  <int> <int>      <dbl>      <dbl>      <dbl> <chr>     <int>
 1 ๊ฐ•์ž       5     0   0.00262    0.000552     4.75   moon          5
 2 ๊ณตํ‰       5     0   0.00262    0.000552     4.75   moon          5
 3 ๋Œ€ํ†ตโ€ฆ      5     0   0.00262    0.000552     4.75   moon          5
 4 ๋ณดํ†ต       5     0   0.00262    0.000552     4.75   moon          5
 5 ๋ณต์ง€โ€ฆ      8     0   0.00393    0.000552     7.12   moon          8
 6 ์ƒ์ƒ       5     0   0.00262    0.000552     4.75   moon          5
 7 ์„ธ์ƒ       6     0   0.00306    0.000552     5.54   moon          6
 8 ์—ฌ๋Ÿฌ๋ถ„     2    20   0.00131    0.0116       0.113  park         20
 9 ์—ฌ์„ฑ       6     0   0.00306    0.000552     5.54   moon          6
10 ์ •์˜       6     0   0.00306    0.000552     5.54   moon          6
11 ์ง€๋ฐฉ       5     0   0.00262    0.000552     4.75   moon          5
12 ํ–‰๋ณต       3    23   0.00175    0.0132       0.132  park         23
13 ๊ณผ์ œ       0     4   0.000436   0.00276      0.158  park          4
14 ๊ตญ์ •โ€ฆ      0     4   0.000436   0.00276      0.158  park          4
15 ๋ฐ•๊ทผํ˜œ     0     8   0.000436   0.00496      0.0879 park          8
16 ์‹œ์ž‘       0     4   0.000436   0.00276      0.158  park          4
17 ์‹ค์ฒœ       0     5   0.000436   0.00331      0.132  park          5
18 ์ •๋ณด       0     5   0.000436   0.00331      0.132  park          5
19 ์ง€์‹       0     4   0.000436   0.00276      0.158  park          4
20 ํˆฌ๋ช…       0     5   0.000436   0.00331      0.132  park          5
# Bar chart of the counts, one panel per president.
top10 %>%
  ggplot(aes(
    # reorder_within(): order the words by n separately within each president
    x = reorder_within(word, n, president),
    y = n,
    # a different bar colour for each president
    fill = president
  )) +
  geom_col() +
  # one panel per value of `president`; with scales = "free_y" the panels do
  # not share a single y axis
  facet_wrap(~president, scales = "free_y") +
  # remove the president name that reorder_within() appends to each word
  scale_x_reordered() +
  # rotate the bars so that they run horizontally
  coord_flip()

Result! ๊ทธ๋ž˜ํ”„๋ฅผ ๋ณด๋ฉด, ์ „๋ฐ˜์ ์œผ๋กœ ๋ฐ•๊ทผํ˜œ ์ „ ๋Œ€ํ†ต๋ น์˜ ์—ฐ์„ค๋ฌธ์—์„œ ๋‹จ์–ด ๋นˆ๋„๊ฐ€ ๋†’์œผ๋ฉฐ ๋ฌธ์žฌ์ธ ์ „ ๋Œ€ํ†ต๋ น์˜ ์—ฐ์„ค๋ฌธ์—์„œ๋Š” ๋‚ฎ์€ ๊ฒƒ์ฒ˜๋Ÿผ ๋ณด์ธ๋‹ค. ์ด๋Š” ๋ฐ•๊ทผํ˜œ ์ „ ๋Œ€ํ†ต๋ น์˜ ์—ฐ์„ค๋ฌธ์—์„œ ๊ฐ€์žฅ ๋งŽ์ด ์‚ฌ์šฉํ•œ ๋‹จ์–ด์ธ โ€œํ–‰๋ณตโ€์˜ ๋นˆ๋„๋ฅผ ๊ธฐ์ค€์œผ๋กœ ๋‘ ๊ทธ๋ž˜ํ”„์˜ x์ถ• ํฌ๊ธฐ๋ฅผ ๋˜‘๊ฐ™์ด ๊ณ ์ •ํ–ˆ๊ธฐ ๋•Œ๋ฌธ์ด๋‹ค. ๊ทธ๋ž˜ํ”„๋ณ„๋กœ x์ถ• ํฌ๊ธฐ๋ฅผ ๋‹ค๋ฅด๊ฒŒ ์ •ํ•ด์•ผ ๊ฐ ์—ฐ์„ค๋ฌธ์˜ ๋‹จ์–ด ๋น„์ค‘์„ ์ œ๋Œ€๋กœ ์•Œ ์ˆ˜ ์žˆ๋‹ค. ์ด๋ฅผ ์œ„ํ•ด ํ•จ์ˆ˜ facet_wrap์˜ ์˜ต์…˜ scales = "free"์„ ์ง€์ •ํ•˜์—ฌ x์ถ•๊ณผ y์ถ•์˜ ํฌ๊ธฐ๋ฅผ ๋ชจ๋‘ ๊ทธ๋ž˜ํ”„๋ณ„๋กœ ์ •ํ•  ์ˆ˜ ์žˆ๋‹ค.

# Same bar chart, but every panel now gets its own axes.
top10 %>%
  ggplot(aes(
    # reorder_within(): order the words by n separately within each president
    x = reorder_within(word, n, president),
    y = n,
    # a different bar colour for each president
    fill = president
  )) +
  geom_col() +
  # one panel per value of `president`; with scales = "free" neither the
  # x axis nor the y axis is shared between panels
  facet_wrap(~president, scales = "free") +
  # remove the president name that reorder_within() appends to each word
  scale_x_reordered() +
  # rotate the bars so that they run horizontally
  coord_flip()

Result! 그래프를 보면, 각 연설문에서 많이 사용한 단어 “복지국가”와 “행복”의 막대 길이는 같지만 빈도가 다르다. 이처럼 x축 크기가 그래프마다 다르면 막대 길이가 같아도 실제 값은 다르기 때문에 해석할 때 조심해야 한다.
Caution! 오즈비를 이용해 만든 막대 그래프는 각 텍스트에서 상대적으로 중요한 단어가 무엇인지 표현하기 위해 만든다. 막대 길이를 보고 두 텍스트의 단어 빈도를 비교하면 안 되고, 각 텍스트에서 상대적으로 중요한 단어가 무엇인지만 살펴봐야 한다.


2-5. 주요 단어가 사용된 문장 추출

# Split the raw text of both speeches into sentences.
# `bind_speeches` is the combined dataset of the two original speeches built
# in section 1; its text column `value` is tokenised sentence by sentence into
# the new column `sentence`.
speeches_sentence <- unnest_tokens(
  as_tibble(bind_speeches),
  input = value,
  output = sentence,
  token = "sentences"
)

# first rows of the sentence-level result
head(speeches_sentence)
# A tibble: 6 ร— 2
  president sentence                                                 
  <chr>     <chr>                                                    
1 moon      "์ •๊ถŒ๊ต์ฒด ํ•˜๊ฒ ์Šต๋‹ˆ๋‹ค!"                                   
2 moon      "์ •์น˜๊ต์ฒด ํ•˜๊ฒ ์Šต๋‹ˆ๋‹ค!"                                   
3 moon      "์‹œ๋Œ€๊ต์ฒด ํ•˜๊ฒ ์Šต๋‹ˆ๋‹ค!"                                   
4 moon      ""                                                       
5 moon      "โ€˜๋ถˆ๋น„๋ถˆ๋ช…(ไธ้ฃ›ไธ้ณด)โ€™์ด๋ผ๋Š” ๊ณ ์‚ฌ๊ฐ€ ์žˆ์Šต๋‹ˆ๋‹ค."            
6 moon      "๋‚จ์ชฝ ์–ธ๋• ๋‚˜๋ญ‡๊ฐ€์ง€์— ์•‰์•„, 3๋…„ ๋™์•ˆ ๋‚ ์ง€๋„ ์šธ์ง€๋„ ์•Š๋Š” โ€ฆ
tail(speeches_sentence)                             # print the last rows of the sentence-level result
# A tibble: 6 ร— 2
  president sentence                                                  
  <chr>     <chr>                                                     
1 park      ๊ตญ๋ฏผ ์—ฌ๋Ÿฌ๋ถ„์˜ ํ–‰๋ณต์ด ๊ณง ์ €์˜ ํ–‰๋ณต์ž…๋‹ˆ๋‹ค.                  
2 park      ์‚ฌ๋ž‘ํ•˜๋Š” ์กฐ๊ตญ ๋Œ€ํ•œ๋ฏผ๊ตญ๊ณผ ๊ตญ๋ฏผ ์—ฌ๋Ÿฌ๋ถ„์„ ์œ„ํ•ด, ์•ž์œผ๋กœ ๋จธ๋‚˜ โ€ฆ
3 park      ๊ทธ ๊ธธ์„ ํ•จ๊ป˜ ํ•ด์ฃผ์‹œ๊ธธ ๋ถ€ํƒ๋“œ๋ฆฝ๋‹ˆ๋‹ค.                       
4 park      ๊ฐ์‚ฌํ•ฉ๋‹ˆ๋‹ค.                                               
5 park      2012๋…„ 7์›” 10์ผ                                           
6 park      ์ƒˆ๋ˆ„๋ฆฌ๋‹น ์˜ˆ๋น„ํ›„๋ณด ๋ฐ•๊ทผํ˜œ                                  
# Sentences from Moon's speech that contain the search word
# (the top odds-ratio word for that speech).
filter(
  speeches_sentence,
  president == "moon",
  str_detect(sentence, "๋ณต์ง€๊ตญ๊ฐ€")
)
# A tibble: 8 ร— 2
  president sentence                                                  
  <chr>     <chr>                                                     
1 moon      โ€˜๊ฐ•ํ•œ ๋ณต์ง€๊ตญ๊ฐ€โ€™๋ฅผ ํ–ฅํ•ด ๋‹ด๋Œ€ํ•˜๊ฒŒ ๋‚˜์•„๊ฐ€๊ฒ ์Šต๋‹ˆ๋‹ค.           
2 moon      2๋ฐฑ ๋…„ ์ „ ์ด์™€ ๊ฐ™์€ ์†Œ๋“์žฌ๋ถ„๋ฐฐ, ๋ณต์ง€๊ตญ๊ฐ€์˜ ์‚ฌ์ƒ์„ ๊ฐ€์ง„ ์œ„โ€ฆ
3 moon      ์ด์ œ ์šฐ๋ฆฌ๋Š” ๋ณต์ง€๊ตญ๊ฐ€๋ฅผ ํ–ฅํ•ด ๋‹ด๋Œ€ํ•˜๊ฒŒ ๋‚˜์•„๊ฐˆ ๋•Œ์ž…๋‹ˆ๋‹ค.     
4 moon      ๋ถ€์ž๊ฐ์„ธ, 4๋Œ€๊ฐ• ์‚ฌ์—… ๊ฐ™์€ ์‹œ๋Œ€์ฐฉ์˜ค์  ๊ณผ์˜ค๋ฅผ ์ฒญ์‚ฐํ•˜๊ณ , ํ•˜โ€ฆ 
5 moon      ์šฐ๋ฆฌ๋Š” ์ง€๊ธˆ ๋ณต์ง€๊ตญ๊ฐ€๋กœ ๊ฐ€๋Š๋ƒ, ์–‘๊ทนํ™”์˜ ๋ถ„์—ด๋œ ๊ตญ๊ฐ€๋กœ ๊ฐ€โ€ฆ 
6 moon      ๊ฐ•ํ•œ ๋ณต์ง€๊ตญ๊ฐ€์ผ์ˆ˜๋ก ๊ตญ๊ฐ€ ๊ฒฝ์Ÿ๋ ฅ๋„ ๋” ๋†’์Šต๋‹ˆ๋‹ค.            
7 moon      ๊ฒฐ๊ตญ ๋ณต์ง€๊ตญ๊ฐ€๋กœ ๊ฐ€๋Š” ๊ธธ์€ ์‚ฌ๋žŒ์— ๋Œ€ํ•œ ํˆฌ์ž, ์ผ์ž๋ฆฌ ์ฐฝ์ถœ, โ€ฆ
8 moon      ์šฐ๋ฆฌ๋Š” ๊ณผ๊ฐํžˆ ๊ฐ•ํ•œ ๋ณดํŽธ์  ๋ณต์ง€๊ตญ๊ฐ€๋กœ ๊ฐ€์•ผ ํ•ฉ๋‹ˆ๋‹ค.         
# Sentences from Park's speech that contain the search word
# (the most frequent distinctive word of that speech).
filter(
  speeches_sentence,
  president == "park",
  str_detect(sentence, "ํ–‰๋ณต")
)
# A tibble: 19 ร— 2
   president sentence                                                 
   <chr>     <chr>                                                    
 1 park      ์ €๋Š” ์˜ค๋Š˜, ๊ตญ๋ฏผ ํ•œ ๋ถ„ ํ•œ ๋ถ„์˜ ๊ฟˆ์ด ์ด๋ฃจ์–ด์ง€๋Š” ํ–‰๋ณตํ•œ ๋Œ€โ€ฆ 
 2 park      ๊ตญ๊ฐ€๋Š” ๋ฐœ์ „ํ–ˆ๊ณ , ๊ฒฝ์ œ๋Š” ์„ฑ์žฅํ–ˆ๋‹ค๋Š”๋ฐ, ๋‚˜์˜ ์‚ถ์€ ๋‚˜์•„์ง€์ง€โ€ฆ
 3 park      ๊ณผ๊ฑฐ์—๋Š” ๊ตญ๊ฐ€์˜ ๋ฐœ์ „์ด ๊ตญ๋ฏผ์˜ ํ–‰๋ณต์œผ๋กœ ์ด์–ด์กŒ์Šต๋‹ˆ๋‹ค.     
 4 park      ๊ฐœ์ธ์˜ ์ฐฝ์˜๋ ฅ์ด ์ค‘์š”ํ•œ ์ง€์‹๊ธฐ๋ฐ˜์‚ฌํšŒ์—์„œ๋Š” ๊ตญ๋ฏผ ํ•œ ์‚ฌ๋žŒ, โ€ฆ
 5 park      ์ด์ œ ๊ตญ์ •์šด์˜์˜ ํŒจ๋Ÿฌ๋‹ค์ž„์„ ๊ตญ๊ฐ€์—์„œ ๊ตญ๋ฏผ์œผ๋กœ, ๊ฐœ์ธ์˜ ์‚ถโ€ฆ 
 6 park      ๊ตญ๋ฏผ ๊ฐœ๊ฐœ์ธ์˜ ๊ฟˆ์„ ํ–ฅํ•œ ๋…ธ๋ ฅ์ด ๊ตญ๊ฐ€๋ฅผ ๋ฐœ์ „์‹œํ‚ค๊ณ  ๊ตญ๊ฐ€ ๋ฐœโ€ฆ
 7 park      ์ €๋Š” โ€˜๊ฒฝ์ œ๋ฏผ์ฃผํ™” ์‹คํ˜„โ€™, โ€˜์ผ์ž๋ฆฌ ์ฐฝ์ถœโ€™, ๊ทธ๋ฆฌ๊ณ  โ€˜ํ•œ๊ตญโ€ฆ     
 8 park      ๊ตญ๋ฏผํ–‰๋ณต์˜ ๊ธธ์„ ์—ด์–ด๊ฐˆ ์ฒซ ๋ฒˆ์งธ ๊ณผ์ œ๋กœ, ์ €๋Š” ๊ฒฝ์ œ๋ฏผ์ฃผํ™”๋ฅผโ€ฆ
 9 park      ๊ตญ๋ฏผํ–‰๋ณต์˜ ๊ธธ์„ ์—ด์–ด๊ฐˆ ๋‘ ๋ฒˆ์งธ ๊ณผ์ œ๋กœ, ์ €๋Š” ์ข‹์€ ์ผ์ž๋ฆฌ โ€ฆ
10 park      ๊ตญ๋ฏผํ–‰๋ณต์˜ ๊ธธ์„ ์—ด์–ด๊ฐˆ ์„ธ ๋ฒˆ์งธ ๊ณผ์ œ๋กœ, ์šฐ๋ฆฌ์˜ ์‹ค์ •์— ๋งžโ€ฆ 
11 park      ์ €๋Š” ๊ตญ๋ฏผํ–‰๋ณต์„ ์œ„ํ•ด โ€˜๊ฒฝ์ œ๋ฏผ์ฃผํ™”-์ผ์ž๋ฆฌ-๋ณต์ง€โ€™๋ฅผ ์•„์šฐ๋ฅดโ€ฆ  
12 park      ๋ชจ๋“  ๊ณ„์ธต์˜ ๊ตญ๋ฏผ์ด ํ•จ๊ป˜ ์ฐธ์—ฌํ•ด ๋งŒ๋“ค๊ณ , ์ •๋ถ€์™€ ๊ธฐ์—…, ์ง€์—ญโ€ฆ
13 park      50๋…„ ์ „ ๊ฒฝ์ œ๊ฐœ๋ฐœ 5๊ฐœ๋…„ ๊ณ„ํš์ด ์‚ฐ์—…ํ™”์˜ ๊ธฐ์ ์„ ์ด๋ค„๋ƒˆ๋“ฏ,โ€ฆ 
14 park      ์ €๋Š” ์ง€์†๊ฐ€๋Šฅํ•œ ๊ตญ๋ฏผ ํ–‰๋ณต์„ ๋งŒ๋“ค ์ˆ˜ ์žˆ๋„๋ก,์‚ฌ๋žŒ์— ๋Œ€ํ•œ โ€ฆ 
15 park      ์ € ๋ฐ•๊ทผํ˜œ, ๊ฒฝ์Ÿ๊ณผ ์ž…์‹œ์— ๋งค๋ชฐ๋œ ๊ต์œก์„โ€˜ํ•จ๊ป˜ํ•˜๋Š” ํ–‰๋ณต๊ตโ€ฆ  
16 park      ์กด๊ฒฝํ•˜๋Š” ๊ตญ๋ฏผ์—ฌ๋Ÿฌ๋ถ„, ๊ตญ๋ฏผํ–‰๋ณต์„ ์œ„ํ•œ ๋…ธ๋ ฅ์ด ์•ˆ์ •์ ์œผ๋กœ โ€ฆ 
17 park      ๊ตญ๋ฏผ ์—ฌ๋Ÿฌ๋ถ„, ๊ตญ๋ฏผํ–‰๋ณต์˜ ๊ฟˆ์„ ์ด๋ค„๋‚ด๊ธฐ ์œ„ํ•ด์„œ๋Š”, ๋จผ์ € ์ •โ€ฆ 
18 park      ๊ตญ๋ฏผ๋“ค์ด ๊ฟˆ์œผ๋กœ๋งŒ ๊ฐ€์กŒ๋˜ ํ–‰๋ณตํ•œ ์‚ถ์„ ์‹ค์ œ๋กœ ์ด๋ฃฐ ์ˆ˜ ์žˆ๋„โ€ฆ
19 park      ๊ตญ๋ฏผ ์—ฌ๋Ÿฌ๋ถ„์˜ ํ–‰๋ณต์ด ๊ณง ์ €์˜ ํ–‰๋ณต์ž…๋‹ˆ๋‹ค.                 

Result! ํ•จ์ˆ˜ filter์™€ str_detect๋ฅผ ์ด์šฉํ•˜์—ฌ ๊ฐ ์—ฐ์„ค๋ฌธ์—์„œ ์ฃผ์š” ๋‹จ์–ด๋ฅผ ์‚ฌ์šฉํ•œ ๋ฌธ์žฅ์„ ์ถ”์ถœํ•  ์ˆ˜ ์žˆ๋‹ค. ์ถ”์ถœํ•œ ๋ฌธ์žฅ์„ ๋ณด๋ฉด ๋‹จ์–ด๊ฐ€ ์–ด๋–ค ์˜๋ฏธ๋กœ ์‚ฌ์šฉ๋˜์—ˆ๋Š”์ง€ ์•Œ ์ˆ˜ ์žˆ๋‹ค.


2-6. 중요도가 비슷한 단어 추출

# The 10 words whose odds ratio is closest to 1.
# `odds_df` is the odds-ratio table computed in section 2-2-2; rows are sorted
# by the distance of the odds ratio from 1 and the first 10 are shown.
head(arrange(odds_df, abs(1 - odds_ratio)), 10)
# A tibble: 10 ร— 6
   word    moon  park ratio_moon ratio_park odds_ratio
   <chr>  <int> <int>      <dbl>      <dbl>      <dbl>
 1 ๋•Œ๋ฌธ       4     3    0.00218    0.00221      0.989
 2 ๊ฐ•ํ™”       3     2    0.00175    0.00165      1.06 
 3 ๋ถ€๋‹ด       3     2    0.00175    0.00165      1.06 
 4 ์„ธ๊ณ„       3     2    0.00175    0.00165      1.06 
 5 ์ฑ…์ž„       3     2    0.00175    0.00165      1.06 
 6 ํ˜‘๋ ฅ       3     2    0.00175    0.00165      1.06 
 7 ๊ฑฐ๋Œ€       2     1    0.00131    0.00110      1.19 
 8 ๊ต์ฒด       2     1    0.00131    0.00110      1.19 
 9 ๊ทผ๋ณธ์      2     1    0.00131    0.00110      1.19 
10 ๊ธฐ๋ฐ˜       2     1    0.00131    0.00110      1.19 

Result! 출력 결과를 보면 대부분 보편적인 의미를 지니는 단어이며, 이러한 단어들은 빈도가 낮기 때문에 강조한 단어는 아니다.

# Words of similar importance that are also frequent.
# Starting from the odds-ratio table of section 2-2-2:
#   1. keep only words used at least 5 times in each of the two speeches,
#   2. sort them by how close the odds ratio is to 1,
#   3. show the first 10.
head(
  arrange(
    filter(odds_df, moon >= 5, park >= 5),
    abs(1 - odds_ratio)
  ),
  10
)
# A tibble: 10 ร— 6
   word      moon  park ratio_moon ratio_park odds_ratio
   <chr>    <int> <int>      <dbl>      <dbl>      <dbl>
 1 ์‚ฌํšŒ        14     9    0.00655    0.00552      1.19 
 2 ์‚ฌ๋žŒ         9     9    0.00436    0.00552      0.791
 3 ๊ฒฝ์ œ        15    15    0.00698    0.00883      0.791
 4 ์ง€์›         5     5    0.00262    0.00331      0.791
 5 ์šฐ๋ฆฌ        17    10    0.00786    0.00607      1.29 
 6 ๋ถˆ์•ˆ         7     8    0.00349    0.00496      0.703
 7 ์‚ฐ์—…         9     5    0.00436    0.00331      1.32 
 8 ๋Œ€ํ•œ๋ฏผ๊ตญ    11     6    0.00524    0.00386      1.36 
 9 ๊ตญ๊ฐ€         7    10    0.00349    0.00607      0.576
10 ๊ต์œก         6     9    0.00306    0.00552      0.554

Result! 두 연설문 모두 “사회”, “사람”, “경제” 등을 강조했음을 알 수 있다.


3. 로그 오즈비



3-1. 로그 오즈비 변수 추가

# Add the log odds ratio (natural log of `odds_ratio`) to the odds-ratio table
# computed in section 2-2-2.
logodds_df <- mutate(odds_df, log_odds_ratio = log(odds_ratio))

# Words with a relatively larger share in Moon's speech:
# sort by log odds ratio, largest first.
arrange(logodds_df, desc(log_odds_ratio))
# A tibble: 955 ร— 7
   word     moon  park ratio_moon ratio_park odds_ratio log_odds_ratio
   <chr>   <int> <int>      <dbl>      <dbl>      <dbl>          <dbl>
 1 ๋ณต์ง€๊ตญโ€ฆ     8     0    0.00393   0.000552       7.12           1.96
 2 ์„ธ์ƒ        6     0    0.00306   0.000552       5.54           1.71
 3 ์—ฌ์„ฑ        6     0    0.00306   0.000552       5.54           1.71
 4 ์ •์˜        6     0    0.00306   0.000552       5.54           1.71
 5 ๊ฐ•์ž        5     0    0.00262   0.000552       4.75           1.56
 6 ๊ณตํ‰        5     0    0.00262   0.000552       4.75           1.56
 7 ๋Œ€ํ†ต๋ นโ€ฆ     5     0    0.00262   0.000552       4.75           1.56
 8 ๋ณดํ†ต        5     0    0.00262   0.000552       4.75           1.56
 9 ์ƒ์ƒ        5     0    0.00262   0.000552       4.75           1.56
10 ์ง€๋ฐฉ        5     0    0.00262   0.000552       4.75           1.56
# โ„น 945 more rows
# Words with a relatively larger share in Park's speech:
# sort by log odds ratio, smallest first.
arrange(logodds_df, log_odds_ratio)
# A tibble: 955 ร— 7
   word     moon  park ratio_moon ratio_park odds_ratio log_odds_ratio
   <chr>   <int> <int>      <dbl>      <dbl>      <dbl>          <dbl>
 1 ๋ฐ•๊ทผํ˜œ      0     8   0.000436    0.00496     0.0879          -2.43
 2 ์—ฌ๋Ÿฌ๋ถ„      2    20   0.00131     0.0116      0.113           -2.18
 3 ํ–‰๋ณต        3    23   0.00175     0.0132      0.132           -2.03
 4 ์‹ค์ฒœ        0     5   0.000436    0.00331     0.132           -2.03
 5 ์ •๋ณด        0     5   0.000436    0.00331     0.132           -2.03
 6 ํˆฌ๋ช…        0     5   0.000436    0.00331     0.132           -2.03
 7 ๊ณผ์ œ        0     4   0.000436    0.00276     0.158           -1.84
 8 ๊ตญ์ •์šดโ€ฆ     0     4   0.000436    0.00276     0.158           -1.84
 9 ์‹œ์ž‘        0     4   0.000436    0.00276     0.158           -1.84
10 ์ง€์‹        0     4   0.000436    0.00276     0.158           -1.84
# โ„น 945 more rows
# Words with a similar share in both speeches:
# sort by the absolute log odds ratio, so values nearest 0 come first.
arrange(logodds_df, abs(log_odds_ratio))
# A tibble: 955 ร— 7
   word    moon  park ratio_moon ratio_park odds_ratio log_odds_ratio
   <chr>  <int> <int>      <dbl>      <dbl>      <dbl>          <dbl>
 1 ๋•Œ๋ฌธ       4     3    0.00218    0.00221      0.989        -0.0109
 2 ๊ฐ•ํ™”       3     2    0.00175    0.00165      1.06          0.0537
 3 ๋ถ€๋‹ด       3     2    0.00175    0.00165      1.06          0.0537
 4 ์„ธ๊ณ„       3     2    0.00175    0.00165      1.06          0.0537
 5 ์ฑ…์ž„       3     2    0.00175    0.00165      1.06          0.0537
 6 ํ˜‘๋ ฅ       3     2    0.00175    0.00165      1.06          0.0537
 7 ๊ฑฐ๋Œ€       2     1    0.00131    0.00110      1.19          0.171 
 8 ๊ต์ฒด       2     1    0.00131    0.00110      1.19          0.171 
 9 ๊ทผ๋ณธ์      2     1    0.00131    0.00110      1.19          0.171 
10 ๊ธฐ๋ฐ˜       2     1    0.00131    0.00110      1.19          0.171 
# โ„น 945 more rows

3-2. 시각화

# Select the words to plot.
# Rows of `logodds_df` (section 3-1) are grouped by the speech in which the
# word is relatively more prominent: "moon" when the log odds ratio is
# positive, "park" otherwise. Within each group the 10 words with the largest
# absolute log odds ratio are kept. The result stays grouped by `president`.
top10 <- logodds_df %>%
  group_by(president = ifelse(log_odds_ratio > 0, "moon", "park")) %>%
  slice_max(
    abs(log_odds_ratio),
    n = 10,
    # Written as FALSE rather than F: `F` is an ordinary variable that can be
    # reassigned. With ties cut, no more than n rows are kept per group even
    # when several words share the same log odds ratio.
    with_ties = FALSE
  )

# Print every selected word with its log odds ratio and president label,
# sorted from the largest log odds ratio to the smallest.
print(
  select(
    arrange(top10, desc(log_odds_ratio)),
    word, log_odds_ratio, president
  ),
  n = Inf
)
# A tibble: 20 ร— 3
# Groups:   president [2]
   word     log_odds_ratio president
   <chr>             <dbl> <chr>    
 1 ๋ณต์ง€๊ตญ๊ฐ€           1.96 moon     
 2 ์„ธ์ƒ               1.71 moon     
 3 ์—ฌ์„ฑ               1.71 moon     
 4 ์ •์˜               1.71 moon     
 5 ๊ฐ•์ž               1.56 moon     
 6 ๊ณตํ‰               1.56 moon     
 7 ๋Œ€ํ†ต๋ น์˜           1.56 moon     
 8 ๋ณดํ†ต               1.56 moon     
 9 ์ƒ์ƒ               1.56 moon     
10 ์ง€๋ฐฉ               1.56 moon     
11 ๊ณผ์ œ              -1.84 park     
12 ๊ตญ์ •์šด์˜          -1.84 park     
13 ์‹œ์ž‘              -1.84 park     
14 ์ง€์‹              -1.84 park     
15 ํ–‰๋ณต              -2.03 park     
16 ์‹ค์ฒœ              -2.03 park     
17 ์ •๋ณด              -2.03 park     
18 ํˆฌ๋ช…              -2.03 park     
19 ์—ฌ๋Ÿฌ๋ถ„            -2.18 park     
20 ๋ฐ•๊ทผํ˜œ            -2.43 park     
# Diverging bar chart of the log odds ratios of the selected words.
top10 %>%
  ggplot(aes(
    # reorder(): order the words by their log odds ratio (ascending)
    x = reorder(word, log_odds_ratio),
    y = log_odds_ratio,
    # a different bar colour for each president
    fill = president
  )) +
  geom_col() +
  # rotate the bars so that they run horizontally
  coord_flip()

Caution! 오즈비/로그 오즈비는 두 조건의 확률을 이용해 계산하므로 세 개 이상의 텍스트를 비교할 때 적절하지 않다는 단점이 있다. 텍스트를 둘씩 짝지어 따로 비교할 수도 있지만, 비교할 텍스트가 많으면 계산 절차가 번거롭고 결과를 해석하기 어렵기 때문에 효율적이지 않다.


4. TF-IDF





# Load the data.
# speeches_presidents.csv holds the candidacy declaration speeches of former
# presidents (one row per president, as the printed tibble shows).
raw_speeches <- read_csv(file = ".../speeches_presidents.csv")
raw_speeches
# A tibble: 4 ร— 2
  president value                                                     
  <chr>     <chr>                                                     
1 ๋ฌธ์žฌ์ธ    "์ •๊ถŒ๊ต์ฒด ํ•˜๊ฒ ์Šต๋‹ˆ๋‹ค!   ์ •์น˜๊ต์ฒด ํ•˜๊ฒ ์Šต๋‹ˆ๋‹ค!   ์‹œ๋Œ€๊ต์ฒด โ€ฆ 
2 ๋ฐ•๊ทผํ˜œ    "์กด๊ฒฝํ•˜๋Š” ๊ตญ๋ฏผ ์—ฌ๋Ÿฌ๋ถ„! ์ €๋Š” ์˜ค๋Š˜, ๊ตญ๋ฏผ ํ•œ ๋ถ„ ํ•œ ๋ถ„์˜ ๊ฟˆ์ดโ€ฆ
3 ์ด๋ช…๋ฐ•    "์กด๊ฒฝํ•˜๋Š” ๊ตญ๋ฏผ ์—ฌ๋Ÿฌ๋ถ„, ์‚ฌ๋ž‘ํ•˜๋Š” ํ•œ๋‚˜๋ผ๋‹น ๋‹น์› ๋™์ง€ ์—ฌ๋Ÿฌ๋ถ„โ€ฆ
4 ๋…ธ๋ฌดํ˜„    "์–ด๋Š๋•Œ์ธ๊ฐ€ ๋ถ€ํ„ฐ ์ œ๊ฐ€ ๋Œ€ํ†ต๋ น์ด ๋˜๊ฒ ๋‹ค๊ณ  ๋ง์„ ํ•˜๊ธฐ ์‹œ์ž‘ํ–ˆโ€ฆ 
# Pre-processing.
# Every character matched by the pattern below is replaced with a space (the
# pattern is meant to match anything that is not a Hangul syllable), and
# str_squish() then collapses repeated whitespace.
speeches <- mutate(
  raw_speeches,
  value = str_squish(str_replace_all(value, "[^๊ฐ€-ํžฃ]", " "))
)

speeches
# A tibble: 4 ร— 2
  president value                                                     
  <chr>     <chr>                                                     
1 ๋ฌธ์žฌ์ธ    ์ •๊ถŒ๊ต์ฒด ํ•˜๊ฒ ์Šต๋‹ˆ๋‹ค ์ •์น˜๊ต์ฒด ํ•˜๊ฒ ์Šต๋‹ˆ๋‹ค ์‹œ๋Œ€๊ต์ฒด ํ•˜๊ฒ ์Šต๋‹ˆโ€ฆ
2 ๋ฐ•๊ทผํ˜œ    ์กด๊ฒฝํ•˜๋Š” ๊ตญ๋ฏผ ์—ฌ๋Ÿฌ๋ถ„ ์ €๋Š” ์˜ค๋Š˜ ๊ตญ๋ฏผ ํ•œ ๋ถ„ ํ•œ ๋ถ„์˜ ๊ฟˆ์ด ์ดโ€ฆ
3 ์ด๋ช…๋ฐ•    ์กด๊ฒฝํ•˜๋Š” ๊ตญ๋ฏผ ์—ฌ๋Ÿฌ๋ถ„ ์‚ฌ๋ž‘ํ•˜๋Š” ํ•œ๋‚˜๋ผ๋‹น ๋‹น์› ๋™์ง€ ์—ฌ๋Ÿฌ๋ถ„ โ€ฆ 
4 ๋…ธ๋ฌดํ˜„    ์–ด๋Š๋•Œ์ธ๊ฐ€ ๋ถ€ํ„ฐ ์ œ๊ฐ€ ๋Œ€ํ†ต๋ น์ด ๋˜๊ฒ ๋‹ค๊ณ  ๋ง์„ ํ•˜๊ธฐ ์‹œ์ž‘ํ–ˆ์Šตโ€ฆ
# Tokenisation.
# The text column `value` is split into nouns with extractNoun as the
# tokenizer; each noun becomes one row in the new column `word`.
speeches <- unnest_tokens(
  speeches,
  input = value,
  output = word,
  token = extractNoun
)
speeches
# A tibble: 3,838 ร— 2
   president word    
   <chr>     <chr>   
 1 ๋ฌธ์žฌ์ธ    ์ •๊ถŒ๊ต์ฒด
 2 ๋ฌธ์žฌ์ธ    ์ •์น˜    
 3 ๋ฌธ์žฌ์ธ    ๊ต์ฒด    
 4 ๋ฌธ์žฌ์ธ    ์‹œ๋Œ€    
 5 ๋ฌธ์žฌ์ธ    ๊ต์ฒด    
 6 ๋ฌธ์žฌ์ธ    ๋ถˆ๋น„๋ถˆ๋ช…
 7 ๋ฌธ์žฌ์ธ    ๊ณ ์‚ฌ    
 8 ๋ฌธ์žฌ์ธ    ๋‚จ์ชฝ    
 9 ๋ฌธ์žฌ์ธ    ์–ธ๋•    
10 ๋ฌธ์žฌ์ธ    ๋‚˜๋ญ‡๊ฐ€์ง€
# โ„น 3,828 more rows

4-1. 단어 빈도 계산

# Word frequencies.
# Count how often each word occurs in each president's speech (using the
# pre-processed, tokenised `speeches`), then drop one-character words by
# keeping only words of two or more characters.
frequency <- filter(
  count(speeches, president, word),
  str_count(word) >= 2
)

frequency
# A tibble: 1,513 ร— 3
   president word      n
   <chr>     <chr> <int>
 1 ๋…ธ๋ฌดํ˜„    ๊ฐ€์Šด      2
 2 ๋…ธ๋ฌดํ˜„    ๊ฐ€ํ›ˆ      2
 3 ๋…ธ๋ฌดํ˜„    ๊ฐˆ๋“ฑ      1
 4 ๋…ธ๋ฌดํ˜„    ๊ฐ์˜ฅ      1
 5 ๋…ธ๋ฌดํ˜„    ๊ฐ•์ž      1
 6 ๋…ธ๋ฌดํ˜„    ๊ฐœํŽธ      4
 7 ๋…ธ๋ฌดํ˜„    ๊ฐœํ˜      4
 8 ๋…ธ๋ฌดํ˜„    ๊ฑด๊ตญ      1
 9 ๋…ธ๋ฌดํ˜„    ๊ฒฝ์„       1
10 ๋…ธ๋ฌดํ˜„    ๊ฒฝ์Ÿ      1
# โ„น 1,503 more rows

4-2. TF-IDF 계산

# Add the columns tf, idf and tf_idf to the word counts from section 4-1.
#   term     = word      : the column holding the words
#   document = president : the column that identifies each text
#   n        = n         : the column holding the word counts
# The result is sorted from the highest TF-IDF to the lowest.
frequency <- arrange(
  bind_tf_idf(frequency, term = word, document = president, n = n),
  desc(tf_idf)
)

frequency
# A tibble: 1,513 ร— 6
   president word         n      tf   idf tf_idf
   <chr>     <chr>    <int>   <dbl> <dbl>  <dbl>
 1 ๋…ธ๋ฌดํ˜„    ๊ณต์‹         6 0.0163  1.39  0.0227
 2 ๋…ธ๋ฌดํ˜„    ๋น„์ ผ         6 0.0163  1.39  0.0227
 3 ๋…ธ๋ฌดํ˜„    ์ •๊ณ„         6 0.0163  1.39  0.0227
 4 ์ด๋ช…๋ฐ•    ๋ฆฌ๋”์‹ญ       6 0.0158  1.39  0.0219
 5 ๋…ธ๋ฌดํ˜„    ๊ถŒ๋ ฅ         9 0.0245  0.693 0.0170
 6 ๋…ธ๋ฌดํ˜„    ๊ฐœํŽธ         4 0.0109  1.39  0.0151
 7 ์ด๋ช…๋ฐ•    ๋‹น์›         4 0.0105  1.39  0.0146
 8 ์ด๋ช…๋ฐ•    ๋™์ง€         4 0.0105  1.39  0.0146
 9 ์ด๋ช…๋ฐ•    ์ผ๋ฅ˜๊ตญ๊ฐ€     4 0.0105  1.39  0.0146
10 ๋ฐ•๊ทผํ˜œ    ๋ฐ•๊ทผํ˜œ       8 0.00962 1.39  0.0133
# โ„น 1,503 more rows

Result! ๊ฒฐ๊ณผ๋ฅผ ๋ณด๋ฉด ๋ณ€์ˆ˜ tf, idf, tf-idf๊ฐ€ ์ถ”๊ฐ€๋˜์—ˆ๋‹ค. ๋ณ€์ˆ˜ tf์— ์ž…๋ ฅ๋œ ๊ฐ’์€ ํ•ด๋‹น ๋ฌธ์„œ์—์„œ ๋ช‡ ๋ฒˆ ์‚ฌ์šฉํ•˜์˜€๋Š”์ง€๋ฅผ ๋‚˜ํƒ€๋‚ด๋Š” ๋‹จ์ˆœ ๋นˆ๋„๊ฐ€ ์•„๋‹ˆ๋ผ ๋‹จ์–ด ์‚ฌ์šฉ ๋น„์œจ(โ€œํ•ด๋‹น ๋‹จ์–ด์˜ ๋นˆ๋„ ์ˆ˜/๋ชจ๋“  ๋‹จ์–ด์˜ ๋นˆ๋„ ํ•ฉโ€)์„ ์˜๋ฏธํ•œ๋‹ค.


Caution! TF-IDF๋ฅผ ์ด์šฉํ•˜๋ฉด ํ…์ŠคํŠธ์˜ ํŠน์ง•์„ ๋“œ๋Ÿฌ๋‚ด๋Š” ์ค‘์š”ํ•œ ๋‹จ์–ด๊ฐ€ ๋ฌด์—‡์ธ์ง€ ํŒŒ์•…ํ•  ์ˆ˜ ์žˆ๋‹ค. ๋ณ€์ˆ˜ tf-idf๊ฐ€ ๋†’์€ ๋‹จ์–ด๋ฅผ ์‚ดํŽด๋ณด๋ฉด ๊ฐ ๋Œ€ํ†ต๋ น์ด ๋‹ค๋ฅธ ๋Œ€ํ†ต๋ น๋“ค๊ณผ ๋‹ฌ๋ฆฌ ๋ฌด์—‡์„ ๊ฐ•์กฐํ–ˆ๋Š”์ง€ ์•Œ ์ˆ˜ ์žˆ๋‹ค. ๋ฐ˜๋ฉด, TF-IDF๊ฐ€ ๋‚ฎ์€ ๋‹จ์–ด๋ฅผ ์‚ดํŽด๋ณด๋ฉด ์—ญ๋Œ€ ๋Œ€ํ†ต๋ น๋“ค์ด ๊ณตํ†ต์œผ๋กœ ์‚ฌ์šฉํ•œ ํ”ํ•œ ๋‹จ์–ด๊ฐ€ ๋ฌด์—‡์ธ์ง€ ํŒŒ์•…ํ•  ์ˆ˜ ์žˆ๋‹ค.

# Moon Jae-in's speech only, sorted from the highest TF-IDF to the lowest.
arrange(
  filter(frequency, president == "๋ฌธ์žฌ์ธ"),
  desc(tf_idf)
)
# A tibble: 688 ร— 6
   president word         n      tf   idf  tf_idf
   <chr>     <chr>    <int>   <dbl> <dbl>   <dbl>
 1 ๋ฌธ์žฌ์ธ    ๋ณต์ง€๊ตญ๊ฐ€     8 0.00608 1.39  0.00843
 2 ๋ฌธ์žฌ์ธ    ์—ฌ์„ฑ         6 0.00456 1.39  0.00633
 3 ๋ฌธ์žฌ์ธ    ๊ณตํ‰         5 0.00380 1.39  0.00527
 4 ๋ฌธ์žฌ์ธ    ๋‹ด์Ÿ์ด       5 0.00380 1.39  0.00527
 5 ๋ฌธ์žฌ์ธ    ๋Œ€ํ†ต๋ น์˜     5 0.00380 1.39  0.00527
 6 ๋ฌธ์žฌ์ธ    ๋ณดํ†ต         5 0.00380 1.39  0.00527
 7 ๋ฌธ์žฌ์ธ    ์ƒ์ƒ         5 0.00380 1.39  0.00527
 8 ๋ฌธ์žฌ์ธ    ์šฐ๋ฆฌ๋‚˜๋ผ    10 0.00760 0.693 0.00527
 9 ๋ฌธ์žฌ์ธ    ์ง€๋ฐฉ         5 0.00380 1.39  0.00527
10 ๋ฌธ์žฌ์ธ    ํ™•๋Œ€        10 0.00760 0.693 0.00527
# โ„น 678 more rows
# Moon Jae-in's speech only, sorted from the lowest TF-IDF to the highest.
arrange(
  filter(frequency, president == "๋ฌธ์žฌ์ธ"),
  tf_idf
)
# A tibble: 688 ร— 6
   president word       n       tf   idf tf_idf
   <chr>     <chr>  <int>    <dbl> <dbl>  <dbl>
 1 ๋ฌธ์žฌ์ธ    ๊ฒฝ์Ÿ       6 0.00456      0      0
 2 ๋ฌธ์žฌ์ธ    ๊ฒฝ์ œ      15 0.0114       0      0
 3 ๋ฌธ์žฌ์ธ    ๊ณ ํ†ต       4 0.00304      0      0
 4 ๋ฌธ์žฌ์ธ    ๊ณผ๊ฑฐ       1 0.000760     0      0
 5 ๋ฌธ์žฌ์ธ    ๊ตญ๋ฏผ      21 0.0160       0      0
 6 ๋ฌธ์žฌ์ธ    ๊ธฐํšŒ       5 0.00380      0      0
 7 ๋ฌธ์žฌ์ธ    ๋Œ€ํ†ต๋ น    12 0.00913      0      0
 8 ๋ฌธ์žฌ์ธ    ๋™์•ˆ       2 0.00152      0      0
 9 ๋ฌธ์žฌ์ธ    ๋“ค์ด       9 0.00684      0      0
10 ๋ฌธ์žฌ์ธ    ๋งˆ์Œ       2 0.00152      0      0
# โ„น 678 more rows
# Park Geun-hye's speech only, sorted from the highest TF-IDF to the lowest.
arrange(
  filter(frequency, president == "๋ฐ•๊ทผํ˜œ"),
  desc(tf_idf)
)
# A tibble: 407 ร— 6
   president word         n      tf   idf  tf_idf
   <chr>     <chr>    <int>   <dbl> <dbl>   <dbl>
 1 ๋ฐ•๊ทผํ˜œ    ๋ฐ•๊ทผํ˜œ       8 0.00962 1.39  0.0133 
 2 ๋ฐ•๊ทผํ˜œ    ์ •๋ณด         5 0.00601 1.39  0.00833
 3 ๋ฐ•๊ทผํ˜œ    ํˆฌ๋ช…         5 0.00601 1.39  0.00833
 4 ๋ฐ•๊ทผํ˜œ    ํ–‰๋ณต        23 0.0276  0.288 0.00795
 5 ๋ฐ•๊ทผํ˜œ    ๊ต์œก         9 0.0108  0.693 0.00750
 6 ๋ฐ•๊ทผํ˜œ    ๊ตญ์ •์šด์˜     4 0.00481 1.39  0.00666
 7 ๋ฐ•๊ทผํ˜œ    ์ •๋ถ€        17 0.0204  0.288 0.00588
 8 ๋ฐ•๊ทผํ˜œ    ๊ฐœ๊ฐœ์ธ       3 0.00361 1.39  0.00500
 9 ๋ฐ•๊ทผํ˜œ    ๊ฐœ์ธ         3 0.00361 1.39  0.00500
10 ๋ฐ•๊ทผํ˜œ    ๊ณต๊ฐœ         3 0.00361 1.39  0.00500
# โ„น 397 more rows
# Park Geun-hye's speech only, sorted from the lowest TF-IDF to the highest.
arrange(
  filter(frequency, president == "๋ฐ•๊ทผํ˜œ"),
  tf_idf
)
# A tibble: 407 ร— 6
   president word       n      tf   idf tf_idf
   <chr>     <chr>  <int>   <dbl> <dbl>  <dbl>
 1 ๋ฐ•๊ทผํ˜œ    ๊ฒฝ์Ÿ       1 0.00120     0      0
 2 ๋ฐ•๊ทผํ˜œ    ๊ฒฝ์ œ      15 0.0180      0      0
 3 ๋ฐ•๊ทผํ˜œ    ๊ณ ํ†ต       4 0.00481     0      0
 4 ๋ฐ•๊ทผํ˜œ    ๊ณผ๊ฑฐ       2 0.00240     0      0
 5 ๋ฐ•๊ทผํ˜œ    ๊ตญ๋ฏผ      72 0.0865      0      0
 6 ๋ฐ•๊ทผํ˜œ    ๊ธฐํšŒ       1 0.00120     0      0
 7 ๋ฐ•๊ทผํ˜œ    ๋Œ€ํ†ต๋ น     3 0.00361     0      0
 8 ๋ฐ•๊ทผํ˜œ    ๋™์•ˆ       3 0.00361     0      0
 9 ๋ฐ•๊ทผํ˜œ    ๋“ค์ด       3 0.00361     0      0
10 ๋ฐ•๊ทผํ˜œ    ๋งˆ์Œ       3 0.00361     0      0
# โ„น 397 more rows
# Lee Myung-bak's speech only, sorted from the highest TF-IDF to the lowest.
arrange(
  filter(frequency, president == "์ด๋ช…๋ฐ•"),
  desc(tf_idf)
)
# A tibble: 202 ร— 6
   president word         n      tf   idf  tf_idf
   <chr>     <chr>    <int>   <dbl> <dbl>   <dbl>
 1 ์ด๋ช…๋ฐ•    ๋ฆฌ๋”์‹ญ       6 0.0158  1.39  0.0219 
 2 ์ด๋ช…๋ฐ•    ๋‹น์›         4 0.0105  1.39  0.0146 
 3 ์ด๋ช…๋ฐ•    ๋™์ง€         4 0.0105  1.39  0.0146 
 4 ์ด๋ช…๋ฐ•    ์ผ๋ฅ˜๊ตญ๊ฐ€     4 0.0105  1.39  0.0146 
 5 ์ด๋ช…๋ฐ•    ํ•œ๋‚˜๋ผ       7 0.0184  0.693 0.0128 
 6 ์ด๋ช…๋ฐ•    ๋‚˜๋ผ        15 0.0395  0.288 0.0114 
 7 ์ด๋ช…๋ฐ•    ๋„์•ฝ         3 0.00789 1.39  0.0109 
 8 ์ด๋ช…๋ฐ•    ์ผํ•˜         3 0.00789 1.39  0.0109 
 9 ์ด๋ช…๋ฐ•    ์‚ฌ๋ž‘         5 0.0132  0.693 0.00912
10 ์ด๋ช…๋ฐ•    ์ธ์ƒ         5 0.0132  0.693 0.00912
# โ„น 192 more rows
# Lee Myung-bak's speech only, sorted from the lowest TF-IDF to the highest.
arrange(
  filter(frequency, president == "์ด๋ช…๋ฐ•"),
  tf_idf
)
# A tibble: 202 ร— 6
   president word       n      tf   idf tf_idf
   <chr>     <chr>  <int>   <dbl> <dbl>  <dbl>
 1 ์ด๋ช…๋ฐ•    ๊ฒฝ์Ÿ       3 0.00789     0      0
 2 ์ด๋ช…๋ฐ•    ๊ฒฝ์ œ       5 0.0132      0      0
 3 ์ด๋ช…๋ฐ•    ๊ณ ํ†ต       1 0.00263     0      0
 4 ์ด๋ช…๋ฐ•    ๊ณผ๊ฑฐ       1 0.00263     0      0
 5 ์ด๋ช…๋ฐ•    ๊ตญ๋ฏผ      13 0.0342      0      0
 6 ์ด๋ช…๋ฐ•    ๊ธฐํšŒ       3 0.00789     0      0
 7 ์ด๋ช…๋ฐ•    ๋Œ€ํ†ต๋ น     4 0.0105      0      0
 8 ์ด๋ช…๋ฐ•    ๋™์•ˆ       1 0.00263     0      0
 9 ์ด๋ช…๋ฐ•    ๋“ค์ด       1 0.00263     0      0
10 ์ด๋ช…๋ฐ•    ๋งˆ์Œ       1 0.00263     0      0
# โ„น 192 more rows
# Roh Moo-hyun's speech only, sorted from the highest TF-IDF to the lowest.
arrange(
  filter(frequency, president == "๋…ธ๋ฌดํ˜„"),
  desc(tf_idf)
)
# A tibble: 216 ร— 6
   president word         n      tf   idf  tf_idf
   <chr>     <chr>    <int>   <dbl> <dbl>   <dbl>
 1 ๋…ธ๋ฌดํ˜„    ๊ณต์‹         6 0.0163  1.39  0.0227 
 2 ๋…ธ๋ฌดํ˜„    ๋น„์ ผ         6 0.0163  1.39  0.0227 
 3 ๋…ธ๋ฌดํ˜„    ์ •๊ณ„         6 0.0163  1.39  0.0227 
 4 ๋…ธ๋ฌดํ˜„    ๊ถŒ๋ ฅ         9 0.0245  0.693 0.0170 
 5 ๋…ธ๋ฌดํ˜„    ๊ฐœํŽธ         4 0.0109  1.39  0.0151 
 6 ๋…ธ๋ฌดํ˜„    ๊ตญํšŒ์˜์›     3 0.00817 1.39  0.0113 
 7 ๋…ธ๋ฌดํ˜„    ๋‚จ๋ถ๋Œ€ํ™”     3 0.00817 1.39  0.0113 
 8 ๋…ธ๋ฌดํ˜„    ์ด๋ฆฌ         3 0.00817 1.39  0.0113 
 9 ๋…ธ๋ฌดํ˜„    ๊ฐ€ํ›ˆ         2 0.00545 1.39  0.00755
10 ๋…ธ๋ฌดํ˜„    ๊ฐœํ˜         4 0.0109  0.693 0.00755
# โ„น 206 more rows
# Roh Moo-hyun's speech only, sorted from the lowest TF-IDF to the highest.
arrange(
  filter(frequency, president == "๋…ธ๋ฌดํ˜„"),
  tf_idf
)
# A tibble: 216 ร— 6
   president word       n      tf   idf tf_idf
   <chr>     <chr>  <int>   <dbl> <dbl>  <dbl>
 1 ๋…ธ๋ฌดํ˜„    ๊ฒฝ์Ÿ       1 0.00272     0      0
 2 ๋…ธ๋ฌดํ˜„    ๊ฒฝ์ œ       1 0.00272     0      0
 3 ๋…ธ๋ฌดํ˜„    ๊ณ ํ†ต       1 0.00272     0      0
 4 ๋…ธ๋ฌดํ˜„    ๊ณผ๊ฑฐ       1 0.00272     0      0
 5 ๋…ธ๋ฌดํ˜„    ๊ตญ๋ฏผ       7 0.0191      0      0
 6 ๋…ธ๋ฌดํ˜„    ๊ธฐํšŒ       1 0.00272     0      0
 7 ๋…ธ๋ฌดํ˜„    ๋Œ€ํ†ต๋ น     6 0.0163      0      0
 8 ๋…ธ๋ฌดํ˜„    ๋™์•ˆ       2 0.00545     0      0
 9 ๋…ธ๋ฌดํ˜„    ๋“ค์ด       4 0.0109      0      0
10 ๋…ธ๋ฌดํ˜„    ๋งˆ์Œ       1 0.00272     0      0
# โ„น 206 more rows

4-3. 시각화

# 1. Extract the key words (the words with the highest TF-IDF).
# `frequency` is the TF-IDF table from section 4-2. Grouping by `president`
# makes slice_max() pick the top words separately for each speech; the result
# stays grouped by `president`.
top10 <- frequency %>%
  group_by(president) %>%
  slice_max(
    tf_idf,
    # keep the 10 words with the highest TF-IDF in each speech
    n = 10,
    # Written as FALSE rather than F: `F` is an ordinary variable that can be
    # reassigned. With ties cut, no more than n rows are kept per group even
    # when several words share the same TF-IDF.
    with_ties = FALSE
  )

top10
# A tibble: 40 ร— 6
# Groups:   president [4]
   president word         n      tf   idf  tf_idf
   <chr>     <chr>    <int>   <dbl> <dbl>   <dbl>
 1 ๋…ธ๋ฌดํ˜„    ๊ณต์‹         6 0.0163  1.39  0.0227 
 2 ๋…ธ๋ฌดํ˜„    ๋น„์ ผ         6 0.0163  1.39  0.0227 
 3 ๋…ธ๋ฌดํ˜„    ์ •๊ณ„         6 0.0163  1.39  0.0227 
 4 ๋…ธ๋ฌดํ˜„    ๊ถŒ๋ ฅ         9 0.0245  0.693 0.0170 
 5 ๋…ธ๋ฌดํ˜„    ๊ฐœํŽธ         4 0.0109  1.39  0.0151 
 6 ๋…ธ๋ฌดํ˜„    ๊ตญํšŒ์˜์›     3 0.00817 1.39  0.0113 
 7 ๋…ธ๋ฌดํ˜„    ๋‚จ๋ถ๋Œ€ํ™”     3 0.00817 1.39  0.0113 
 8 ๋…ธ๋ฌดํ˜„    ์ด๋ฆฌ         3 0.00817 1.39  0.0113 
 9 ๋…ธ๋ฌดํ˜„    ๊ฐ€ํ›ˆ         2 0.00545 1.39  0.00755
10 ๋…ธ๋ฌดํ˜„    ๊ฐœํ˜         4 0.0109  0.693 0.00755
# โ„น 30 more rows
# 2. Convert `president` to a factor so the plot panels appear in a fixed order.
# Levels are given as UTF-8 Hangul names (Moon Jae-in, Park Geun-hye,
# Lee Myung-bak, Roh Moo-hyun); any value not listed here would become NA.
top10$president <- factor(top10$president,
                          levels = c("문재인", "박근혜", "이명박", "노무현"))

# 3. Bar chart of the top TF-IDF words, one panel per president
ggplot(top10,
       aes(x = reorder_within(word, tf_idf, president),  # order words by TF-IDF
                                                         # within each president
           y = tf_idf,
           fill = president)) +           # one bar colour per president
  geom_col(show.legend = FALSE) +         # bars; legend is redundant with facets
  coord_flip() +                          # horizontal bars
  facet_wrap(~president,                  # one panel per level of president
             scales = "free",             # each panel gets its own x and y scales
             ncol = 2) +                  # two panels per row
  scale_x_reordered()                     # strip the suffix that reorder_within
                                          # appends to each word label

Result! 그래프를 보면, 역대 대통령의 개성을 드러내는 단어를 파악할 수 있다.
Caution! ๋ชจ๋“  ๋ฌธ์„œ์— ์‚ฌ์šฉ๋œ ๋‹จ์–ด๋Š” IDF๊ฐ€ 0์ด๋ฏ€๋กœ TF-IDF๋„ 0์ด ๋œ๋‹ค. ๋”ฐ๋ผ์„œ TF-IDF๋ฅผ ํ™œ์šฉํ•˜๋ฉด ์–ด๋–ค ๋‹จ์–ด๊ฐ€ ํŠน์ • ๋ฌธ์„œ์— ํŠน์ถœ๋‚˜๊ฒŒ ๋งŽ์ด ์‚ฌ์šฉ๋˜๋”๋ผ๋„ ๋ชจ๋“  ๋ฌธ์„œ์— ์‚ฌ์šฉ๋˜๋ฉด ๋ฐœ๊ฒฌํ•  ์ˆ˜ ์—†๋Š” ํ•œ๊ณ„๊ฐ€ ์žˆ๋‹ค. โ€œWeighted log oddsโ€๋ฅผ ํ™œ์šฉํ•˜๋ฉด ์ด๋Ÿฐ ํ•œ๊ณ„๋ฅผ ๊ทน๋ณตํ•  ์ˆ˜ ์žˆ๋‹ค. Weighted log odds๋Š” ๋‹จ์–ด ๋“ฑ์žฅ ํ™•๋ฅ ์„ ๊ฐ€์ค‘์น˜๋กœ ์ด์šฉํ•˜๊ธฐ ๋•Œ๋ฌธ์— ์–ด๋–ค ๋‹จ์–ด๊ฐ€ ๋ชจ๋“  ๋ฌธ์„œ์— ์‚ฌ์šฉ๋˜๋”๋ผ๋„ ํŠน์ • ๋ฌธ์„œ์— ๋งŽ์ด ์‚ฌ์šฉ๋˜๋ฉด ๋ฐœ๊ฒฌํ•  ์ˆ˜ ์žˆ๋‹ค. ๋˜ํ•œ, ์˜ค์ฆˆ๋น„์™€ ๋‹ฌ๋ฆฌ ์…‹ ์ด์ƒ์˜ ๋ฌธ์„œ๋ฅผ ๋น„๊ตํ•  ๋•Œ๋„ ์‚ฌ์šฉํ•  ์ˆ˜ ์žˆ๋Š” ์žฅ์ ์ด ์žˆ๋‹ค. Weighted log odds๋Š” Package "tidylo"๋ฅผ ์ด์šฉํ•˜๋ฉด ์‰ฝ๊ฒŒ ๊ตฌํ•  ์ˆ˜ ์žˆ๋‹ค. ํ•ด๋‹น ํŒจํ‚ค์ง€์˜ ์ž์„ธํ•œ ์„ค๋ช…์€ ์—ฌ๊ธฐ๋ฅผ ์ฐธ๊ณ ํ•œ๋‹ค.


요약

# 1. Word-frequency comparison
# Clean the combined speech text and split it into noun tokens.
speeches <- bind_speeches %>%                # both speeches' raw text in one dataset
  # Preprocessing
  mutate(value = str_replace_all(value,
                                 "[^가-힣]",  # any character outside the Hangul
                                             # syllable range 가-힣
                                 " "),       # ...is replaced by a space
         value = str_squish(value)) %>%      # trim and collapse repeated whitespace
  # Tokenisation
  unnest_tokens(input = value,               # column holding the text to tokenise
                output = word,               # name of the resulting token column
                token = extractNoun)         # tokenise by nouns (KoNLP)

speeches
# A tibble: 2,997 ร— 2
   president word      
   <chr>     <chr>     
 1 moon      "์ •๊ถŒ๊ต์ฒด"
 2 moon      "ํ•˜๊ฒ ์Šต๋‹ˆ"
 3 moon      "์ •์น˜"    
 4 moon      "๊ต์ฒด"    
 5 moon      "ํ•˜๊ฒ ์Šต๋‹ˆ"
 6 moon      "์‹œ๋Œ€"    
 7 moon      "๊ต์ฒด"    
 8 moon      "ํ•˜๊ฒ ์Šต๋‹ˆ"
 9 moon      ""        
10 moon      "๋ถˆ๋น„๋ถˆ๋ช…"
# โ„น 2,987 more rows
# Word frequency per speech: count each (president, word) pair, then keep
# only words that are at least two characters long.
frequency <- filter(
  count(speeches, president, word),
  str_count(word) > 1                         # drops one-character and empty tokens
)

frequency
# A tibble: 1,131 ร— 3
   president word         n
   <chr>     <chr>    <int>
 1 moon      ๊ฐ€๋™         1
 2 moon      ๊ฐ€์‚ฌ         1
 3 moon      ๊ฐ€์Šด         2
 4 moon      ๊ฐ€์กฑ         1
 5 moon      ๊ฐ€์กฑ๊ตฌ์กฐ     1
 6 moon      ๊ฐ€์ง€         4
 7 moon      ๊ฐ€์น˜         3
 8 moon      ๊ฐ์ข…         1
 9 moon      ๊ฐ๋‹น         1
10 moon      ๊ฐ•๋ ฅ         3
# โ„น 1,121 more rows
# Extract the most frequently used words of each speech
top10 <- frequency %>%
  group_by(president) %>%                    # rank words separately for each president
  slice_max(n, n = 10,                       # the 10 words with the highest count n
            with_ties = FALSE)               # exactly 10 rows per group even on ties;
                                             # FALSE (not F) because F is reassignable

top10
# A tibble: 20 ร— 3
# Groups:   president [2]
   president word       n
   <chr>     <chr>  <int>
 1 moon      ๊ตญ๋ฏผ      21
 2 moon      ์ผ์ž๋ฆฌ    21
 3 moon      ๋‚˜๋ผ      19
 4 moon      ์šฐ๋ฆฌ      17
 5 moon      ๊ฒฝ์ œ      15
 6 moon      ์‚ฌํšŒ      14
 7 moon      ์„ฑ์žฅ      13
 8 moon      ๋Œ€ํ†ต๋ น    12
 9 moon      ์ •์น˜      12
10 moon      ํ•˜๊ฒŒ      12
11 park      ๊ตญ๋ฏผ      72
12 park      ํ–‰๋ณต      23
13 park      ์—ฌ๋Ÿฌ๋ถ„    20
14 park      ์ •๋ถ€      17
15 park      ๊ฒฝ์ œ      15
16 park      ์‹ ๋ขฐ      11
17 park      ๊ตญ๊ฐ€      10
18 park      ์šฐ๋ฆฌ      10
19 park      ๊ต์œก       9
20 park      ์‚ฌ๋žŒ       9
# 2. Odds ratio
# Reshape to wide form: one row per word, one count column per president.
frequency_wide <- pivot_wider(
  frequency,
  names_from = president,                           # values become the new column names
  values_from = n,                                  # counts fill the new columns
  values_fill = list(n = 0)                         # a word absent from a speech gets 0
)

frequency_wide
# A tibble: 955 ร— 3
   word      moon  park
   <chr>    <int> <int>
 1 ๊ฐ€๋™         1     0
 2 ๊ฐ€์‚ฌ         1     0
 3 ๊ฐ€์Šด         2     0
 4 ๊ฐ€์กฑ         1     1
 5 ๊ฐ€์กฑ๊ตฌ์กฐ     1     0
 6 ๊ฐ€์ง€         4     0
 7 ๊ฐ€์น˜         3     1
 8 ๊ฐ์ข…         1     0
 9 ๊ฐ๋‹น         1     0
10 ๊ฐ•๋ ฅ         3     0
# โ„น 945 more rows
# Odds ratio and log odds ratio of each word (moon relative to park).
# Adding 1 to every count keeps words missing from one speech from
# producing a zero share.
frequency_wide <- mutate(
  frequency_wide,
  ratio_moon = (moon + 1) / sum(moon + 1),           # word's share in the moon speech
  ratio_park = (park + 1) / sum(park + 1),           # word's share in the park speech
  odds_ratio = ratio_moon / ratio_park,              # > 1: relatively more in moon
  log_odds_ratio = log(odds_ratio)                   # > 0: moon, < 0: park
)

frequency_wide
# A tibble: 955 ร— 7
   word     moon  park ratio_moon ratio_park odds_ratio log_odds_ratio
   <chr>   <int> <int>      <dbl>      <dbl>      <dbl>          <dbl>
 1 ๊ฐ€๋™        1     0   0.000873   0.000552      1.58           0.459
 2 ๊ฐ€์‚ฌ        1     0   0.000873   0.000552      1.58           0.459
 3 ๊ฐ€์Šด        2     0   0.00131    0.000552      2.37           0.865
 4 ๊ฐ€์กฑ        1     1   0.000873   0.00110       0.791         -0.234
 5 ๊ฐ€์กฑ๊ตฌโ€ฆ     1     0   0.000873   0.000552      1.58           0.459
 6 ๊ฐ€์ง€        4     0   0.00218    0.000552      3.96           1.38 
 7 ๊ฐ€์น˜        3     1   0.00175    0.00110       1.58           0.459
 8 ๊ฐ์ข…        1     0   0.000873   0.000552      1.58           0.459
 9 ๊ฐ๋‹น        1     0   0.000873   0.000552      1.58           0.459
10 ๊ฐ•๋ ฅ        3     0   0.00175    0.000552      3.17           1.15 
# โ„น 945 more rows
# Extract the relatively important words of each speech by log odds ratio
top10 <- frequency_wide %>%
  # Label each word "moon" when its log odds ratio is positive and "park"
  # otherwise, and group by that label.
  group_by(president = ifelse(log_odds_ratio > 0, "moon", "park")) %>%
  # Within each group keep the 10 words with the largest absolute log odds ratio.
  slice_max(abs(log_odds_ratio), n = 10,
            with_ties = FALSE)               # exactly 10 rows per group even on ties;
                                             # FALSE (not F) because F is reassignable

top10
# A tibble: 20 ร— 8
# Groups:   president [2]
   word     moon  park ratio_moon ratio_park odds_ratio log_odds_ratio
   <chr>   <int> <int>      <dbl>      <dbl>      <dbl>          <dbl>
 1 ๋ณต์ง€๊ตญโ€ฆ     8     0   0.00393    0.000552     7.12             1.96
 2 ์„ธ์ƒ        6     0   0.00306    0.000552     5.54             1.71
 3 ์—ฌ์„ฑ        6     0   0.00306    0.000552     5.54             1.71
 4 ์ •์˜        6     0   0.00306    0.000552     5.54             1.71
 5 ๊ฐ•์ž        5     0   0.00262    0.000552     4.75             1.56
 6 ๊ณตํ‰        5     0   0.00262    0.000552     4.75             1.56
 7 ๋Œ€ํ†ต๋ นโ€ฆ     5     0   0.00262    0.000552     4.75             1.56
 8 ๋ณดํ†ต        5     0   0.00262    0.000552     4.75             1.56
 9 ์ƒ์ƒ        5     0   0.00262    0.000552     4.75             1.56
10 ์ง€๋ฐฉ        5     0   0.00262    0.000552     4.75             1.56
11 ๋ฐ•๊ทผํ˜œ      0     8   0.000436   0.00496      0.0879          -2.43
12 ์—ฌ๋Ÿฌ๋ถ„      2    20   0.00131    0.0116       0.113           -2.18
13 ํ–‰๋ณต        3    23   0.00175    0.0132       0.132           -2.03
14 ์‹ค์ฒœ        0     5   0.000436   0.00331      0.132           -2.03
15 ์ •๋ณด        0     5   0.000436   0.00331      0.132           -2.03
16 ํˆฌ๋ช…        0     5   0.000436   0.00331      0.132           -2.03
17 ๊ณผ์ œ        0     4   0.000436   0.00276      0.158           -1.84
18 ๊ตญ์ •์šดโ€ฆ     0     4   0.000436   0.00276      0.158           -1.84
19 ์‹œ์ž‘        0     4   0.000436   0.00276      0.158           -1.84
20 ์ง€์‹        0     4   0.000436   0.00276      0.158           -1.84
# โ„น 1 more variable: president <chr>
# 3. TF-IDF
# Append tf, idf and tf_idf columns to the word counts, treating each
# president's speech as one document, then sort by descending TF-IDF.
frequency <- arrange(
  bind_tf_idf(frequency,
              term = word,             # column holding the words
              document = president,    # column identifying the document
              n = n),                  # column holding the word counts
  desc(tf_idf)
)

frequency
# A tibble: 1,131 ร— 6
   president word         n      tf   idf  tf_idf
   <chr>     <chr>    <int>   <dbl> <dbl>   <dbl>
 1 park      ๋ฐ•๊ทผํ˜œ       8 0.00932 0.693 0.00646
 2 moon      ๋ณต์ง€๊ตญ๊ฐ€     8 0.00599 0.693 0.00415
 3 park      ์‹ค์ฒœ         5 0.00583 0.693 0.00404
 4 park      ์ •๋ณด         5 0.00583 0.693 0.00404
 5 park      ํˆฌ๋ช…         5 0.00583 0.693 0.00404
 6 park      ๊ณผ์ œ         4 0.00466 0.693 0.00323
 7 park      ๊ตญ์ •์šด์˜     4 0.00466 0.693 0.00323
 8 park      ์‹œ์ž‘         4 0.00466 0.693 0.00323
 9 park      ์ง€์‹         4 0.00466 0.693 0.00323
10 moon      ์„ธ์ƒ         6 0.00449 0.693 0.00311
# โ„น 1,121 more rows
# Extract the relatively important words of each speech by TF-IDF
top10 <- frequency %>%
  group_by(president) %>%            # rank words separately for each president
  # Rank by tf_idf, not by the raw count n: ranking by n returns the most
  # frequent words, whose tf_idf is 0 whenever they occur in both speeches.
  slice_max(tf_idf, n = 10,          # the 10 words with the highest TF-IDF
            with_ties = FALSE)       # exactly 10 rows per group even on ties

top10
# A tibble: 20 ร— 6
# Groups:   president [2]
   president word       n      tf   idf tf_idf
   <chr>     <chr>  <int>   <dbl> <dbl>  <dbl>
 1 moon      ๊ตญ๋ฏผ      21 0.0157      0      0
 2 moon      ์ผ์ž๋ฆฌ    21 0.0157      0      0
 3 moon      ๋‚˜๋ผ      19 0.0142      0      0
 4 moon      ์šฐ๋ฆฌ      17 0.0127      0      0
 5 moon      ๊ฒฝ์ œ      15 0.0112      0      0
 6 moon      ์‚ฌํšŒ      14 0.0105      0      0
 7 moon      ์„ฑ์žฅ      13 0.00973     0      0
 8 moon      ๋Œ€ํ†ต๋ น    12 0.00898     0      0
 9 moon      ์ •์น˜      12 0.00898     0      0
10 moon      ํ•˜๊ฒŒ      12 0.00898     0      0
11 park      ๊ตญ๋ฏผ      72 0.0839      0      0
12 park      ํ–‰๋ณต      23 0.0268      0      0
13 park      ์—ฌ๋Ÿฌ๋ถ„    20 0.0233      0      0
14 park      ์ •๋ถ€      17 0.0198      0      0
15 park      ๊ฒฝ์ œ      15 0.0175      0      0
16 park      ์‹ ๋ขฐ      11 0.0128      0      0
17 park      ๊ตญ๊ฐ€      10 0.0117      0      0
18 park      ์šฐ๋ฆฌ      10 0.0117      0      0
19 park      ๊ต์œก       9 0.0105      0      0
20 park      ์‚ฌ๋žŒ       9 0.0105      0      0

Reuse

Text and figures are licensed under Creative Commons Attribution CC BY 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".