Morphological Analysis For English in Text Mining
문장을 구성하는 단어는 고유한 문법적 기능을 수행하며, 문법적 성질의 공통성에 따라 몇 갈래로 묶어 놓은 것을 품사라고 한다. 예를 들어, 대명사, 명사, 동사, 형용사, 부사 등이 있다. 이렇게 단어에 품사를 붙여주는 작업을 품사분석(Part-Of-Speech tagging) 또는 POS 분석이라고 한다.
영어에서는 텍스트 객체에 대해 먼저 문장 단위 주석작업을 실시한 후, 해당 문장에서 각 단어가 어떤 문법적 기능을 수행하는지 품사분석을 실시한다. 예시로 R을 소개하는 위키피디아 두 문단의 텍스트에 대해 POS 분석을 실시해보았다.
pacman::p_load("NLP", "openNLP",
"tm", "stringr")
# Sample text to annotate: an introduction to R, stored as a single string
# with its sentences separated by newlines.
R.wiki <- "R is a programming language and software environment for statistical computing and graphics supported by the R Foundation for Statistical Computing.
The R language is widely used among statisticians and data miners for developing statistical software and data analysis.
Polls, surveys of data miners, and studies of scholarly literature databases show that R's popularity has increased substantially in recent years.
R is a GNU package.
The source code for the R software environment is written primarily in C, Fortran, and R.
R is freely available under the GNU General Public License, and pre-compiled binary versions are provided for various operating systems.
While R has a command line interface, there are several graphical front-ends available."
annotate
: 주석 작업
Maxent_Sent_Token_Annotator()
: 문장 단위의 주석작업 실시
R.wiki.sent <- annotate(R.wiki,Maxent_Sent_Token_Annotator()) # annotate : 주석 작업 / Maxent_Sent_Token_Annotator : 문장 단위의 주석작업 실시
R.wiki.sent
id type start end features
1 sentence 1 148
2 sentence 162 281
3 sentence 295 440
4 sentence 453 471
5 sentence 485 722
6 sentence 736 822
annotate(object, Maxent_Word_Token_Annotator(), sent.result)
Maxent_Word_Token_Annotator()
: 단어 단위의 주석작업 실시
sent.result
: 문장 단위가 어떻게 주석작업 되었는지 나타내는 변수로, Maxent_Sent_Token_Annotator()의 결과
R.wiki.word <- annotate(R.wiki, Maxent_Word_Token_Annotator(), # Maxent_Word_Token_Annotator : 단어 단위의 주석작업 실시
R.wiki.sent) # 문장 단위가 어떻게 주석작업 되었는지 나타내는 변수
R.wiki.word
id type start end features
1 sentence 1 148 constituents=<<integer,22>>
2 sentence 162 281 constituents=<<integer,19>>
3 sentence 295 440 constituents=<<integer,25>>
4 sentence 453 471 constituents=<<integer,6>>
5 sentence 485 722 constituents=<<integer,40>>
6 sentence 736 822 constituents=<<integer,15>>
7 word 1 1
8 word 3 4
9 word 6 6
10 word 8 18
11 word 20 27
12 word 29 31
13 word 33 40
14 word 42 52
15 word 54 56
16 word 58 68
17 word 70 78
18 word 80 82
19 word 84 91
20 word 93 101
21 word 103 104
22 word 106 108
23 word 110 110
24 word 112 121
25 word 123 125
26 word 127 137
27 word 139 147
28 word 148 148
29 word 162 164
30 word 166 166
31 word 168 175
32 word 177 178
33 word 180 185
34 word 187 190
35 word 192 196
36 word 198 210
37 word 212 214
38 word 216 219
39 word 221 226
40 word 228 230
41 word 232 241
42 word 243 253
43 word 255 262
44 word 264 266
45 word 268 271
46 word 273 280
47 word 281 281
48 word 295 299
49 word 300 300
50 word 302 308
51 word 310 311
52 word 313 316
53 word 318 323
54 word 324 324
55 word 326 328
56 word 330 336
57 word 338 339
58 word 341 349
59 word 351 360
60 word 362 370
61 word 372 375
62 word 377 380
63 word 382 382
64 word 383 384
65 word 386 395
66 word 397 399
67 word 401 409
68 word 411 423
69 word 425 426
70 word 428 433
71 word 435 439
72 word 440 440
73 word 453 453
74 word 455 456
75 word 458 458
76 word 460 462
77 word 464 470
78 word 471 471
79 word 485 487
80 word 489 494
81 word 496 499
82 word 501 503
83 word 505 507
84 word 509 509
85 word 511 518
86 word 520 530
87 word 532 533
88 word 535 541
89 word 543 551
90 word 553 554
91 word 556 556
92 word 557 557
93 word 559 565
94 word 566 566
95 word 568 570
96 word 572 573
97 word 587 587
98 word 589 590
99 word 592 597
100 word 599 607
101 word 609 613
102 word 615 617
103 word 619 621
104 word 623 629
105 word 631 636
106 word 638 644
107 word 645 645
108 word 647 649
109 word 651 662
110 word 664 669
111 word 671 678
112 word 680 682
113 word 684 691
114 word 693 695
115 word 697 703
116 word 705 713
117 word 715 721
118 word 722 722
119 word 736 740
120 word 742 742
121 word 744 746
122 word 748 748
123 word 750 756
124 word 758 761
125 word 763 771
126 word 772 772
127 word 774 778
128 word 780 782
129 word 784 790
130 word 792 800
131 word 802 811
132 word 813 821
133 word 822 822
annotate(object, Maxent_POS_Tag_Annotator(), word.result)
Maxent_POS_Tag_Annotator()
: 품사분석 실시
word.result
: 단어 단위가 어떻게 주석작업 되었는지 나타내는 변수로, Maxent_Word_Token_Annotator()의 결과
POStag <- annotate(R.wiki, Maxent_POS_Tag_Annotator(), R.wiki.word) # Maxent_POS_Tag_Annotator : 품사분석
POStag
id type start end features
1 sentence 1 148 constituents=<<integer,22>>
2 sentence 162 281 constituents=<<integer,19>>
3 sentence 295 440 constituents=<<integer,25>>
4 sentence 453 471 constituents=<<integer,6>>
5 sentence 485 722 constituents=<<integer,40>>
6 sentence 736 822 constituents=<<integer,15>>
7 word 1 1 POS=NN
8 word 3 4 POS=VBZ
9 word 6 6 POS=DT
10 word 8 18 POS=NN
11 word 20 27 POS=NN
12 word 29 31 POS=CC
13 word 33 40 POS=NN
14 word 42 52 POS=NN
15 word 54 56 POS=IN
16 word 58 68 POS=JJ
17 word 70 78 POS=NN
18 word 80 82 POS=CC
19 word 84 91 POS=NNS
20 word 93 101 POS=VBN
21 word 103 104 POS=IN
22 word 106 108 POS=DT
23 word 110 110 POS=NN
24 word 112 121 POS=NNP
25 word 123 125 POS=IN
26 word 127 137 POS=NNP
27 word 139 147 POS=NNP
28 word 148 148 POS=.
29 word 162 164 POS=DT
30 word 166 166 POS=NN
31 word 168 175 POS=NN
32 word 177 178 POS=VBZ
33 word 180 185 POS=RB
34 word 187 190 POS=VBN
35 word 192 196 POS=IN
36 word 198 210 POS=NNS
37 word 212 214 POS=CC
38 word 216 219 POS=NNS
39 word 221 226 POS=NNS
40 word 228 230 POS=IN
41 word 232 241 POS=VBG
42 word 243 253 POS=JJ
43 word 255 262 POS=NN
44 word 264 266 POS=CC
45 word 268 271 POS=NNS
46 word 273 280 POS=NN
47 word 281 281 POS=.
48 word 295 299 POS=NNS
49 word 300 300 POS=,
50 word 302 308 POS=NNS
51 word 310 311 POS=IN
52 word 313 316 POS=NNS
53 word 318 323 POS=NNS
54 word 324 324 POS=,
55 word 326 328 POS=CC
56 word 330 336 POS=NNS
57 word 338 339 POS=IN
58 word 341 349 POS=JJ
59 word 351 360 POS=NN
60 word 362 370 POS=NNS
61 word 372 375 POS=VBP
62 word 377 380 POS=IN
63 word 382 382 POS=NN
64 word 383 384 POS=POS
65 word 386 395 POS=NN
66 word 397 399 POS=VBZ
67 word 401 409 POS=VBN
68 word 411 423 POS=RB
69 word 425 426 POS=IN
70 word 428 433 POS=JJ
71 word 435 439 POS=NNS
72 word 440 440 POS=.
73 word 453 453 POS=NN
74 word 455 456 POS=VBZ
75 word 458 458 POS=DT
76 word 460 462 POS=NNP
77 word 464 470 POS=NN
78 word 471 471 POS=.
79 word 485 487 POS=DT
80 word 489 494 POS=NN
81 word 496 499 POS=NN
82 word 501 503 POS=IN
83 word 505 507 POS=DT
84 word 509 509 POS=NN
85 word 511 518 POS=NN
86 word 520 530 POS=NN
87 word 532 533 POS=VBZ
88 word 535 541 POS=VBN
89 word 543 551 POS=RB
90 word 553 554 POS=IN
91 word 556 556 POS=NNP
92 word 557 557 POS=,
93 word 559 565 POS=NNP
94 word 566 566 POS=,
95 word 568 570 POS=CC
96 word 572 573 POS=NNP
97 word 587 587 POS=NN
98 word 589 590 POS=VBZ
99 word 592 597 POS=RB
100 word 599 607 POS=JJ
101 word 609 613 POS=IN
102 word 615 617 POS=DT
103 word 619 621 POS=NNP
104 word 623 629 POS=NNP
105 word 631 636 POS=NNP
106 word 638 644 POS=NNP
107 word 645 645 POS=,
108 word 647 649 POS=CC
109 word 651 662 POS=JJ
110 word 664 669 POS=JJ
111 word 671 678 POS=NNS
112 word 680 682 POS=VBP
113 word 684 691 POS=VBN
114 word 693 695 POS=IN
115 word 697 703 POS=JJ
116 word 705 713 POS=VBG
117 word 715 721 POS=NNS
118 word 722 722 POS=.
119 word 736 740 POS=IN
120 word 742 742 POS=NN
121 word 744 746 POS=VBZ
122 word 748 748 POS=DT
123 word 750 756 POS=NN
124 word 758 761 POS=NN
125 word 763 771 POS=NN
126 word 772 772 POS=,
127 word 774 778 POS=EX
128 word 780 782 POS=VBP
129 word 784 790 POS=JJ
130 word 792 800 POS=JJ
131 word 802 811 POS=NNS
132 word 813 821 POS=JJ
133 word 822 822 POS=.
features에 품사분석의 결과가 나타남
NN
: 명사
VBZ
: 3인칭 현재형 단수 동사
# 품사 Tagging된 단어의 개수
# Word annotations are listed after the sentence annotations, so the first
# word sits right after the last sentence entry.
word.start <- 1 + length(R.wiki.sent)
word.end <- length(R.wiki.word)
# `features` is a list, so flatten it into a vector of tags.
all.POS.tagged <- unlist(POStag$features[word.start:word.end])
# Print the tags (this statement had been fused into the comment above).
all.POS.tagged
POS POS POS POS POS POS POS POS POS POS POS
"NN" "VBZ" "DT" "NN" "NN" "CC" "NN" "NN" "IN" "JJ" "NN"
POS POS POS POS POS POS POS POS POS POS POS
"CC" "NNS" "VBN" "IN" "DT" "NN" "NNP" "IN" "NNP" "NNP" "."
POS POS POS POS POS POS POS POS POS POS POS
"DT" "NN" "NN" "VBZ" "RB" "VBN" "IN" "NNS" "CC" "NNS" "NNS"
POS POS POS POS POS POS POS POS POS POS POS
"IN" "VBG" "JJ" "NN" "CC" "NNS" "NN" "." "NNS" "," "NNS"
POS POS POS POS POS POS POS POS POS POS POS
"IN" "NNS" "NNS" "," "CC" "NNS" "IN" "JJ" "NN" "NNS" "VBP"
POS POS POS POS POS POS POS POS POS POS POS
"IN" "NN" "POS" "NN" "VBZ" "VBN" "RB" "IN" "JJ" "NNS" "."
POS POS POS POS POS POS POS POS POS POS POS
"NN" "VBZ" "DT" "NNP" "NN" "." "DT" "NN" "NN" "IN" "DT"
POS POS POS POS POS POS POS POS POS POS POS
"NN" "NN" "NN" "VBZ" "VBN" "RB" "IN" "NNP" "," "NNP" ","
POS POS POS POS POS POS POS POS POS POS POS
"CC" "NNP" "NN" "VBZ" "RB" "JJ" "IN" "DT" "NNP" "NNP" "NNP"
POS POS POS POS POS POS POS POS POS POS POS
"NNP" "," "CC" "JJ" "JJ" "NNS" "VBP" "VBN" "IN" "JJ" "VBG"
POS POS POS POS POS POS POS POS POS POS POS
"NNS" "." "IN" "NN" "VBZ" "DT" "NN" "NN" "NN" "," "EX"
POS POS POS POS POS POS
"VBP" "JJ" "JJ" "NNS" "JJ" "."
# Tabulate how often each POS tag occurs.
table(all.POS.tagged)
all.POS.tagged
, . CC DT EX IN JJ NN NNP NNS POS RB VBG VBN VBP VBZ
6 6 7 8 1 14 11 26 11 15 1 4 2 5 3 7
[1] 127
# Flag the tags that contain a punctuation character, then count them.
my.PUNCT <- str_detect(string = all.POS.tagged, pattern = "[[:punct:]]")
sum(my.PUNCT)
[1] 12
# Flag the tags that end in "NN", then count them.
my.NN <- str_detect(string = all.POS.tagged, pattern = "NN$")
sum(my.NN)
[1] 26
# Flag every tag that contains "NN" (NN, NNS, NNP, NNPS), then count them.
my.NNs <- str_detect(all.POS.tagged, "NN")
# Count statement restored: it had been fused into the trailing comment.
sum(my.NNs)
[1] 52
# POS-tagging helper.
#
# Runs the sentence, word and POS annotators on `mytext` in that order and
# returns the POS tag of every word token.
#
# Args:
#   mytext: the text to tag; passed unchanged to annotate().
#
# Returns:
#   The flattened (unlist()) features of the word annotations: one tag per
#   word token, each element named "POS".
my.POStag.func <- function(mytext) {
  sent.annotate <- annotate(mytext, Maxent_Sent_Token_Annotator())
  word.annotate <- annotate(
    mytext, Maxent_Word_Token_Annotator(), sent.annotate
  )
  POStag <- annotate(mytext, Maxent_POS_Tag_Annotator(), word.annotate)
  # Pick the word annotations by type instead of by position: a positional
  # range `(1 + n_sentences):n_total` counts down when there are no word
  # tokens and silently depends on sentences being listed first.
  word.tags <- POStag[POStag$type == "word"]
  unlist(word.tags$features)
}
# Four short example documents, two sentences each.
mytext <- c(
  "The sky is blue. Therefore, I feel happy.",
  "The sun is bright today. My feeling is good.",
  "The sun in the sky is bright. The sky is blue.",
  "We can see the shining sun, the bright sun. The weather is nice today."
)
# VectorSource() interprets every element of the vector as one document.
corpus <- VCorpus(x = VectorSource(x = mytext))
# POS tags of the first document in the corpus.
mypaper1.POStag <- my.POStag.func(mytext = corpus[[1]]$content)
mypaper1.POStag
POS POS POS POS POS POS POS POS POS POS POS
"DT" "NN" "VBZ" "JJ" "." "RB" "," "PRP" "VBP" "JJ" "."
# Number of tags containing "NN" (noun tags) in the first document.
sum(str_detect(string = mypaper1.POStag, pattern = "NN"))
[1] 1
# Proportion of noun tags (tags containing "NN") in each corpus document.
N_corpus <- length(corpus) # number of documents in the corpus
compare.noun <- rep(NA_real_, N_corpus) # preallocated result vector
for (i in seq_len(N_corpus)) {
  # Tag each document once and reuse the result for both counts.
  doc.tags <- my.POStag.func(corpus[[i]]$content)
  my.NN <- sum(str_detect(doc.tags, "NN")) # tags containing "NN"
  all.POS <- length(doc.tags) # total number of tags
  compare.noun[i] <- my.NN / all.POS # proportion for document i
}
round(compare.noun, 2)
[1] 0.09 0.27 0.23 0.24
# Documents with the lowest and the highest proportion of noun tags.
prop.noun <- data.frame(
  abstract.no = seq_len(N_corpus),
  prop.noun = compare.noun
)
# Sorting is ascending, so the first row is the minimum and the last row the
# maximum.
head(prop.noun[order(prop.noun$prop.noun), ], 1)
tail(prop.noun[order(prop.noun$prop.noun), ], 1)
abstract.no prop.noun
1 1 0.09090909
abstract.no prop.noun
2 2 0.2727273
Text and figures are licensed under Creative Commons Attribution CC BY 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".