Various Applications through Crawling the Lottery Winning Number
로또 번호 웹 크롤링 방법을 공부해서 정리한 코드
pacman::p_load("XML", "dplyr", "reshape2",
"ggplot2", "gridExtra",
"forcats") # For fct_infreq
# Crawling ----------------------------------------------------------------
address <- "https://dhlottery.co.kr/gameResult.do?method=allWinExel&gubun=byWin&nowPage=&drwNoStart=1&drwNoEnd=9999" # F12를 통해 볼 수 있음.
download.file(address, destfile = "test.xls") # 주소 데이터를 xls로 저장.NASys.setlocale(category = "LC_ALL", locale = "us") # 영어로 언어설정설정
[1] "LC_COLLATE=English_United States.1252;LC_CTYPE=English_United States.1252;LC_MONETARY=English_United States.1252;LC_NUMERIC=C;LC_TIME=English_United States.1252"
tables <- XML::readHTMLTable("test.xls",skip.rows= 1)
history_lotto <- tables[[2]]
Sys.setlocale("LC_ALL", "korean")
[1] "LC_COLLATE=Korean_Korea.949;LC_CTYPE=Korean_Korea.949;LC_MONETARY=Korean_Korea.949;LC_NUMERIC=C;LC_TIME=Korean_Korea.949"
NA.num <- history_lotto[!is.na(history_lotto$V20), ] %>% # 마지막 행이 NA가 아닌 행 찾기 => V1이 년도로 인해 밀린 행들NA.[,2:20] # 첫번째 행 삭제
names(NA.num) <- paste0("V", 1:19) # For bind_rows
# 분석에 쓰일 최종 데이터
history_lotto1 <- history_lotto[is.na(history_lotto$V20), 1:19] %>%
bind_rows(NA.num) %>%
arrange(as.numeric(V1)) %>%
mutate_at(vars(V13, V14, V15, V16, V17, V18, V19), as.numeric) # Convert Factor
names(history_lotto1)[13:19] <- c("One", "Two", "Three", "Four", "Five", "Six", "Bonus")
# str(history_lotto1)
# Year 변수 추가가
history_lotto1$year <- history_lotto1$V2 %>%
as.Date(format = "%Y.%m.%d") %>%
lubridate::year()
# Month 변수 추가가
history_lotto1$month <- history_lotto1$V2 %>%
as.Date(format = "%Y.%m.%d") %>%
lubridate::month()
winning_number_mat <- history_lotto1[,c(1:2, 13:18)] %>%
arrange(desc(as.numeric(V1)))
names(winning_number_mat)[1:2] <- c("Episode ", "Date")
the_latest_number<-function(x=5){
winning_number_mat[1:x,]
}
the_latest_number(x=10)
Episode Date One Two Three Four Five Six
1 995 2021.12.25 1 31 4 20 28 22
2 943 2020.12.26 1 35 4 27 34 28
3 891 2019.12.28 31 5 19 22 29 24
4 839 2018.12.29 23 36 2 3 3 2
5 787 2017.12.30 26 33 4 7 17 11
6 735 2016.12.31 26 2 4 18 27 24
7 682 2015.12.26 10 16 18 26 28 26
8 630 2014.12.27 30 9 12 15 17 14
9 578 2013.12.28 26 4 5 23 24 25
10 56 2003.12.27 2 6 22 22 23 20
# 보너스를 제외한 모든 번호번호
Num.all <- history_lotto1[,c(1, 13:18, 20:21)] %>%
reshape2::melt(id.var = c("V1", "year", "month"),
variable.name="Location",
value.name="Number")
# Bonus 번호
Bonus <- history_lotto1[,c(1, 19, 20:21)]
# 시각화화
# options(repr.plot.width = 1, repr.plot.height = 0.75)
ggplot(data.frame(Num.all), aes(fct_infreq(as.factor(Number)))) + # fct_infreq : Ordering
geom_bar(fill="#6666FF",color="black") +
labs(x="Number", y="Count", title = "Frequency of Number") +
geom_text(stat='count', aes(label=..count..), vjust=-1) + # Label
theme_bw()
## 연도별연도별
ggplot(data.frame(Num.all), aes(as.factor(Number))) +
geom_bar(fill="#6666FF",color="black") +
facet_wrap(~year, ncol = 2) +
labs(x="Number", y="Count", title = "Frequency of Number") +
geom_text(stat='count', aes(label=..count..), vjust=-1) + # Label
theme_bw()
## 월별별
ggplot(data.frame(Num.all), aes(as.factor(Number))) +
geom_bar(fill="#6666FF",color="black") +
facet_wrap(~month, ncol = 2) +
labs(x="Number", y="Count", title = "Frequency of Number") +
geom_text(stat='count', aes(label=..count..), vjust=-1) + # Label
theme_bw()
# For Bonus
ggplot(data.frame(Bonus), aes(fct_infreq(as.factor(Bonus)))) + # fct_infreq : Ordering
geom_bar(fill="#FFCC66",color="black") +
labs(x="Number", y="Count", title = "Frequency of Bonus Number") +
geom_text(stat='count', aes(label=..count..), vjust=-1) + # Label
theme_bw()
## 연도별연도별
ggplot(data.frame(Bonus), aes(fct_infreq(as.factor(Bonus)))) + # fct_infreq : Ordering
geom_bar(fill="#FFCC66",color="black") +
facet_wrap(~year, ncol = 2) +
labs(x="Number", y="Count", title = "Frequency of Bonus Number") +
geom_text(stat='count', aes(label=..count..), vjust=-1) + # Label
theme_bw()
ggplot(data.frame(Bonus), aes(fct_infreq(as.factor(Bonus)))) + # fct_infreq : Ordering
geom_bar(fill="#FFCC66",color="black") +
facet_wrap(~month, ncol = 2) +
labs(x="Number", y="Count", title = "Frequency of Bonus Number") +
geom_text(stat='count', aes(label=..count..), vjust=-1) + # Label
theme_bw()
Recent.episode <- max(as.numeric(history_lotto1$V1)) # 최신 회차NARecent.episode.20 <- seq(Recent.episode, Recent.episode-20) # 최신 회차에서 20전 회차NA# 보너스를 제외한 모든 번호번호
Num.all.20 <- Num.all %>%
filter(as.numeric(V1) >= min(Recent.episode.20))
# Bonus 번호
Bonus.20 <- Bonus %>%
filter(as.numeric(V1) >= min(Recent.episode.20))
# 시각화화
ggplot(data.frame(Num.all.20), aes(fct_infreq(as.factor(Number)))) + # fct_infreq : Ordering
geom_bar(fill="#6666FF",color="black") +
labs(x="Number", y="Count", title = "Frequency of Number") +
geom_text(stat='count', aes(label=..count..), vjust=-1) + # Label
theme_bw()
# For Bonus
ggplot(data.frame(Bonus.20), aes(fct_infreq(as.factor(Bonus)))) + # fct_infreq : Ordering
geom_bar(fill="#FFCC66",color="black") +
labs(x="Number", y="Count", title = "Frequency of Bonus Number") +
geom_text(stat='count', aes(label=..count..), vjust=-1) + # Label
theme_bw()
frequency <- table(Num.all$Number) %>%
data.frame()
lotto11 <- function(data, n=3){
for(i in 1:n){
p = matrix(0)
sumData = sum(data$Freq)
p = data$Freq/sumData # 빈도 가중치치
x = runif(45,0,1) # 랜덤 가중치치
p = p*x # 두 가중치를 곱함
sortP <- sort(p, decreasing=TRUE) # 정렬
cat("당첨번호: ")
for(m in 1:7){
for(j in 1:length(p)){
if(sortP[m]==p[j]){ cat(j," "); break }
}
}
cat("\n")
}
}
lotto11(data = frequency, n = 5)
당첨번호: 28 26 23 17 1 13 44
당첨번호: 23 26 25 1 40 29 5
당첨번호: 25 13 18 20 26 1 23
당첨번호: 25 26 13 27 10 22 20
당첨번호: 13 25 28 24 26 5 29
# For Bonus
frequency.Bonus <- table(Bonus$Bonus) %>%
data.frame()
lotto11.Bonus <- function(data, n=3){
for(i in 1:n){
p = matrix(0)
sumData = sum(data$Freq)
p = data$Freq/sumData # 빈도 가중치치
x = runif(45,0,1) # 랜덤 가중치치
p = p*x # 두 가중치를 곱함
sortP <- sort(p, decreasing=TRUE) # 정렬
cat("당첨번호: ")
for(m in 1:1){
for(j in 1:length(p)){
if(sortP[m]==p[j]){ cat(j," "); break }
}
}
cat("\n")
}
}
lotto11.Bonus(frequency.Bonus, n=5)
당첨번호: 38
당첨번호: 19
당첨번호: 26
당첨번호: 26
당첨번호: 20
Text and figures are licensed under Creative Commons Attribution CC BY 4.0. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".