Crawling :: example (clien.net)

Published by onesixx on

https://bluediary8.tistory.com/104

url > 규칙찾기 > 정제

# clien> 모두의 공원
# https://www.clien.net/service/board/park
# https://www.clien.net/service/board/park?&od=T31&po=1
# https://www.clien.net/service/board/park?&od=T31&po=2

###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 1. 제목, 게시글Link 찾기 ----
###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 
# A sample of one board-list row's HTML was shown here; its tags were stripped
# when the post was extracted and only the sample title survived
# ("Looks like Messi is transferring").
# NOTE(review): the line offsets used below (+1 for the title line, -2 for the
# link line, both relative to each "subject_fixed" line) come from that markup;
# confirm them against the live page source.

# Collect the titles and post links of the first 10 list pages (po = 0..9).
# Each page is stored in a preallocated list and bound once at the end instead
# of growing a matrix with rbind() inside the loop.
page_tables <- vector("list", 10L)
for (i in seq_len(10L)) {
  boardURL <- str_c(
    "https://www.clien.net/service/board/park?&od=T31&po=", i - 1
  )
  viewPageSrc <- readLines(boardURL, encoding = "UTF-8")

  # Lines carrying the "subject_fixed" marker anchor both the title line and
  # the link line. Drop anchors whose offsets would fall outside the page.
  subject_idx <- str_which(viewPageSrc, "subject_fixed")
  subject_idx <- subject_idx[
    subject_idx > 2L & subject_idx < length(viewPageSrc)
  ]

  # The title text sits on the line after the marker; strip the tab padding.
  titles <- str_replace_all(viewPageSrc[subject_idx + 1L], "\\t", "")

  # The link sits two lines before the marker: keep what lies between "href"
  # and "data", then trim the two leading and two trailing characters
  # (presumably `="` and `" `; verify against the page source).
  title_urls <- str_extract(viewPageSrc[subject_idx - 2L], "(?<=href).*(?=data)")
  title_urls <- str_sub(title_urls, 3, -3)
  title_urls <- str_c("https://www.clien.net", title_urls)

  page_tables[[i]] <- data.table::data.table(
    titles = titles,
    title_urls = title_urls
  )

  # Progress indicator. The original escape in this string was invalid R;
  # a newline is assumed to be the intent.
  cat("\n", i)
}

# Section 2 uses data.table syntax on pageData, so build a data.table here
# rather than a character matrix.
pageData <- data.table::rbindlist(page_tables)

# Optional on-disk cache of the list pages:
# write.csv(pageData, "pageData.csv", row.names = FALSE)
# pageData <- fread("pageData.csv")

###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 2. Collect post contents ----
###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# \t #
# Download one post page and return its body text as a single string.
#
# titleURLs: URL of one post (a length-one character vector).
# Returns the cleaned body lines joined by newlines, or NA_character_ when the
# page cannot be read, the body markers are missing, or nothing is left after
# cleaning. The result is always length one so it can fill a column row by row.
fetch_post_content <- function(titleURLs) {
  tryCatch(
    {
      contents <- readLines(titleURLs, encoding = "UTF-8")

      # NOTE(review): the original marker pattern was lost (it was an empty
      # string, which stringr cannot search for). The range below follows the
      # alternative the author left commented out: from the first
      # "post_content" line to the first "post_ccls" line. Confirm against the
      # live page source.
      body_start <- str_which(contents, "post_content")[1]
      body_end <- str_which(contents, "post_ccls")[1]

      if (is.na(body_start) || is.na(body_end) || body_end < body_start) {
        NA_character_
      } else {
        contents <- contents[body_start:body_end]
        # Remove tags, tabs and the blank character in the last alternative.
        # NOTE(review): that last alternative may originally have been an
        # HTML entity such as &nbsp; -- confirm.
        contents <- str_replace_all(contents, "<.*?>|\\t| ", "")
        contents <- str_trim(contents)
        contents <- contents[nchar(contents) > 0] # remove blank lines
        if (length(contents) == 0L) {
          NA_character_
        } else {
          # The original separator escape was invalid R; newline is assumed.
          str_c(contents, collapse = "\n")
        }
      }
    },
    error = function(e) {
      # Report the failure instead of silently dropping the row, and return a
      # placeholder so the result stays aligned with pageData.
      warning(
        "Failed to collect ", titleURLs, ": ", conditionMessage(e),
        call. = FALSE
      )
      NA_character_
    }
  )
}

# The `:=` assignment below needs a data.table; coerce in case pageData
# arrived as a matrix or data.frame.
pageData <- data.table::as.data.table(pageData)

# One length-one result per post URL, in row order.
final_content <- lapply(pageData[["title_urls"]], fetch_post_content)
pageData[, contents := unlist(final_content, use.names = FALSE)]

write.csv(pageData, "pageData.csv", row.names = FALSE)
Categories: quant

onesixx

Blog Owner

Subscribe
Notify of
guest

0 Comments
Inline Feedbacks
View all comments
0
Would love your thoughts, please comment.x
()
x