# Crawling :: example (clien.net)
# Published by onesixx on 20-08-07
# Reference: https://bluediary8.tistory.com/104
# Workflow: url > find the pattern > clean up

# clien > "Park for Everyone" board
# https://www.clien.net/service/board/park
# https://www.clien.net/service/board/park?&od=T31&po=1
# https://www.clien.net/service/board/park?&od=T31&po=2

###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 1. Find the titles and post links ----
###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Sample markup of one list entry (title text translated):
# <div class="list_title " data-role="list-title" data-toggle-custom="dropdown">
#   <a class="list_subject" href="/service/board/park/15313520?od=T31&po=0&category=&groupCd=" data-role="cut-string">
#
#     <span class="subject_fixed" data-role="list-title-text" title="Looks like Messi is transferring">
#       Looks like Messi is transferring
#     </span>
#   </a>

# Number of board pages to crawl; the `po` query parameter is zero-based.
n_pages <- 10L
board_base_url <- "https://www.clien.net/service/board/park?&od=T31&po="

# Build one two-column character matrix per page, then bind them once at the
# end instead of growing `pageData` with rbind() on every iteration.
page_tables <- lapply(seq_len(n_pages), function(i) {
  # i <- 1  # handy for stepping through a single page by hand
  boardURL <- str_c(board_base_url, i - 1)
  viewPageSrc <- readLines(boardURL, encoding = "UTF-8")

  # Every list entry is located through the line holding its "subject_fixed"
  # <span>; the title and the link are read at fixed offsets from that line.
  subject_rows <- str_which(viewPageSrc, "subject_fixed")

  # The title text sits on the line right after the <span> line.
  # viewPageSrc[str_detect(viewPageSrc, "subject_fixed")]
  # titles %>% str_replace_all("<.*?>", "")  # remove tags
  titles <- viewPageSrc[subject_rows + 1] %>%
    str_replace_all("\t", "")

  # The <a href=...> line is read two lines above the <span> line. The
  # extraction keeps what lies between `href` and `data`; per the sample markup
  # above, str_sub(3, -3) then drops the leading `="` and the trailing `" `.
  title_urls <- viewPageSrc[subject_rows - 2] %>%
    str_extract("(?<=href).*(?=data)") %>%
    str_sub(3, -3)
  title_urls <- str_c("https://www.clien.net", title_urls)

  cat("\n", i) # progress indicator

  cbind(titles = titles, title_urls = title_urls)
})

# Character matrix with columns `titles` and `title_urls`.
pageData <- do.call(rbind, page_tables)

# pageData %>% write.csv("pageData.csv", row.names = FALSE)
# pageData <- fread("pageData.csv")

###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 2.
# Collect post content ----
###### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Sample markup of a post page (text translated):
# <div class="post_content">
#   <article>
#     <div class="post_article">
#       <html>
#       <body>
#         <p><span class="outlink"><a class="url" href="https://twitter.com/Catradioesports/status/1298317811610267648?s=20" target="_blank">https://twitter.com/Catradioesports/status/1298317811610267648?s=20</a></span></p>
#         <p><br></p>
#         <p><br></p>
#         <p><br></p>
#         <p>At this point it looks almost certain. </p>
#         <p><br></p>
#         <p><br></p>
#         <p>Oh my goodness </p>
#       </body>
#     </div>
#   </article>
# <div class="post_ccls">

# `pageData` is a plain character matrix when it comes straight from
# cbind()/rbind(); column access by name and the `:=` assignment below need a
# data.table. The conversion is a no-op if it was already read back via fread().
pageData <- data.table::as.data.table(pageData)

# Fetch every post and reduce it to plain text. Each element is exactly one
# string (NA_character_ when the post could not be fetched or parsed), so the
# result always lines up with the rows of `pageData`.
final_content <- lapply(pageData[["title_urls"]], function(titleURLs) {
  # titleURLs <- pageData[["title_urls"]][15]  # handy for debugging one post
  tryCatch(
    {
      contents <- readLines(titleURLs, encoding = "UTF-8")

      # Keep the span from the second "<body" match to the first "/body>"
      # match. If either match is missing the index is NA, `:` raises an
      # error, and the handler below turns it into NA_character_.
      # contents[str_which(contents, "post_content"):str_which(contents, "post_ccls")]
      body_open <- str_which(contents, "<body")[2]
      body_close <- str_which(contents, "/body>")[1]
      contents <- contents[body_open:body_close]

      # Strip tags, tabs and non-breaking spaces. The original pattern's last
      # alternative was a bare space, which removed every space from the text;
      # presumably it was an `&nbsp;` entity that got rendered when the snippet
      # was published (TODO confirm against the live markup).
      contents <- contents %>%
        str_replace_all("<.*?>|\t|&nbsp;|\u00a0", "")

      # Trim and drop the lines that end up blank.
      contents <- str_trim(contents)
      contents <- contents[nchar(contents) > 0]

      # paste() always yields a single string, even for an empty vector.
      paste(contents, collapse = "\n")
    },
    error = function(e) {
      # Best effort: report the failure and keep going with a missing value.
      warning(
        "Failed to collect ", titleURLs, ": ", conditionMessage(e),
        call. = FALSE
      )
      NA_character_
    }
  )
})

pageData[, contents := unlist(final_content)]

pageData %>% write.csv("pageData.csv", row.names = FALSE)

# Blog footer: Categories: quant | Blog owner: onesixx | 0 comments