R/getContent.R

#' Get Content
#'
#' Get Naver news content from a link.
#'
#' The page is downloaded once. If the URL does not respond, or the first
#' element matched by "div#main_content div div" has the class
#' "error_msg 404", every field except url is filled with "page is moved.".
#' If url is character(0), "no news links" is printed and every field is
#' filled with "no news links".
#'
#' @param url A Naver news link, or character(0) when there is no link.
#'   The default refers to the argument itself, so a value must always be
#'   supplied by the caller.
#' @param col Names of the columns to return. Available columns are
#'   "url", "datetime", "edittime", "press", "title" and "content".
#'   Default is every column except "edittime".
#' @return The columns named in col, taken from a
#'   data.frame(url, datetime, edittime, press, title, content). The result
#'   is subset with [, col], so asking for a single column returns a plain
#'   vector instead of a data.frame.
#' @export
#' @import RCurl
#' @import xml2
#' @import rvest
#' @import stringr

getContent <- function(url = url, col = c("url", "datetime", "press", "title", "content")) {

  # No link at all: report it and return a placeholder row.
  if (identical(url, character(0))) {
    print("no news links")
    placeholder <- "no news links"
    newsInfo <- data.frame(
      url = placeholder, datetime = placeholder, edittime = placeholder,
      press = placeholder, title = placeholder, content = placeholder,
      stringsAsFactors = FALSE
    )
    return(newsInfo[, col])
  }

  # Download the page only when the URL responds, and download it only once.
  # The previous `&` evaluated read_html() even for unreachable URLs.
  tem <- NULL
  if (RCurl::url.exists(url)) {
    tem <- read_html(url)
    first_class <- (tem %>%
      html_nodes("div#main_content div div") %>%
      html_attr("class"))[1]
    # identical() yields FALSE (not NA) when no node / no class attribute is
    # found, so a missing class can no longer break the `if` condition.
    if (identical(first_class, "error_msg 404")) {
      tem <- NULL
    }
  }

  # Unreachable URL or 404 error page.
  if (is.null(tem)) {
    moved <- "page is moved."
    newsInfo <- data.frame(
      url = url, datetime = moved, edittime = moved,
      press = moved, title = moved, content = moved,
      stringsAsFactors = FALSE
    )
    return(newsInfo[, col])
  }

  title <- tem %>% html_nodes("div.article_info h3") %>% html_text()
  Encoding(title) <- "UTF-8"

  # "span.t11" holds the publication time, optionally followed by the edit
  # time. POSIXct is used directly because data.frame() stores date-times
  # as POSIXct anyway.
  stamps <- tem %>% html_nodes("span.t11") %>% html_text()
  stamps <- as.POSIXct(stamps)
  # Always define both values: a single stamp doubles as the edit time, and
  # an empty match gives NA rather than an undefined `edittime`.
  datetime <- stamps[1]
  edittime <- if (length(stamps) >= 2) stamps[2] else stamps[1]

  press <- tem %>%
    html_nodes("div.article_header div a img") %>%
    html_attr("title")
  Encoding(press) <- "UTF-8"

  raw_content <- tem %>% html_nodes("div#articleBodyContents")
  # NOTE(review): the XPath starts with `//`, which selects from the document
  # root rather than from raw_content; confirm whether `.//` was intended.
  # Each matched text node becomes its own element of `content`, so the
  # resulting data.frame can have more than one row.
  content <- raw_content %>%
    html_nodes(xpath = '//text()[(following::br)] | //text()[(preceding::br)]') %>%
    html_text()
  Encoding(content) <- "UTF-8"
  content <- str_trim(content, side = "both")
  # Flatten any remaining line breaks inside a fragment into spaces.
  content <- gsub("\r?\n|\r", " ", content)

  newsInfo <- data.frame(
    url = url, datetime = datetime, edittime = edittime,
    press = press, title = title, content = content,
    stringsAsFactors = FALSE
  )

  newsInfo[, col]
}
kwanhong66/n2h4_modified documentation built on May 20, 2019, 7:07 p.m.