R/getContent.R

Defines functions getSection getOriginalUrl getContentBody getContentPress getContentEditDatetime getContentDatetime getContentTitle getContent

Documented in getContent

#' Get Content
#'
#' Get naver news content from links.
#'
#' @param turl is naver news link.
#' @param col is what you want to get from news. Defualt is all.
#' @return a [tibble][tibble::tibble-package]
#' @export
#' @importFrom httr2 request req_user_agent req_method req_perform resp_body_html
#' @importFrom rvest html_nodes html_text html_attr
#' @examples
#' \dontrun{
#'   getContent("https://n.news.naver.com/mnews/article/214/0001195110?sid=103")
#'   }

getContent <-
  function(turl,
           col = c("url",
                   "original_url",
                   "section",
                   "datetime",
                   "edittime",
                   "press",
                   "title",
                   "body",
                   "value")) {

    httr2::request(turl) %>%
      httr2::req_user_agent("N2H4 by chanyub.park <mrchypark@gmail.com>") %>%
      httr2::req_method("GET") %>%
      httr2::req_perform() -> root

    html_obj <- httr2::resp_body_html(root)
    urlcheck <- root$url
    value <- T
    if (identical(grep("^https?://n.news.naver.com",
                       urlcheck),
                  integer(0))) {
      original_url <- "page is not news section."
      title <- "page is not news section."
      datetime <- "page is not news section."
      edittime <- "page is not news section."
      press <- "page is not news section."
      body <- "page is not news section."
      section <- "page is not news section."
      value <- F
    } else {
      # TODO: 이거 동작하는지 확인해야 함.
      chk <- rvest::html_nodes(html_obj, "div#main_content div div")
      chk <- rvest::html_attr(chk, "class")
      chk <- chk[1]
      if (is.na(chk)) {
        chk <- "not error"
      }
      if ("error_msg 404" == chk & value) {
        original_url <- "page is moved."
        title <- "page is moved."
        datetime <- "page is moved."
        edittime <- "page is moved."
        press <- "page is moved."
        body <- "page is moved."
        section <- "page is moved."
        value <- F
      }
    }
    if (value) {
      original_url <- getOriginalUrl(html_obj)
      title <- getContentTitle(html_obj)
      datetime <- getContentDatetime(html_obj)
      edittime <- getContentEditDatetime(html_obj)
      press <- getContentPress(html_obj)
      body <- getContentBody(html_obj)
      section <- getSection(turl)
    }

    if (length(edittime) == 0) {
      edittime <- NA
    }
    newsInfo <- tibble::tibble(
      url = turl,
      original_url = original_url,
      datetime = datetime,
      edittime = edittime,
      press = press,
      title = title,
      body = body,
      section = section,
      value = value
    )
    return(newsInfo[, col])
  }

getContentTitle <-
  function(html_obj,
           title_node_info = "h2.media_end_head_headline",
           title_attr = "") {
    node <- rvest::html_nodes(html_obj, title_node_info)
    title <- rvest::html_text(node)
    if (title_attr != "") {
      title <- rvest::html_attr(node, title_attr)
    }
    return(title)
  }


getContentDatetime <-
  function(html_obj,
           datetime_node_info = "span._ARTICLE_DATE_TIME",
           datetime_attr = "data-date-time") {
    node <- rvest::html_nodes(html_obj, datetime_node_info)
    datetime <- rvest::html_text(node)
    if (datetime_attr != "") {
      datetime <- rvest::html_attr(node, datetime_attr)
    }
    as.POSIXct(datetime, tz = "Asia/Seoul")
  }

getContentEditDatetime <-
  function(html_obj,
           datetime_node_info = "span._ARTICLE_MODIFY_DATE_TIME",
           datetime_attr = "data-modify-date-time") {
    node <- rvest::html_nodes(html_obj, datetime_node_info)
    datetime <- rvest::html_text(node)
    if (datetime_attr != "") {
      datetime <- rvest::html_attr(node, datetime_attr)
    }
    as.POSIXct(datetime, tz = "Asia/Seoul")
  }

getContentPress <-
  function(html_obj,
           press_node_info = "div.media_end_head_top a img",
           press_attr = "title") {
    node <- rvest::html_nodes(html_obj, press_node_info)
    press <- rvest::html_text(node)
    if (press_attr != "") {
      press <- rvest::html_attr(node, press_attr)
    }
    return(press[1])
  }

getContentBody <-
  function(html_obj,
           body_node_info = "div#dic_area",
           body_attr = "") {
    node <- rvest::html_nodes(html_obj, body_node_info)
    body <- rvest::html_text(node)
    if (body_attr != "") {
      body <- rvest::html_attr(node, body_attr)
    }
    body <- trimws(gsub("\r?\n|\r|\t|\n", " ", body))
    return(body)
  }

getOriginalUrl <-   function(html_obj,
                             origin_url_node_info = "a.media_end_head_origin_link",
                             origin_url_attr = "href") {
  node <- rvest::html_nodes(html_obj, origin_url_node_info)
  body <- rvest::html_text(node)
  if (origin_url_attr != "") {
    body <- rvest::html_attr(node, origin_url_attr)
  }
  body <- trimws(gsub("\r?\n|\r|\t|\n", " ", body))
  return(body)
}

#' @importFrom httr2 url_parse
getSection <- function(turl) {
  if (is.null(httr2::url_parse(turl)$query$sid)) {
    return(NA)
  }
  return(httr2::url_parse(turl)$query$sid)
}

Try the N2H4 package in your browser

Any scripts or data that you put into this service are public.

N2H4 documentation built on Sept. 15, 2022, 5:14 p.m.