R/get_site.R

Defines functions get_site

Documented in get_site

#' Get site html.
#'
#' Requests `x` with [httr::GET()] (60 second timeout), parses the response
#' with [xml2::read_html()] and pulls the tag name and text of every
#' `p`, `h1`, `h2` and `h3` node out of the page.
#'
#' @name get_site
#' @param x URL(s) to fetch. The value is handed unchanged to a single
#'   `httr::GET()` call and is also used for the `url` column of the result.
#'   NOTE(review): `httr::GET()` appears to expect one URL per call -- confirm
#'   before passing a longer vector.
#' @return A data frame with columns `url`, `type` and `text`:
#'   * one row per matched node, `type` holding the tag name and `text` the
#'     node text;
#'   * a single row with `type` and `text` both `'no dice'` when no node
#'     matches or when any extracted text is not valid UTF-8;
#'   * `type` and `text` both `''` when the request or the parsing raised an
#'     error.
#'
#'
#' @export
#' @rdname get_site
#'
get_site <- function(x) {

  # NULL is the failure sentinel: any error raised while fetching or parsing
  # is swallowed on purpose so that one bad URL yields a placeholder row
  # instead of aborting the caller.
  site <- tryCatch(
    xml2::read_html(httr::GET(x, httr::timeout(60))),
    error = function(e) NULL)

  if (is.null(site)) {
    # Returned as a plain value (not via an assignment) so the result is
    # visible, exactly like the success path below.
    return(data.frame(url = x, type = '', text = ''))
  }

  nodes <- rvest::html_nodes(site, 'p,h1,h2,h3')

  if (length(nodes) == 0) {
    node_type <- character(0)
    node_text <- character(0)
  } else {
    node_type <- rvest::html_name(nodes)
    node_text <- rvest::html_text(nodes)
  }

  # Nothing usable: either no node matched, or at least one text is not valid
  # UTF-8, in which case the whole page is discarded.
  if (length(nodes) == 0 || any(!validUTF8(node_text))) {
    node_type <- 'no dice'
    node_text <- 'no dice'
  }

  data.frame(url = x,
             type = node_type,
             text = node_text)
}


# get_meta <- function(x, url = url) {
#
#   feats <- rvest::html_nodes(x, 'meta')
#   feats <- rvest::html_attrs(feats)
#
#   feats <- Filter(function(x) length(x) != 1 , feats)
#
#   feats0 <- lapply(feats, function(x){
#
#     if(length(x) > 2) {
#       x <- tail(x, 2)
#     }
#
#     names(x) <-  c('content', 'value')
#     x1 <- data.frame(x)
#     x1$type <- rownames(x1)
#     return(x1)})
#
#   feats1 <- data.table::rbindlist(feats0, idcol = 'id')
#   feats1$doc_id <- url
#   feats2 <- data.table::dcast(feats1, doc_id + id~type, value.var = 'x')
#   feats2
# }
jaytimm/quicknews documentation built on Aug. 23, 2023, 12:09 a.m.