R/qnews_extract_article.R

Defines functions qnews_extract_article

Documented in qnews_extract_article

#' Extract article content from online news sources.
#'
#' The URLs in `x` are split into batches of up to 20 and the batches are
#' processed on a parallel cluster. For each URL the page is retrieved with
#' `get_site()` and annotated with `annotate_site()`; rows whose `discard`
#' value is `"keep"` are retained and their `text` is pasted together
#' (space-separated) per `url` / `h1_title`.
#'
#' @name qnews_extract_article
#' @param x A vector of URLs
#' @param cores An integer value specifying n threads; passed to
#'   `parallel::makeCluster()`.
#' @return A data frame (a `data.table`) with columns `url`, `h1_title` and
#'   `text`. An empty `x` yields an empty table.
#'
#'
#' @export
#' @rdname qnews_extract_article
#'
#'
qnews_extract_article <- function(x,
                                  cores) {

  # Nothing to fetch: return the same empty table the pipeline below would
  # end with, without paying for cluster start-up.
  if (length(x) == 0L) {
    return(data.table::rbindlist(list()))
  }

  # Groups of (at most) 20 URLs; each group is one unit of parallel work.
  batches <- split(x, ceiling(seq_along(x) / 20))

  # Builds one table for a batch of URLs: one row per url / h1_title with
  # the kept text collapsed into a single string.
  build_table <- function(url0) {

    x0 <- lapply(url0, function(q) {
      y0 <- get_site(q)
      y1 <- annotate_site(site = y0)

      y2 <- subset(y1, y1$discard == "keep")
      data.table::setDT(y2)
      y2[, list(text = paste(text, collapse = " ")),
         by = list(url, h1_title)]
    })

    data.table::rbindlist(x0)
  }

  clust <- parallel::makeCluster(cores)
  # Register the shutdown immediately so the workers are stopped even when
  # a batch raises an error and the function exits early.
  on.exit(parallel::stopCluster(clust), add = TRUE)

  # The batches reach the workers as `X`, so no separate export is needed.
  docs <- pbapply::pblapply(cl = clust,
                            X = batches,
                            FUN = build_table)

  data.table::rbindlist(docs)
}
jaytimm/quicknews documentation built on Aug. 23, 2023, 12:09 a.m.