R/rss.R

Defines functions getrss

Documented in getrss

#' Fetch rss into tibble
#' @param feed feed address
#' @return tibble object
#' @seealso getpubmed
#' @examples \dontrun{
#' library(scifetch)
#' feed <- 'yihui.name/index.xml'
#' z <- getrss(feed)
#' }
#' @export
getrss <- function(feed){
        # This function is modified from tidyRSS and credits should belong to the author of tidyRSS package
        formats <- c("a d b Y H:M:S z", "a, d b Y H:M z",
                     "Y-m-d H:M:S z", "d b Y H:M:S",
                     "d b Y H:M:S z", "a b d H:M:S z Y",
                     "a b dH:M:S Y", "Y-m-d")
        json_parse <- function(feed){

                res <- jsonlite::fromJSON(feed)

                items <- res$items

                results <- tibble::tibble(
                        title = items$title,
                        date = lubridate::parse_date_time(items$date_published, orders = formats) %>%
                                as.character() %>% substr(1,10),
                        linkTitle = items$url,
                        source = res$title,
                        description = items$content_html
                )

                return(results)
        }

        rss_parse <- function(doc){

                channel <- xml2::xml_find_all(doc, "channel")

                if(identical(length(channel), 0L)){
                        ns <- xml2::xml_ns_rename(xml2::xml_ns(doc), d1 = "rss")
                        channel <- xml2::xml_find_all(doc, "rss:channel", ns = ns)
                        site <- xml2::xml_find_all(doc, "rss:item", ns = ns)

                        res <- suppressWarnings({tibble::tibble(
                                title = xml2::xml_text(xml2::xml_find_all(site, "rss:title", ns = ns)),
                                date = xml2::xml_text(xml2::xml_find_first(site, "rss:pubDate", ns = ns)) %>%
                                        lubridate::parse_date_time(orders = formats) %>%
                                        as.character() %>% substr(1,10),
                                linkTitle = xml2::xml_text(xml2::xml_find_all(site, "rss:link", ns = ns)),
                                source = xml2::xml_text(xml2::xml_find_first(channel, "rss:title", ns = ns)),
                                description = xml2::xml_text(xml2::xml_find_first(site, "rss:description", ns = ns))
                        )})
                } else{

                        site <- xml2::xml_find_all(channel, "item")

                        res <- suppressWarnings({tibble::tibble(
                                title = xml2::xml_text(xml2::xml_find_first(site, "title")),
                                date = xml2::xml_text(xml2::xml_find_first(site, "pubDate")) %>%
                                        lubridate::parse_date_time(orders = formats) %>%
                                        as.character() %>% substr(1,10),
                                linkTitle = xml2::xml_text(xml2::xml_find_first(site, "link")),
                                source = xml2::xml_text(xml2::xml_find_first(channel, "title")),
                                description = xml2::xml_text(xml2::xml_find_first(site, "description"))
                        )})

                        res <- Filter(function(x) !all(is.na(x)), res)

                        return(res)
                }
        }
        atom_parse <- function(doc){
                channel <- xml2::xml_find_all(doc, "channel", ns = xml2::xml_ns(doc))
                site <- xml2::xml_find_all(channel, "item")


                res <- tibble::tibble(
                        title = xml2::xml_text(xml2::xml_find_first(site, ns = xml2::xml_ns(doc), "title")),
                        date = xml2::xml_text(xml2::xml_find_first(site, ns = xml2::xml_ns(doc),
                                                                                "pubDate")) %>%
                                lubridate::parse_date_time(orders = formats) %>%
                                as.character() %>% substr(1,10),
                        linkTitle = xml2::xml_text(xml2::xml_find_first(site, ns = xml2::xml_ns(doc),
                                                                        "link")),
                        source = xml2::xml_text(xml2::xml_find_first(channel, ns = xml2::xml_ns(doc), "title")),
                        description = xml2::xml_text(xml2::xml_find_all(site, ns = xml2::xml_ns(doc), "description")))
                return(res)
        }

        invisible({
                suppressWarnings({
                        stopifnot(identical(length(feed), 1L)) # exit if more than 1 feed provided

                        msg <- "Error in feed parse; please check URL."

                        # skip the ssl verify
                        httr::set_config(httr::config(ssl_verifypeer = 0L,ssl_verifyhost=0L))
                        doc <- try(httr::GET(feed), silent = TRUE)

                        if(any(grepl("xml", doc$all_headers))){
                                doc <- doc %>%
                                        xml2::read_xml()
                        } else{
                                result <- json_parse(feed)
                                if(NROW(result)>0){
                                        return(result)
                                }else{
                                        return(NULL)
                                }

                        }

                        if(unique(grepl('try-error', class(doc)))){
                                stop(msg)
                        }


                        if(grepl("http://www.w3.org/2005/Atom", doc)){

                                result <- atom_parse(doc)

                                if(NROW(result)>0){
                                        return(result)
                                }else{
                                        return(NULL)
                                }

                        } else{
                                result <- rss_parse(doc)

                                if(NROW(result)>0){
                                        return(result)
                                }else{
                                        return(NULL)
                                }
                        }
                })
        })
}
yufree/scifetch documentation built on Oct. 15, 2022, 7:25 p.m.