R/rss_parse.R

Defines functions rss_parse

rss_parse <- function(response, list, clean_tags, parse_dates) {
  # spec here: https://validator.w3.org/feed/docs/rss2.html
  res <- read_xml(response, options = "HUGE")
  geocheck(res)

  channel <- xml_find_first(res, "//*[name()='channel']")
  # meta data. Necessary: title, link, description
  metadata <- tibble(
    feed_title = xml_find_first(channel, "//*[name()='title']") %>% xml_text(),
    feed_link = xml_find_first(channel, "//*[name()='link']") %>% xml_text(),
    feed_description = xml_find_first(channel, "//*[name()='description']") %>%
      xml_text()
  )
  # optional metadata: language, copyright, managingEditor, webMaster, pubDate,
  # lastBuildDate; category, generator, docs, cloud, link, managingEditor,
  # podcast:guid, podcast:license, podcast:locked, podcast:funding,
  # podcast:location, podcast:trailer, ttl, image, textInput,
  # skipHours, skipDays
  meta_optional <- tibble(
    feed_language = safe_run(channel, "first", "//*[name()='language']"),
    feed_managing_editor = safe_run(channel,
                                    "first", "//*[name()='managingEditor']"),
    feed_web_master = safe_run(channel, "first", "//*[name()='webMaster']"),
    feed_pub_date = safe_run(channel, "first", "//*[name()='pubDate']"),
    feed_last_build_date = safe_run(channel,
                                    "first", "//*[name()='lastBuildDate']"),
    feed_category = list(category = safe_run(
      channel, "first", "//*[name()='category']"
      )),
    feed_generator = safe_run(channel, "first", "//*[name()='generator']"),
    feed_docs = safe_run(channel, "first", "//*[name()='docs']"),
    feed_managingEditor = safe_run(channel, "first", "//*[name()='managingEditor']"),
    feed_webMaster = safe_run(channel, "first", "//*[name()='webMaster']"),
    feed_guid = safe_run(channel, "first", "//*[name()='podcast:guid']"),
    feed_license = safe_run(channel, "first", "//*[name()='podcast:license']"),
    feed_locked = safe_run(channel, "first", "//*[name()='podcast:locked']"),
    feed_funding = safe_run(channel, "first", "//*[name()='podcast:funding']"),
    feed_location = safe_run(channel, "first", "//*[name()='podcast:location']"),
    feed_trailer = safe_run(channel, "first", "//*[name()='podcast:trailer']"),
    feed_ttl = safe_run(channel, "first", "//*[name()='ttl']")
  )
  meta <- bind_cols(metadata, meta_optional)
  # entries
  # necessary: title or description
  res_entry <- xml_find_all(channel, "//*[name()='item']") %>% as_list()
  res_entry_xml <- xml_find_all(channel, "//*[name()='item']")

  entries <- tibble(
    item_title = map(res_entry, "title", .default = def) %>% unlist(),
    item_link = map(res_entry, "link", .default = def) %>% unlist(),
    item_description = map(res_entry, "description", .default = def) %>%
      replace_null() %>% discard(safe_check_comment) %>% unlist(),
    item_pub_date = map(res_entry, "pubDate", .default = def) %>% unlist(),
    item_guid = map(res_entry, "guid", .default = def) %>% unlist(),
    item_author = map(res_entry, "author", .default = def),
    item_season = map(res_entry, "podcast:season", .default = def),
    item_episode = map(res_entry, "podcast:episode", .default = def),
    item_enclosure = map(res_entry, "enclosure", .default = def),
    item_category = map(res_entry_xml, ~ {
      xml_find_all(.x, "category") %>% map(xml_text)
    }),
    item_comments = map(res_entry, "comments", .default = def) %>% unlist(),
    item_duration = map(res_entry, "duration", .default = def) %>% unlist()
  )

  # clean up
  meta <- clean_up(meta, "rss", clean_tags, parse_dates)
  entries <- clean_up(entries, "rss", clean_tags, parse_dates)

  if (isTRUE(list)) {
    out <- list(meta = meta, entries = entries) # nocov
    return(out) # nocov
  } else {
    if (!has_name(meta, "feed_title")) {
      meta$feed_title <- NA_character_ # nocov
    }
    entries$feed_title <- meta$feed_title
    out <- suppressMessages(safe_join(meta, entries))
    if (is.null(out$error)) {
      out <- out$result
      if (all(is.na(out$feed_title))) out <- out %>% select(-feed_title) # nocov
      return(out)
    } else {
      # nocov start
      meta$tmp <- "temp"
      entries$tmp <- "temp"
      out <- suppressMessages(full_join(meta, entries))
      out <- out %>% select(-tmp)
      return(out)
      # nocov end
    }
  }
}
RobertMyles/tidyRSS documentation built on April 23, 2024, 3:35 a.m.