R/comic_scraper.R

Defines functions download_xkcd_comic download_phd_comic

Documented in download_phd_comic download_xkcd_comic

#' @title
#' Webscrape xkcd comics
#'
#' @description
#' Download xkcd comics based upon the comic id found from the xkcd site.
#' Use in conjunction with purrr::walk to webscrape many comics
#'
#' @param comic_id
#' numeric ID number of the comic from xkcd
#'
#' @param wait_seconds
#' integer number of seconds to wait before downloading the comic
#'
#' @param save_dir
#' string of where to save the downloaded comic
#'
#' @examples
#' download_xkcd_comic(comic_id = 1234, wait_seconds = 5, save_dir = 'xkcd_comics')
#'
#' walk(1:1000, ~ download_xkcd_comic(comic_id = .x, wait_seconds = 5, save_dir = 'xkcd_comics'))
#'
#' @export

download_xkcd_comic <- function(comic_id, wait_seconds = 0, save_dir = "xkcd_comics") {
  # Download the image of a single xkcd comic into `save_dir`.
  #
  # comic_id:     id appended to https://xkcd.com/ to form the comic page URL.
  # wait_seconds: seconds to pause before downloading; NULL skips the pause.
  # save_dir:     directory that receives the image (created when missing).
  #
  # Signals an error when no comic image can be located on the page or when
  # the download reports a non-zero status.
  comic_url <- paste0("https://xkcd.com/", comic_id)
  page <- rvest::html_session(url = comic_url)

  # The comic is taken to be the second <img> on the page.
  # NOTE(review): this depends on xkcd's page layout -- confirm it still holds.
  images <- rvest::html_nodes(page, "img")
  if (length(images) < 2) {
    stop("Could not find a comic image at ", comic_url, call. = FALSE)
  }

  image_src <- rvest::html_attr(images[[2]], "src")
  if (is.na(image_src)) {
    stop("Comic image at ", comic_url, " has no 'src' attribute", call. = FALSE)
  }
  file_name <- basename(image_src)

  if (!dir.exists(save_dir)) {
    message(paste("Creating folder", save_dir, "to save comics"))
    # recursive = TRUE so nested paths such as "comics/xkcd" also work.
    dir.create(save_dir, recursive = TRUE)
  }

  if (!is.null(wait_seconds)) {
    # Announce the pause before sleeping so the user sees it while waiting.
    message(paste("Waiting", wait_seconds, "seconds before starting download"))
    Sys.sleep(wait_seconds)
  }

  # mode = "wb" keeps the binary image intact on Windows.
  status <- utils::download.file(
    url = paste0("https://imgs.xkcd.com/comics/", file_name),
    destfile = file.path(save_dir, file_name),
    mode = "wb"
  )
  # Some download methods report failure via the return code rather than an
  # error; do not claim success in that case.
  if (!identical(as.integer(status), 0L)) {
    stop("Download of ", file_name, " failed with status ", status, call. = FALSE)
  }

  message(paste("Downloaded", file_name, "successful!"))
}

#' @title
#' Webscrape Phd comics
#'
#' @description
#' Download Phd comics based upon the comic id found from the Phd comic site.
#' Use in conjunction with purrr::walk to webscrape many comics
#'
#' @param comic_id
#' numeric ID number of the comic from Phd
#'
#' @param wait_seconds
#' integer number of seconds to wait before downloading the comic
#'
#' @param save_dir
#' string of where to save the downloaded comic
#'
#' @examples
#' download_phd_comic(comic_id = 1234, wait_seconds = 5, save_dir = 'phd_comics')
#'
#' walk(1:1000, ~ download_phd_comic(comic_id = .x, wait_seconds = 5, save_dir = 'phd_comics'))
#'
#' @export
#'

download_phd_comic <- function(comic_id, wait_seconds = NULL, save_dir = 'phd_comics') {
  # Download the image of a single PhD Comics strip into `save_dir`.
  #
  # comic_id:     id appended to the archive URL to form the comic page URL.
  # wait_seconds: seconds to pause before downloading; NULL skips the pause.
  # save_dir:     directory that receives the image (created when missing).
  #
  # Signals an error when no image URL can be located on the page or when
  # the download reports a non-zero status.
  comic_url <- paste0("http://phdcomics.com/comics/archive.php?comicid=", comic_id)

  # Fetch the page once and reuse it (avoids a second, redundant request).
  page <- rvest::html_session(url = comic_url)

  # The image URL is taken from the `content` attribute of the third <meta>
  # tag. NOTE(review): this depends on the site's page layout -- confirm it
  # still holds.
  meta_content <- rvest::html_attr(rvest::html_nodes(page, "meta"), "content")
  if (length(meta_content) < 3 || is.na(meta_content[[3]])) {
    stop("Could not find a comic image URL at ", comic_url, call. = FALSE)
  }
  file_url <- meta_content[[3]]
  file_name <- basename(file_url)

  if (!dir.exists(save_dir)) {
    message(paste('Creating folder', save_dir, 'to save comics'))
    # recursive = TRUE so nested paths such as "comics/phd" also work.
    dir.create(save_dir, recursive = TRUE)
  }

  if (!is.null(wait_seconds)) {
    # Announce the pause before sleeping so the user sees it while waiting.
    message(paste('Waiting', wait_seconds, 'seconds before starting download'))
    Sys.sleep(wait_seconds)
  }

  # mode = "wb" keeps the binary image intact on Windows.
  status <- utils::download.file(
    url = file_url,
    destfile = file.path(save_dir, file_name),
    mode = "wb"
  )
  # Some download methods report failure via the return code rather than an
  # error; do not claim success in that case.
  if (!identical(as.integer(status), 0L)) {
    stop("Download of ", file_name, " failed with status ", status, call. = FALSE)
  }

  message(paste('Download', file_name, 'successful!'))

}
mluu921/comicscrapeR documentation built on May 27, 2019, 1:08 p.m.