R/top_grossing.R

Defines functions get_rank_data clean_top_grossing top_grossing

Documented in top_grossing

#' Get the top N ranking grossing movies
#'
#' @param type
#' A string that says which type of box office sorted ranking you want. Options are
#' 'american' (American box office), 'international' (non-American) and
#' 'worldwide' (domestic + international box office).
#'
#' @param ranks
#' A vector of integers for the rankings you want it to return. For example,
#' an input of 1:5 will return the top 5 grossing movies.
#'
#' @return
#' Data frame returning info on the name of the movie, its rank,
#' the year the movie was released, and the total gross from
#' domestic (American), international, and total ticket sales.
#' @export
#'
#' @examples
#' top_grossing()
#'
#' top_grossing(ranks = 1:5)
#' top_grossing(ranks = 201:205)
#'
#' top_grossing(type = "international")
#' top_grossing(type = "international", ranks = 1:10)
top_grossing <- function(type = "american",
                         ranks = 1:100) {

  # Validate the arguments up front: `ranks` must be a non-empty numeric
  # vector without missing values (otherwise min()/max() below would warn or
  # produce NA), and `type` must be a single string.
  stopifnot(is.numeric(ranks),
            length(ranks) > 0,
            !anyNA(ranks),
            is.character(type),
            length(type) == 1)
  type <- tolower(type)

  if (!type %in% c("american", "international", "worldwide")) {
    stop(paste("type must be one of the following:",
               "'american' - Ranked by domestic gross (United States), all movies",
               "'international' - Ranked by international gross, all movies",
               "'worldwide' - Ranked by worldwide gross (domestic + international), all movies",
               sep = "\n"))
  }

  first_rank <- min(ranks)
  last_rank  <- max(ranks)
  if (first_rank < 1) {
    stop("ranks cannot be a number below 1.")
  }

  # Translate the user-facing type into the path segment used in the URL.
  url_type <- switch(type,
                     american      = "domestic/all-movies",
                     international = "international/all-movies",
                     worldwide     = "worldwide/all-movies")
  url_start <- paste0("https://www.the-numbers.com/box-office-records/",
                      url_type,
                      "/cumulative/all-time")

  # One request per step of 100 ranks, starting at the lowest requested rank.
  page_numbers <- seq(first_rank, last_rank, 100)
  # NOTE(review): the extra request for `last_rank` when the first rank is a
  # multiple of 100 is kept from the original implementation; presumably it
  # works around how the site paginates - confirm against the live site.
  if (first_rank %% 100 == 0) {
    page_numbers <- c(page_numbers, last_rank)
  }
  # Never request the same page twice (e.g. ranks = 100:200 would otherwise
  # yield 100, 200, 200 and duplicate every row of the last page).
  page_numbers <- unique(page_numbers)

  # Collect each page in a preallocated list and bind once at the end instead
  # of growing a data frame inside the loop.
  pages <- vector("list", length(page_numbers))
  for (i in seq_along(page_numbers)) {
    page_data <- get_rank_data(url_start, page_numbers[i], type = url_type)
    # A NULL page means the scrape failed; give up and return NULL.
    if (is.null(page_data)) {
      return(NULL)
    }
    pages[[i]] <- page_data
  }
  final_data <- do.call(rbind, pages)

  final_data <- clean_top_grossing(final_data, ranks)

  message("Please note that these numbers are not adjusted for inflation.")
  final_data
}


# Tidy the raw scraped ranking table.
#
# data:  data frame of scraped rows whose column names are the table headers
#        (e.g. "Rank", "Movie", "DomesticBox_Office").
# ranks: numeric vector of rankings to keep.
#
# Returns a data frame with the columns rank, movie, year_released,
# american_box_office, international_box_office and total_box_office,
# restricted to rows whose rank is in `ranks`, with row names renumbered
# from 1. A zero-row data frame is returned when no rank matches.
clean_top_grossing <- function(data, ranks) {
  # Scraped header -> output column name. Both "Released" and "Year" map to
  # year_released.
  renames <- c(Rank                    = "rank",
               Released                = "year_released",
               Year                    = "year_released",
               Movie                   = "movie",
               DomesticBox_Office      = "american_box_office",
               InternationalBox_Office = "international_box_office",
               WorldwideBox_Office     = "total_box_office")

  col_names <- gsub(" ", "_", names(data))
  known <- col_names %in% names(renames)
  col_names[known] <- unname(renames[col_names[known]])
  names(data) <- col_names

  # numeric_cleaner() is defined elsewhere in the package; presumably it
  # strips formatting such as "$" and "," and converts to numeric - confirm.
  numeric_cols <- c("rank",
                    "year_released",
                    "american_box_office",
                    "international_box_office",
                    "total_box_office")
  for (col in numeric_cols) {
    data[[col]] <- numeric_cleaner(data[[col]])
  }

  data <- data[data$rank %in% ranks, ]
  data <- data[, c("rank",
                   "movie",
                   "year_released",
                   "american_box_office",
                   "international_box_office",
                   "total_box_office")]
  # seq_len() rather than 1:nrow(): 1:0 is c(1, 0), which makes the row-name
  # assignment fail when no row matched the requested ranks.
  rownames(data) <- seq_len(nrow(data))
  data
}


# Download one page of a ranking table and return it as a data frame.
#
# url:         base URL of the ranking, without the page suffix.
# page_number: page suffix appended to the URL as "/<page_number>"; the value
#              1 requests the base URL with no suffix.
# type:        URL path segment of the ranking; "domestic/all-movies" is
#              reshaped into 7 columns, every other value into 6.
#
# Returns a data frame of character columns named after the table's header
# row, or NULL (after emitting a message) when the page could not be fetched.
get_rank_data <- function(url, page_number, type) {
  useragent <- paste0("Mozilla/5.0 (compatible; a bot using the R boxoffice",
                      " package; https://github.com/jacobkap/boxoffice/)")
  page_suffix <- if (page_number == 1) "" else paste0("/", page_number)
  page_url <- paste0(url, page_suffix)

  response <- tryCatch(
    httr::GET(page_url, httr::user_agent(useragent)),
    error = function(e) NULL
  )

  # Stop here on a failed request. Previously the NULL produced by the error
  # handler was passed on to httr::content(), which raised an error instead of
  # letting the caller see the NULL it checks for. HTTP error statuses
  # (4xx/5xx) are treated the same way.
  if (is.null(response) || httr::http_error(response)) {
    message(paste0(page_url, " could not be scraped. Please check ",
                   "that the website is available or ",
                   "check your internet connection."))
    return(NULL)
  }

  parsed <- httr::content(response, "parsed", encoding = "UTF-8")
  cells <- rvest::html_text(rvest::html_nodes(parsed, "th , td"))

  # The cells arrive as one flat vector in document order; fold them into a
  # matrix with one table row per column, then transpose so each table row
  # becomes a matrix row.
  n_cols <- if (type == "domestic/all-movies") 7 else 6
  dim(cells) <- c(n_cols, length(cells) / n_cols)
  page <- data.frame(t(cells), stringsAsFactors = FALSE)

  # The first row holds the header cells: use it as column names, then drop it.
  names(page) <- unlist(page[1, ], use.names = FALSE)
  page <- page[-1, ]
  page
}
jacobkap/boxoffice documentation built on July 27, 2020, 4:55 a.m.