R/getting_ratings.R

Defines functions grab_imdb_ratings

Documented in grab_imdb_ratings

#' Go Get IMDb Ratings
#'
#' Go get the data about your favorite show on IMDb. Grabs ratings for every
#' episode of selected seasons by reading each season's episode-list page.
#'
#' @param imdb_code String in URL that identifies the desired show
#'   (format: "tt<numbers>"). Must be a single, non-missing string.
#' @param seasons Vector of the seasons to include. Duplicated values are
#'   fetched only once, and seasons appear in the result in the order given.
#' @return Dataframe of the desired show's episodes (tibble), one row per
#'   episode, with the seasons stacked on top of each other.
#' @details Breakdown of dataframe contents
#' \describe{
#'  \item{show}{Name of the show}
#'  \item{season}{Season Number}
#'  \item{episode}{Episode Number (position of the episode on the season page)}
#'  \item{air_date}{Date the episode originally aired}
#'  \item{title}{Name of the Episode}
#'  \item{rating}{IMDb rating for the episode}
#'  \item{votes}{How many votes the IMDb rating is based off of}
#' }
#'
#' Progress (the show code and each season being fetched) is reported with
#' \code{message()}, so it can be silenced with \code{suppressMessages()}.
#' @examples
#' \dontrun{
#' # The Magicians (https://www.imdb.com/title/tt4254242/)
#' grab_imdb_ratings("tt4254242", c(1:5))
#' }
#' @export
grab_imdb_ratings <- function(imdb_code, seasons) {
  # Fail early with a clear message instead of deep inside glue/read_html.
  if (!is.character(imdb_code) || length(imdb_code) != 1L ||
      is.na(imdb_code)) {
    stop(
      "`imdb_code` must be a single string such as \"tt4254242\".",
      call. = FALSE
    )
  }

  base_url <- "https://www.imdb.com/title/"

  message("Show: ", imdb_code)

  # One tibble per season. Collecting with lapply() (rather than assigning to
  # df_list[[season]]) avoids using the season value as a list index, which
  # errors for 0 / negative seasons and overwrites duplicated seasons.
  season_dfs <- lapply(unique(seasons), function(season_id) {
    message("Season: ", season_id)

    # define url, using glue for combining strings
    season_url <- glue::glue(
      "{base_url}{imdb_code}/episodes?season={season_id}"
    )

    # go get the html
    html <- xml2::read_html(season_url)

    # isolate the desired data
    show <- rvest::html_nodes(html, ".parent a") %>%
      rvest::html_text(trim = TRUE)
    title <- rvest::html_nodes(html, "#episodes_content strong a") %>%
      rvest::html_text(trim = TRUE)
    rating <- rvest::html_nodes(
      html, ".ipl-rating-star.small .ipl-rating-star__rating"
    ) %>%
      rvest::html_text(trim = TRUE) %>%
      as.numeric()
    votes <- rvest::html_nodes(html, ".ipl-rating-star__total-votes") %>%
      rvest::html_text(trim = TRUE) %>%
      readr::parse_number() # strips the surrounding "(…)" and separators
    air_date <- rvest::html_nodes(html, ".airdate") %>%
      rvest::html_text(trim = TRUE) %>%
      # remove periods (May doesn't have a period like the rest: Apr., Oct.)
      stringr::str_remove("[.]") %>%
      readr::parse_date("%d %b %Y")

    # NOTE: tibble() requires these vectors to share a length (or be length
    # 1); if a selector matches a different number of nodes than the others,
    # this call errors rather than misaligning rows.
    tibble::tibble(show, air_date, title, rating, votes) %>%
      dplyr::mutate(
        season = season_id,
        # seq_len() yields an empty vector for a season with no rows,
        # whereas seq(1, 0) would yield c(1, 0) and break mutate().
        episode = seq_len(dplyr::n())
      ) %>%
      dplyr::select(show, season, episode, dplyr::everything())
  })

  # smoosh the list into one tibble
  dplyr::bind_rows(season_dfs)
}
zachbogart/imdbScraper documentation built on Jan. 1, 2021, 1:49 p.m.