# File: R/scrapeExercises.R

#' Scrape exercises from internet
#'
#' Scrapes the exercise finder on bodybuilding.com one results page at a time
#' and combines the pages into a single data frame containing exercise name,
#' targeted muscle group, numeric grade and a rating label derived from the
#' grade. Exercises whose grade is not numeric are labelled "Not yet rated".
#'
#' @param pages Number of result pages to scrape, starting at page 1. Must be
#'   a single finite number >= 1 (fractions are truncated). Defaults to 73 as
#'   it is the current max.
#'
#' @return A data.frame (tibble) with one row per exercise and the columns
#'   `exercises` (character), `muscles` (factor), `grade` (numeric, `NA` when
#'   the grade text is not a number) and `rating` (factor with levels
#'   "Excellent", "Good", "Average", "Not yet rated").
#'
#' @export
#' @importFrom rvest html_nodes html_text
#' @importFrom xml2 read_html
#' @importFrom tibble tibble
#' @importFrom dplyr mutate case_when bind_rows
#'
#' @examples
#' \dontrun{
#' # Needs network access, so it is not run during package checks.
#' df <- scrapeExercises(pages = 5)
#' head(df)
#' }
scrapeExercises <- function(pages = 73) {
  # Validate before any network traffic. With the old `1:pages`, pages = 0
  # silently counted down (1, 0) and requested pages nobody asked for.
  pages_ok <- is.numeric(pages) && length(pages) == 1L &&
    is.finite(pages) && pages >= 1
  if (!pages_ok) {
    stop("`pages` must be a single finite number >= 1.", call. = FALSE)
  }

  url_base <- "https://www.bodybuilding.com/exercises/finder/"

  # Whitespace-trimmed text of every node on `page` matching selector `css`.
  node_text <- function(page, css) {
    trimws(rvest::html_text(rvest::html_nodes(page, css)))
  }

  # Download result page `i` and return its exercises as a tibble.
  scrape_page <- function(i) {
    cat(".") # progress indicator: one dot per page

    page <- xml2::read_html(paste0(url_base, i))

    # Strip the "Muscle Targeted:" label, line breaks and spaces so only the
    # muscle name remains.
    muscles <- node_text(page, ".ExResult-muscleTargeted")
    muscles <- gsub("Muscle Targeted:\n", "", muscles)
    muscles <- gsub("\n", "", muscles)
    muscles <- gsub(" ", "", muscles)

    # NOTE(review): assumes every selector matches exactly once per exercise;
    # otherwise tibble() fails on unequal column lengths -- confirm against
    # the page markup.
    tibble::tibble(
      exercises = node_text(page, ".ExResult-resultsHeading"),
      muscles = muscles,
      grade = node_text(page, ".ExRating-badge")
    )
  }

  # Collect all pages first and bind once instead of growing a data frame
  # inside the loop.
  page_tables <- lapply(seq_len(as.integer(pages)), scrape_page)
  exercise_df <- dplyr::bind_rows(page_tables)

  # The rating is derived from the grade rather than scraped: the
  # '.ExRating-description--Excellent' selector only finds Excellent
  # exercises and therefore induces errors.
  exercise_df <- dplyr::mutate(
    exercise_df,
    # Non-numeric grade text is expected for unrated exercises; it becomes NA
    # on purpose, so the coercion warning is noise here.
    grade = suppressWarnings(as.numeric(grade)),
    rating = factor(
      dplyr::case_when(
        is.na(grade) ~ "Not yet rated",
        grade >= 8.0 ~ "Excellent",
        grade >= 5.0 ~ "Good",
        # Everything below 5 is "Average"; the former `grade <= 4.9` left
        # grades between 4.9 and 5.0 without any rating.
        TRUE ~ "Average"
      ),
      levels = c("Excellent", "Good", "Average", "Not yet rated")
    ),
    muscles = factor(muscles)
  )

  # Return visibly (the original ended in an assignment, so the result was
  # invisible when the call was not assigned).
  janitor::clean_names(exercise_df, case = "lower_camel")
}
# Source: MarijnJABoer/HeavySetR documentation, built on May 22, 2019, 5:31 p.m.