inst/updates/scraper.R

#' Update NASCAR Series Data
#'
#' Consolidated scraper for all three NASCAR series (Cup, NXS, Truck).
#' Downloads current data from R2, scrapes new races from DriverAverages.com,
#' combines the results, and uploads back to R2.
#'
#' Uses httr2 for HTTP requests with built-in retry logic, placeholder
#' detection, and track info merging.
#'
#' @param series Character. One of "cup", "nxs", or "truck".
#'
#' @return Invisible NULL. Uploads updated data to R2 as side effect.

update_nascar_series <- function(series) {
  # Validate series parameter
  series <- tolower(series)
  if (!series %in% c("cup", "nxs", "truck")) {
    stop(
      "Invalid series '", series,
      "'. Must be one of: 'cup', 'nxs', 'truck'"
    )
  }

  # Series-specific configuration
  config <- list(
    cup = list(
      base_url = "https://www.driveraverages.com/nascar/",
      r2_key = "cup_series",
      track_info_file = "inst/updates/cup_track_info.rda",
      track_object = "cup_track_info",
      series_name = "Cup"
    ),
    nxs = list(
      base_url = "https://www.driveraverages.com/nascar_xfinityseries/",
      r2_key = "nxs_series",
      track_info_file = "inst/updates/nxs_track_info.rda",
      track_object = "nxs_track_info",
      series_name = "NXS"
    ),
    truck = list(
      base_url = "https://www.driveraverages.com/nascar_truckseries/",
      r2_key = "truck_series",
      track_info_file = "inst/updates/truck_track_info.rda",
      track_object = "truck_track_info",
      series_name = "Truck"
    )
  )

  cfg <- config[[series]]
  message("Updating ", cfg$series_name, " Series data...")

  # Helper: fetch a page with httr2 built-in retry
  get_page <- function(url) {
    tryCatch(
      {
        httr2::request(url) |>
          httr2::req_user_agent(
            "nascaR.data R package (https://github.com/kyleGrealis/nascaR.data)"
          ) |>
          httr2::req_timeout(30) |>
          httr2::req_retry(
            max_tries = 5,
            backoff = ~ 3 * 1.5^.x
          ) |>
          httr2::req_perform() |>
          httr2::resp_body_string() |>
          rvest::read_html()
      },
      error = function(e) {
        stop(
          "[", cfg$series_name, "] Failed to retrieve ", url,
          ": ", conditionMessage(e)
        )
      }
    )
  }

  # Download existing data from R2
  r2_url <- paste0(
    "https://nascar.kylegrealis.com/", cfg$r2_key, ".parquet"
  )
  message("Downloading current data from R2...")

  existing_data <- tryCatch(
    arrow::read_parquet(r2_url),
    error = function(e) {
      stop(
        "[", cfg$series_name,
        "] Failed to download from R2: ",
        conditionMessage(e)
      )
    }
  )

  message(
    "  Loaded ", format(nrow(existing_data), big.mark = ","),
    " existing rows"
  )

  # Load track info
  if (file.exists(cfg$track_info_file)) {
    load(cfg$track_info_file)
    track_info <- get(cfg$track_object)
  } else {
    message("No track info found. Proceeding without track data...")
    track_info <- data.frame(
      Track = character(),
      Length = numeric(),
      Surface = character(),
      stringsAsFactors = FALSE
    )
  }

  # Determine current year and last completed race
  current_year <- as.integer(format(Sys.Date(), "%Y"))

  if (current_year %in% existing_data$Season) {
    last_completed_race <- max(
      existing_data$Race[existing_data$Season == current_year]
    )
  } else {
    last_completed_race <- 0L
    message(
      "No ", cfg$series_name,
      " races completed yet for ", current_year, "."
    )
  }

  # Check for placeholder row from DriverAverages.com
  # They sometimes add a single-row placeholder for next week's race
  # with mostly empty/NA values, causing stale data detection
  placeholder_check <- existing_data |>
    dplyr::filter(
      Season == current_year,
      Race == last_completed_race
    )

  is_placeholder <- nrow(placeholder_check) == 1 &&
    (all(is.na(placeholder_check$Laps) | placeholder_check$Laps == 0))

  if (is_placeholder) {
    existing_data <- existing_data |>
      dplyr::filter(
        !(Season == current_year & Race == last_completed_race)
      )
    last_completed_race <- last_completed_race - 1L
    message("Removed placeholder row(s) for incomplete race")
  }

  # Get race links for current season
  season_url <- paste0(cfg$base_url, "year.php?yr_id=", current_year)

  new_links <- get_page(season_url) |>
    rvest::html_elements("div#Div2Nav ul a") |>
    rvest::html_attr("href") |>
    purrr::keep(~ stringr::str_detect(., stringr::fixed("race.php?")))

  message(
    "Found ", length(new_links), " total ", cfg$series_name,
    " races for ", current_year
  )
  message("Last completed race: ", last_completed_race)

  # Check if up to date
  if (length(new_links) <= last_completed_race) {
    message(
      cfg$series_name, " Series is up-to-date with ",
      last_completed_race, " races"
    )
    return(invisible())
  }

  # Only process new races
  new_links <- new_links[(last_completed_race + 1):length(new_links)]

  if (length(new_links) == 0) {
    message("No new ", cfg$series_name, " races to process")
    return(invisible())
  }

  message("Processing ", length(new_links), " new race(s)...")

  # Process each new race with rate limiting and proper indexing
  new_results <- purrr::imap_dfr(new_links, function(link, race_index) {
    tryCatch({
      race_number <- as.integer(last_completed_race + race_index)

      # Rate limiting: small delay between requests
      Sys.sleep(0.5)

      page <- get_page(paste0(cfg$base_url, link))

      # Extract race details
      details <- page |>
        rvest::html_element("td.td-left span.td-bold") |>
        rvest::html_text2()

      parts <- stringr::str_split(details, "\n")[[1]]
      race_name <- parts[1]
      track_name <- parts[2]

      message(
        "  [Race ", race_number, "] Processing: ",
        track_name
      )

      # Extract race table
      race_table <- page |>
        rvest::html_table(header = TRUE) |>
        purrr::pluck(3)

      # Validate extracted table has expected columns
      if (!is.null(race_table) && nrow(race_table) > 0) {
        expected <- c("Finish", "Start", "Driver")
        if (!all(expected %in% names(race_table))) {
          message(
            "  [Race ", race_number,
            "] Skipping: unexpected table structure"
          )
          return(NULL)
        }
      }

      if (is.null(race_table) || nrow(race_table) == 0) {
        message(
          "  [Race ", race_number,
          "] Skipping: empty or missing table"
        )
        return(NULL)
      }

      # Clean and format data with explicit type coercion
      # S3 is Cup-only (2024+); NXS/Truck don't have it
      has_s3 <- "S3" %in% names(race_table)

      result <- race_table |>
        {\(d) {
          if ("#" %in% names(d)) d <- dplyr::rename(d, Car = `#`)
          d
        }}() |>
        dplyr::mutate(
          Season = current_year,
          Race = race_number,
          Car = stringr::str_remove(Car, "#"),
          Track = track_name,
          Name = race_name,
          Finish = as.integer(Finish),
          Start = as.integer(Start),
          Pts = as.integer(Pts),
          Laps = as.integer(Laps),
          Led = as.integer(Led),
          S1 = as.integer(S1),
          S2 = as.integer(S2),
          S3 = if (has_s3) as.integer(S3) else NA_integer_,
          Rating = as.numeric(Rating),
          Win = dplyr::if_else(Finish == 1L, 1, 0)
        ) |>
        dplyr::left_join(
          track_info |> dplyr::select(Track, Length, Surface),
          by = "Track"
        ) |>
        dplyr::select(
          Season, Race, Track, Name, Length, Surface,
          Finish, Start, Car, Driver, Make, Pts,
          Laps, Led, Status, Team, S1, S2, S3,
          Rating, Win
        )

      # Warn about missing track info
      if (is.na(result$Length[1]) || is.na(result$Surface[1])) {
        message(
          "  [Race ", race_number,
          "] WARNING: '", track_name,
          "' not in track_info (Length/Surface are NA)"
        )
      }

      message("  Processed ", nrow(result), " driver results")
      result
    }, error = function(e) {
      message(
        "  [Race ", race_number,
        "] ERROR: ", conditionMessage(e)
      )
      NULL
    })
  })

  # Combine and upload to R2
  if (nrow(new_results) > 0) {
    updated_data <- dplyr::bind_rows(existing_data, new_results)

    n_new_races <- dplyr::n_distinct(new_results$Race)
    n_new_results <- nrow(new_results)
    message(
      "Added ", n_new_races, " new race(s) with ",
      n_new_results, " total results"
    )

    # Sanity check before upload
    if (nrow(updated_data) < nrow(existing_data)) {
      stop(
        "[", cfg$series_name,
        "] Data shrunk from ", nrow(existing_data),
        " to ", nrow(updated_data), " rows. Aborting upload."
      )
    }

    # Upload combined data to R2
    message("Uploading ", cfg$r2_key, " to R2...")
    nascar_r2_upload(updated_data, cfg$r2_key)
    message("  -> uploaded ", cfg$r2_key, ".parquet to R2")
  } else {
    message("No new ", cfg$series_name, " race data found")
  }

  invisible()
}

Try the nascaR.data package in your browser

Any scripts or data that you put into this service are public.

nascaR.data documentation built on Feb. 14, 2026, 5:07 p.m.