#' Function to scrape Craigslist listings
#'
#' \code{upgo_scrape_cl} scrapes all data from Craigslist rental
#' listings in one or more cities.
#'
#' @param city A character vector indicating the city or cities to scrape.
#' Each element must correspond to the domain prefix used for a city's
#' Craigslist site (e.g. "sfbay" for the San Francisco Bay Area).
#' @param old_results A data frame. If the output of a previous run of
#' \code{upgo_scrape_cl} is supplied, listings previously scraped will
#' be incorporated into the new results.
#' @param old_results_add A logical scalar. Should previously scraped listings
#' from \code{old_results} be added to the scrape queue and checked again, or
#' skipped (default)?
#' @param recovery A logical scalar. Should the function attempt to recover
#' results from a previous, unsuccessful function call?
#' @param proxies Character vector of IPs to use for proxy connections.
#' @param quiet A logical scalar. Should the function execute quietly, or
#' should it return status updates throughout (default)?
#' @return A table with one row per listing scraped.
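#' @examples
#' # A minimal usage sketch, not run by default: it requires a live network
#' # connection, and the subdomains shown are illustrative.
#' \dontrun{
#' # Scrape one city
#' sf_listings <- upgo_scrape_cl("sfbay")
#'
#' # Scrape again later, carrying forward previously scraped listings
#' sf_listings_2 <- upgo_scrape_cl("sfbay", old_results = sf_listings)
#'
#' # If a multi-city run failed partway through, resume from the saved state
#' results <- upgo_scrape_cl(c("sfbay", "vancouver"), recovery = TRUE)
#' }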
#' @importFrom crayon bold cyan italic silver
#' @importFrom dplyr %>% arrange bind_rows filter if_else mutate
#' @importFrom glue glue
#' @importFrom progressr progressor with_progress
#' @importFrom purrr map map_dfr
#' @importFrom rlang .data
#' @importFrom stringr str_extract str_starts
#' @export
upgo_scrape_cl <- function(city, old_results = NULL, old_results_add = FALSE,
recovery = FALSE, proxies = NULL, quiet = FALSE) {
### SETUP ####################################################################
## Check for packages --------------------------------------------------------
helper_require("rvest")
helper_require("xml2")
## Initialize variables ------------------------------------------------------
# Quiet R CMD check
.temp_url_list <- .temp_listings <- .temp_results <- .temp_finished_flag <-
NULL
# Prepare for parallel processing
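  # doFuture::registerDoFuture() makes any future plan set by the user
  # (e.g. future::plan(multisession)) available as a foreach backend, so the
  # user's plan determines how parallel work is dispatched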
if (requireNamespace("future", quietly = TRUE)) {
doFuture::registerDoFuture()
}
## Validate city argument ----------------------------------------------------
if (!all(city %in% possible_cities)) {
invalid_cities <- city[!city %in% possible_cities]
if (length(invalid_cities) == 1) {
stop("Invalid input to `city` argument (", invalid_cities,
"). Inputs must correspond to ***.craigslist.org subdomains.")
}
    if (length(invalid_cities) > 1) {
      stop("Invalid inputs to `city` argument (",
           paste(invalid_cities, collapse = ", "),
           "). Inputs must correspond to ***.craigslist.org subdomains.")
    }
}
## Restore object if recovery == TRUE ----------------------------------------
  if (recovery) {
    # Make sure the recovery objects actually exist before trying to get() them
    recovery_objects <- c(".temp_url_list", ".temp_listings", ".temp_results",
                          ".temp_finished_flag")
    if (!all(vapply(recovery_objects, exists, logical(1),
                    envir = .GlobalEnv))) {
      stop("No recovery data found. Try re-running with recovery = FALSE.")
    }
    finished_flag <- get(".temp_finished_flag", envir = .GlobalEnv)
    # Check if finished_flag vector is the right length
    if (length(finished_flag) != length(city)) {
      stop("The recovery data does not match the inputs. ",
           "Try re-running with recovery = FALSE.")
    }
    url_list <- get(".temp_url_list", envir = .GlobalEnv)
    listings <- get(".temp_listings", envir = .GlobalEnv)
    results <- get(".temp_results", envir = .GlobalEnv)
## Initialize objects if recovery == FALSE -----------------------------------
} else {
url_list <- vector("list", length(city))
listings <- vector("list", length(city))
results <- vector("list", length(city))
finished_flag <- map(seq_along(city), ~FALSE)
}
## Set on.exit expression ----------------------------------------------------
on.exit({
.temp_url_list <<- url_list
.temp_listings <<- listings
.temp_finished_flag <<- finished_flag
.temp_results <<- results
})
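  # If the function exits early (e.g. a dropped connection), the
  # superassignments above persist the partial state to the global
  # environment, where a follow-up call with recovery = TRUE can retrieve it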
## Initialize proxies --------------------------------------------------------
  if (!is.null(proxies)) {
# Put proxy list in .upgo_env so it can be accessed from child functions
.upgo_env$proxy_list <- proxies
on.exit(rlang::env_unbind(.upgo_env, "proxy_list"), add = TRUE)
}
### MAIN SCRAPING LOOP #######################################################
for (n in seq_along(city)) {
## Get city_name -----------------------------------------------------------
city_name <- city[[n]]
## Skip city if already finished in recovery data --------------------------
if (finished_flag[[n]]) {
if (!quiet) message(silver(bold(glue(
"({n}/{length(city)}) ",
"Recovery data for '{city_name}' detected; skipping scrape."))))
next
}
if (!quiet) message(silver(bold(glue(
"({n}/{length(city)}) ",
"Scraping Craigslist rental listings in '{city_name}' ",
"with {helper_plan()}."))))
## Retrieve URLs -----------------------------------------------------------
start_time <- Sys.time()
    # (The quiet and non-quiet code paths are currently identical)
    url_list[[n]] <- helper_urls_cl(city_name, proxies)
# Clean up
total_time <- Sys.time() - start_time
time_final_1 <- substr(total_time, 1, 4)
time_final_2 <- attr(total_time, 'units')
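    # (substr keeps roughly three significant digits of the elapsed time; the
    # units string, e.g. "secs" or "mins", comes from the difftime attribute)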
if (!quiet) {
message(silver(length(url_list[[n]]), "listing URLs scraped in "),
cyan(time_final_1, time_final_2), silver("."))
}
## Process duplicate listings if old_results is provided -------------------
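    # With old_results_add = TRUE, still-active URLs from the previous run are
    # re-queued for scraping; with the default FALSE, matching listings are
    # stamped with today's date and skipped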
    if (!is.null(old_results)) {
if (old_results_add) {
active_urls <-
old_results %>%
# Find listings which were active in the last scrape
dplyr::filter(.data$scraped == max(.data$scraped),
.data$city == city_name) %>%
# But remove duplicates with newly scraped URLs
        dplyr::filter(!.data$url %in% url_list[[n]]) %>%
        dplyr::pull(.data$url)
url_list[[n]] <-
c(url_list[[n]], active_urls)
if (!quiet) message(crayon::silver(glue::glue(
"{length(active_urls)} previously scraped listings to be checked.")))
} else {
      updated_results <-
        old_results %>%
        filter(.data$city == city_name, .data$url %in% url_list[[n]]) %>%
        mutate(scraped = Sys.Date())
old_results <-
old_results %>%
dplyr::anti_join(updated_results, by = "id") %>%
bind_rows(updated_results)
url_list[[n]] <-
url_list[[n]][!url_list[[n]] %in% updated_results$url]
# Advance loop early if there are no new listings
if (length(url_list[[n]]) == 0) {
        # Use single-bracket assignment so NULL doesn't drop the list element
        listings[n] <- list(NULL)
results[[n]] <- old_results
if (!quiet) message(silver(glue(
"{nrow(updated_results)} previously scraped listings still active. ",
"No new results to scrape.")))
next
}
if (!quiet) message(silver(glue(
"{nrow(updated_results)} previously scraped listings still active.")))
}
}
## Scrape individual pages -------------------------------------------------
start_time <- Sys.time()
handler_upgo("Scraping listing")
    # (The quiet and non-quiet code paths are currently identical)
    listings[[n]] <- helper_download_listing(
      paste0(url_list[[n]], "?lang=en&cc=us"), proxies)
# Clean up
total_time <- Sys.time() - start_time
time_final_1 <- substr(total_time, 1, 4)
time_final_2 <- attr(total_time, 'units')
if (!quiet) {
message(silver(length(listings[[n]]), "listings scraped in "),
cyan(time_final_1, time_final_2), silver("."))
}
## Parse HTML --------------------------------------------------------------
handler_upgo("Parsing result")
start_time <- Sys.time()
if (requireNamespace("future", quietly = TRUE)) {
if (!quiet) {
with_progress({
pb <- progressor(along = listings[[n]])
results[[n]] <-
furrr::future_map2_dfr(listings[[n]], url_list[[n]], ~{
pb()
helper_parse_cl(.x, .y, city_name)
})})
} else {
results[[n]] <-
furrr::future_map2_dfr(listings[[n]], url_list[[n]], helper_parse_cl,
city_name)
}
} else {
if (!quiet) {
with_progress({
pb <- progressor(along = listings[[n]])
results[[n]] <-
purrr::map2_dfr(listings[[n]], url_list[[n]], ~{
pb()
helper_parse_cl(.x, .y, city_name)
})})
} else {
results[[n]] <-
purrr::map2_dfr(listings[[n]], url_list[[n]], helper_parse_cl,
city_name)
}
}
## Clean up ----------------------------------------------------------------
total_time <- Sys.time() - start_time
time_final_1 <- substr(total_time, 1, 4)
time_final_2 <- attr(total_time, 'units')
if (!quiet) {
message(silver(nrow(results[[n]]), "listings parsed in "),
cyan(time_final_1, time_final_2), silver("."))
}
## Rbind with old_results if present, then arrange -------------------------
    if (!is.null(old_results)) {
results[[n]] <-
old_results %>%
filter(.data$city == city_name) %>%
bind_rows(results[[n]])
}
results[[n]] <-
results[[n]] %>%
arrange(.data$id)
## Set finished_flag upon successfully completing a city -------------------
finished_flag[[n]] <- TRUE
}
### RBIND AND RETURN RESULTS #################################################
results <- bind_rows(results)
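  # Ensure every listing ID carries the "cl-" prefix exactly once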
  results <- dplyr::mutate(
    results, id = if_else(str_starts(.data$id, "cl-"), .data$id,
                          paste0("cl-", .data$id)))
return(results)
}