#' Clean web-scraped rental listing data.
#'
#' Takes the tibble of raw scraped listings and derives typed columns from its
#' free-text fields: the price is converted to an integer, the number of
#' bedrooms and the floor area are parsed out of `house_type`, and the city is
#' recognised from the slug that follows `d/` in `listing_url`.
#'
#' The function stops with an error if `scraped_df` is not a tibble, has no
#' rows, or lacks any of the `price`, `house_type` or `listing_url` columns.
#'
#' @param scraped_df A non-empty tibble containing at least the columns
#'   `price` (e.g. `"$1,300"`), `house_type` (e.g. `"1br-600ft2-"`) and
#'   `listing_url`.
#'
#' @return A tibble with one row per input row and the columns `listing_url`,
#'   `price` (integer), `num_bedroom` (integer), `area_sqft` (integer) and
#'   `city` (lower-case character). Bedroom counts, areas and cities that
#'   cannot be found are `NA`; only cities from the built-in Metro Vancouver
#'   list are recognised.
#' @export
#' @examples
#' data_cleaner(tibble::tibble(
#'   "price" = c("$1,300", "$1,200"),
#'   "house_type" = c("1br-600ft2-", "2br-"),
#'   "listing_url" = c(
#'     "https://vancouver.craigslist.org/bnc/apa/d/burnaby-must-see-1br-suite/7282955370.html",
#'     "https://vancouver.craigslist.org/rds/apa/d/surrey-bedroom-basement-for-rent/7273877138.html"
#'   )
#' ))
data_cleaner <- function(scraped_df) {
  # Validate the input up front. The messages are kept exactly as before so
  # that callers and tests matching on them keep working.
  if (!tibble::is_tibble(scraped_df)) {
    stop("Cannot do the data cleaning for a non-tibble input")
  }
  if (nrow(scraped_df) == 0) {
    stop("Cannot do the data cleaning for a empty tibble")
  }
  for (required_col in c("price", "house_type", "listing_url")) {
    if (!required_col %in% colnames(scraped_df)) {
      stop("The ", required_col, " column is missing")
    }
  }

  cleaned_df <- scraped_df

  # Cities in Metro Vancouver that can be recognised from a listing URL.
  city_list <- c(
    "Bowen Island", "Burnaby", "Coquitlam", "Delta", "Langley", "Maple Ridge",
    "New Westminster", "North Vancouver", "Pitt Meadows", "Port Coquitlam",
    "Port Moody", "Richmond", "Surrey", "Vancouver", "West Vancouver",
    "White Rock"
  )

  # Price: drop the currency symbol and thousands separators, e.g.
  # "$1,300" -> 1300.
  price_text <- stringr::str_remove_all(as.character(cleaned_df$price), "[$,]")
  cleaned_df$price <- as.integer(price_text)

  # Bedrooms and area are both encoded in house_type (e.g. "1br-600ft2-").
  # str_match() returns NA for rows without a match, so no list column or
  # string-coercion workaround is needed; the first match is used if a row
  # contains several.
  house_type <- as.character(cleaned_df$house_type)
  bedroom_text <- stringr::str_match(house_type, "([0-9]+)br")[, 2]
  area_text <- stringr::str_match(house_type, "([0-9]+)ft2")[, 2]
  # suppressWarnings() only hides the integer-overflow warning for absurdly
  # long digit runs, which become NA as before.
  cleaned_df$num_bedroom <- suppressWarnings(as.integer(bedroom_text))
  cleaned_df$area_sqft <- suppressWarnings(as.integer(area_text))

  # City: the listing slug follows "d/" and starts with the hyphenated,
  # lower-case city name (e.g. "d/north-vancouver-2br-suite"). Match the slug
  # against the known city names, longest first, so that multi-word cities
  # are recognised instead of only the first word being looked up.
  city_slugs <- stringr::str_replace_all(tolower(city_list), " ", "-")
  city_slugs <- city_slugs[order(nchar(city_slugs), decreasing = TRUE)]
  city_pattern <- paste0(
    "d/(", paste(city_slugs, collapse = "|"), ")(?:[-/]|$)"
  )
  matched_slug <- stringr::str_match(
    as.character(cleaned_df$listing_url), city_pattern
  )[, 2]
  cleaned_df$city <- stringr::str_replace_all(matched_slug, "-", " ")

  # Keep only the columns that are useful for later filtering.
  dplyr::select(
    cleaned_df,
    dplyr::all_of(
      c("listing_url", "price", "num_bedroom", "area_sqft", "city")
    )
  )
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.