# gdeltr2::load_needed_packages(c("jsonlite", "purrr", "tidyr", "glue", "stringr", "curl", "dplyr", "rvest", 'lubridate', "requestsR", "stringi"))
.curl_page <-
function(url) {
df_call <- generate_url_reference()
h <-
new_handle(
accept_encoding = NULL,
verbose = F,
useragent = df_call$urlReferer
)
page <-
curl(url, handle = h) %>%
read_lines() %>%
str_c(collapse = "") %>%
read_html()
page
}
.curl_json <-
function(url) {
df_call <- generate_url_reference()
h <-
new_handle(
accept_encoding = NULL,
verbose = F,
useragent = df_call$urlReferer
)
json_data <-
curl(url, handle = h) %>%
read_lines()
json_data
}
# munge -------------------------------------------------------------------
.munge_realtor <-
function(data) {
data <-
data %>%
mutate_if(is.character,
funs(ifelse(. == "", NA_character_, .))) %>%
dplyr::select(which(colMeans(is.na(.)) < 1))
num_names <-
data %>%
dplyr::select(
dplyr::matches(
"^area|^count[A-Z]|^price|^latitude|^longitude|^year|^index|^id[A-Z]|^number[A-Z]|^size"
)
) %>%
dplyr::select(-dplyr::matches("countyProperty")) %>%
names()
data <- data %>% select(-dplyr::matches("remove"))
if (num_names %>% length() > 0) {
data <-
data %>%
mutate_at(num_names,
funs(. %>% as.character() %>% parse_number())) %>%
suppressWarnings() %>% suppressMessages()
}
log_names <-
data %>% dplyr::select(dplyr::matches("^is|^has")) %>% names()
data <-
data %>%
mutate_at(log_names,
funs(as.logical))
if (data %>% has_name("slugLDP")) {
data <-
data %>%
mutate(urlListing = glue("https://www.realtor.com/{slugLDP}") %>% as.character()) %>%
dplyr::select(-slugLDP)
}
if (data %>% has_name("areaPropertySF") &&
data %>% has_name("priceDisplay")) {
data <-
data %>%
mutate(pricePerSFListing =
case_when( areaPropertySF > 0 ~
(priceDisplay / areaPropertySF) %>% round(digits = 2),
TRUE ~ NA_real_
))
}
if (data %>% has_name("areaPropertySF") &&
data %>% has_name("priceListing")) {
data <-
data %>%
mutate(pricePerSFListing =
case_when(
areaPropertySF > 0 ~ (priceListing / areaPropertySF) %>% round(digits = 2)
,
TRUE ~ NA_real_
))
}
if (data %>% has_name("nameBrokerage")) {
data <-
data %>%
mutate(nameBrokerage = nameBrokerage %>% str_to_upper())
data <-
data %>%
separate(
col = nameBrokerage,
into = c("nameBrokerage", "locationOffice"),
sep = "\\-"
) %>%
suppressWarnings() %>%
mutate_if(is.character,
str_trim)
}
date_names <-
names(data)[names(data) %>% str_detect("date[A-Z]")]
date_names <- date_names[!date_names %>% str_detect("datetime")]
if (date_names %>% length() > 0 ) {
data <- data %>%
mutate_at(date_names,
funs(. %>% mdy()))
}
datetime_names <-
data %>% dplyr::select(dplyr::matches("^datetime")) %>% names()
if (datetime_names %>% length() > 0 ) {
data <-
data %>%
mutate_at(datetime_names, funs(. %>% ymd_hms))
}
data
}
# dict --------------------------------------------------------------------
dictionary_css_page <-
function() {
tibble(
id = c(
"ra ra-status-sale",
"ra ra-price-per-sq-ft",
"ra ra-days-on-realtor",
"ra ra-property-type",
"ra ra-year-built",
"ra ra-home-style",
"data-advertiserid-agent",
"data-advertiserid-office",
"data-advertiserid-broker",
"data-listingid",
"data-omtag",
"data-page",
"data-producttype",
"data-proptype",
"data-propertytype",
"data-propertyid",
"data-status",
"data-url",
"data-hot-buy",
"id",
"itemscope",
"data-promotion_status",
"itemtype",
"data-price",
"data-product_name",
"data-baths_full",
"data-baths_half",
"data-baths",
"data-lot_size",
"data-leadfm_ab_test",
"add-lead-form-modal",
"is_cobroke",
"is_showcase",
"is_suppress_map_pin",
"is_address_suppressed",
"is_foreclosure",
"is_advantage_pro",
"show_contact_a_lender",
"lead_type",
"form_type",
"broker",
"streetAddress",
"addressLocality",
"addressRegion",
"postalCode",
"latitude",
"longitude",
"addressProperty",
"urlImage",
"property-meta-beds",
"property-meta-baths",
"property-meta-sqft",
"property-meta-lotsize",
"property-meta-garage",
"typeListing",
"addressPropertyFull",
"data-agentid",
"property-meta-build",
"statusListing",
"statusListingDetail",
"branding-agent-name",
"branding-agent-phone",
"branding-office-name",
"branding-office-phone",
"property-meta-bath",
"property-baths-count",
"data-featured", "data-type", "data-display_tcpa_message",
"name", "description", "priceCurrency", "price", "category",
"address", "startDate", "endDate",
"data_source",
"is_senior",
"is_featured",
"is_mls",
"is_cozy",
"groupBeds"
),
nameActual = c(
'statusListing',
'pricePerSF',
'countDaysOnRealtor',
'typePropertyDetail',
'yearBuilt',
'styleHome',
"idAgent",
"idOffice",
"idBroker",
"idListing",
"remove_omtag",
"remove_page",
"remove_producttype",
"remove_proptype",
"typeProperty",
"idProperty",
"statusProperty",
"slugLDP",
"slugHotBuy",
"idRealtor",
"typePropertySchema",
"statusPromotion",
"remove_schema",
"priceListing",
"slugProduct",
"countBathsFull",
"countBathsHalf",
"remove_baths",
"areaLotSize",
"remove_ab_tst",
"remove_form_modal",
"isCoBroke",
"isShowcased",
"isSuppressdPin",
"isAddressSuppress",
"isForeclosure",
"isAdvantagPro",
"remove_show_contact_a_lender",
"slugLead",
"slugForm",
"nameBrokerage",
"addressProperty",
"cityProperty",
"stateProperty",
"zipcodeProperty",
"latitudeProperty",
"longitudeProperty",
"addressProperty",
"urlImage",
"countBeds",
"countBaths",
"areaPropertySF",
"sizeLotAcres",
"typeGarage" ,
"typeListing",
"addressPropertyFull",
"idAgent",
"statusPropertyBuild",
"statusListing",
"statusListingDetail",
"nameAgent",
"phoneAgent",
"nameBrokerage",
"phoneBrokerage",
"countBaths",
"countBaths",
"isFeatured", "typeProperty", "hasMessageTCPA",
"nameListing", "descriptionListing", "currencyListing", "priceListing", "categoryListing",
"addressListing", "datetimeListingStart", "datetimeListingEnd",
"sourceData",
"isSeniorHome",
"isFeatured",
"isMLSListing",
"isCozy",
"groupBeds"
)
)
}
dictionary_realtor_names <-
function() {
tibble(
nameRealtor = c(
"is_swapped",
"tax_amount",
"price_per_sqft",
"median_listing_price",
"median_rent_price",
"for_sale_count",
"new_listing_count",
"open_house_count",
"recently_sold_count",
"foreclosure_count",
"price_reduced_count",
"ldp_url",
"photo_url",
"address_display",
"beds",
"baths",
"sqft_display",
"price_display",
"listing_id",
"property_id",
"status",
"saved_resource_id",
'full_address_display',
'address',
'city',
'state_code',
'postal_code',
'lat',
'lon',
'agent_id',
'property_type',
'days_on_realtor',
'baths_half',
'baths_full',
'neighborhood_highlight',
'price_for_highlight',
'age_on_market_or_realtor',
'search_flags.is_price_reduced',
'search_flags.is_new_listing',
'search_flags.is_pending',
'search_flags.is_contingent',
'url',
'location',
'slug',
'validatedLocation',
'state_id',
"zip",
"year",
"month",
"active_listing_count",
"hotmarket_rank",
"hotmarket_index",
"hotmarket_temperature",
"asking_to_sold_price_percent_change",
"sales_to_inventory_count_percent",
"sales_to_inventory_count_percent_market_type",
"median.age_days",
"median.listing_price",
"median.rental_listing_price",
"median.listing_price_sqft",
"median.closing_price",
"listing_count.for_sale",
"listing_count.rental",
"listing_count.sold",
'numberProperty',
'ldpUrl',
'source_id',
'isSaved',
'mprId',
'officeName',
'product_type',
'sort',
'isCoBroke',
'isEnhanced',
'isForeclosure',
'isPriceReduced',
'isAddressSuppressed',
'bed',
'bath',
'lotSize',
'photo',
'photoCount',
'price',
'propertyType',
'sqft',
'county',
'state',
'is_price_reduced',
'is_pending',
'is_contingent',
'is_new_listing',
'is_showcase',
'has_tour',
'has_video',
'has_realtor_logo',
'is_cobroke',
'is_coshow_product',
'is_foreclosure',
'is_new_plan',
'price_excludes_land',
'has_deals',
'is_new',
'is_costar',
'is_aptlist',
'is_address_suppressed',
'is_suppress_map_pin',
'is_suppress_map',
'is_advantage',
'is_advantage_brand',
'is_advantage_pro',
'is_essentials',
'is_basic',
'is_basic_opt_in',
'is_basic_opt_out',
'has_matterport',
'agentId',
'is_short_sale',
"school",
"sold_date",
"days_on_market",
"form_type",
"lead_type",
"show_contact_a_lender",
"show_veterans_united",
"sold_price_to_listing_price_diff",
"sold_price_difference_summary.is_lower",
"sold_price_difference_summary.price_display",
"form_type", "lead_type", "show_contact_a_lender", "show_veterans_united",
"has_private_showing", "is_pre_approved", "is_office_standard",
"is_mortgage_feature", "should_suppress_ads", "show_porch_link",
"is_new_construction", "should_suppress_listing_agent", "is_fios_supported",
"is_fios_gige", "is_fios_sfu", "is_tcpa_message_enabled",
"agent_name", "broker_name", "broker_location", "broker_number",
"broker_link", "mls_id", "mls_name", "mls_copy_right", "mls_abbreviation",
"office_contact",
"plan_id",
"long",
"street",
"street_suffix",
"description",
"type",
"type_display",
"lot_size",
"lot_size_units",
"lot_size_units_display",
"permalink",
"beds_display",
"listed_date",
"listed_date_display",
"listing_age",
"price_per_sqft_display",
"year_built",
"geo_slug",
"photo_count",
"advertiser_id",
"neighborhood",
"virtual_tour_url",
"products",
"status_display",
"product_name",
"schools_info",
"active_status",
"property_status_display",
"office_advertiser_id",
"property_status",
"hoa_fee",
"mapUrl",
"nameStateProperty",
"garage", "agent_advertiser_id","broker_advertiser_id",
"stories", "list_date", "prop_type", "lot_size_display",
"matterport_url",
"street_direction",
"id", "rental_estimate",
"house_size",
"heating",
"cooling",
"date_updated",
"sold_times",
"sold_times_text",
"history_length",
"history_length_text",
"href",
"agent_photo",
"agent_profile_link",
"rental_estimate_moderate",
"value_per_bedroom_moderate",
"closing_price", "listing_price_sqft", "listing_price", "age_days",
"rental_listing_price",
"for_sale", "sold", "rental",
"data_source",
"is_senior",
"is_featured",
"is_mls",
"is_cozy",
"groupBeds",
"source",
"name",
"listing_status",
"toll_free_number",
"price_min",
"price_max",
"beds_min",
"beds_max",
"baths_min",
"baths_max",
"baths_display",
"sqft_min",
"sqft_max",
"community_id",
"save_aware",
"comment",
"deposit",
"deposit_display",
"availability_text",
"available_date",
"section",
"area",
"area_type",
"local_url",
"median_listing_price_display",
"median_sales_price_display",
"median_rent_price_display",
"median_sales_price",
"median_days_on_market",
"management_name",
"community_name",
"prop_status",
"lower", "txt", "yearData", "rental_estimate_display",
"fips_code", "property_insights", "real_tip",
"amountClosingPriceTownHouseCondo",
"amountListingPriceTownHouseCondo",
"amountClosingPriceSingleFamily",
"amountListingPriceSingleFamily",
"amountClosingPriceMultifamily",
"amountListingPriceMultifamily"
),
nameActual = c(
"isSwapped",
"amountTax",
"pricePerSF",
"priceListingMedian",
"priceRentMedian",
"countForSale",
"countNewListings",
"countOpenHouse",
"countSoldRecently",
"countForeclosures",
"countPriceReduced",
"slugLDP",
"urlPhoto",
"addressDisplay",
"countBeds",
"countBaths",
"areaPropertySFDisplay",
"remove_priceListingDisplay",
"idListing",
"idProperty",
"statusListing",
"idResouceSaved",
'addressDisplayFull',
'addressProperty',
'cityProperty',
'stateProperty',
'zipcodeProperty',
'latitudeProperty',
'longitudeProperty',
'idAgent',
'typeProperty',
'countDaysOnRealtor',
'countBathsHalf',
'countBathsFull',
'slugNeighborhoodHighlight',
'priceHighlight',
'remove',
'isPriceReduced',
'isListingNew',
'isPendingSale',
'isContingent',
'slugLDP',
'nameLocation',
'slugLocation',
'nameLocationValidated',
'slugState',
"zipcodeProperty",
"yearData",
"monthData",
"countActiveListings",
"rankMarketHot",
"indexMarketTemperature",
"temperatureMarket",
"pctSoldPriceToAsk",
"pctSalesToInventory",
"typeSalesToInventoryFavor",
"countDaysMarketMedian",
"priceListingMedian",
"priceRentMedian",
"pricePerSFMedian",
"priceClosingMedian",
"countForSale",
"countRental",
"countSold",
'numberProperty',
'slugLDP',
'slugSource',
'isSaved',
'idMPR',
'nameBrokerage',
'typeListing',
'numberSort',
'isCoBroke',
'isEnhanced',
'remove_foreclosure',
'isPriceReduced',
'remove_address_suppressed',
'countBeds',
'countBaths',
'areaLotSF',
'urlPhoto',
'countPhotos',
'priceListing',
'typeProperty',
'areaPropertySF',
'countyProperty',
'stateProperty',
'remove_reduced',
'isPending',
'isContingent',
'isListingNew',
'isShowcased',
'hasTour',
'hasVideo',
'hasRealtorLogo',
'remove_cobroke',
'isCoShow',
'isForeclosure',
'isPlanNew',
'isPriceExLand',
'hasDeals',
'isNew',
'isCostar',
'isAptList',
'isAddressSuppressed',
'isSuppressMapPin',
'isSuppressMap',
'isAdvantage',
'isAdvantageBrand',
'isAdvantagePro',
'isEssentials',
'isBasic',
'isBasicOptIn',
'isBasicOptOut',
'hasMatterPort',
'idAgent',
'isShortSale',
"nameSchool",
"dateSold",
"countDaysOnMarket",
"typeForm",
"typeLead",
"hasLenderContact",
"hasVeternsUnited",
"amountDifferenceSoldToListing",
"isSalePriceLowerThanListing",
"amountDifferenceSoldToListingDisplay",
"typeForm", "typeLead", "hasLenderContact", "hasUnitedVeterans",
"hasPrivateShowings", "isPreApproved", "isOfficeStandard",
"isMorgageFeatured", "hasAdsSuppressed", "hasPorchLink",
"isNewConstruction", "hasListingAgentSuppressed", "isFiosSupported",
"isFiosGIGE", "isFiosSFU", "hasMessageTCPA",
"nameAgent", "nameBrokerage", "locationBrokerage", "telephoneBrokerage",
"urlBrokerage", "idMLS", "nameMLS", "copyrightMLS", "slugMLS",
"telephoneOffice",
"idPlan",
"longitudeProperty",
"numberStreet",
"suffixStreet",
"descriptionListing",
"remove_typeListing",
"typeUnit",
"areaLotSF",
"typeUnitsLot",
"remove_lot_size_units_display",
"slugLDP",
"remove_beds_display",
"datetimeListed",
"remove_listed_date_display",
"countDaysListed",
"remove_price_per_sqft_display",
"yearBuilt",
"slugGEO",
"countPhotos",
"idAdvertister",
"nameNeighborhood",
"urlVirtualTour",
"remove_products",
"remove_status_display",
"nameProduce",
"descriptionSchool",
"statusActive",
"remove_property_status_display",
"idAdvertiserOffice",
"statusProperty",
"amountHOA",
"urlGoogleMap",
"nameStateProperty",
"hasGarage", "idAgentAdvertiser","idBrokerageAdvertiser",
"countStories", "datetimeListed", "typeProperty", "remove_lot_size_display",
"urlMatterport",
"directionStreet",
"idProperty", "amountRentEstimated",
"typeSizeHouse",
"typeHeating",
"typeCooling",
"dateUpdated",
"countSales",
"remove_sales",
"lengthHistory",
"remove_history",
"urlPhotoPrimary",
"urlAgentPhoto",
"slugAgentProfile" ,
"rentEstimatedMonthlyModerate",
"rentPerBedroomMonthlyModerate",
"priceClosing", "priceListingPSF", "priceListing",
"countDaysListing",
"rentListing",
"countSaleListings",
"countSoldListings",
"countRentalListings",
"sourceData",
"isSeniorHome",
"isFeatured",
"isMLSListing",
"isCozy",
"groupBeds",
"sourceListng",
"nameListing",
"statusListing",
"telephoneListing",
"priceListingMin",
"priceListingMax",
"countBedsMin",
"countBedsMax",
"countBathsMin",
"countBathsMax",
"bathsDisplay",
"areaPropertySFMin",
"areaPropertySFMax",
"idCommunity",
"isSaveAware",
"commentListing",
"depositDescription",
"depositDescriptionDisplay",
"descriptionAvailability",
"dateAvailable",
"sectionSite",
"nameArea",
"typeArea",
"slugRealtorURL",
"remove_median_listing_price_display",
"remove_median_sales_price_display",
"priceRentMedian",
"priceSalesMedian",
"countDaysOnMarketMedian",
"nameManagementCompany",
"nameCommunity",
"statusProperty",
"isLowerEstimate", "displayValueEstimate", "yearData", "displayRentalEstimate",
"idFIPS", "descriptionPropertyInsights", "descriptionRealTip",
"amountClosingPriceTownHouseCondo",
"amountListingPriceTownHouseCondo",
"amountClosingPriceSingleFamily",
"amountListingPriceSingleFamily",
"amountClosingPriceMultifamily",
"amountListingPriceMultifamily"
)
)
}
# rates -------------------------------------------------------------------
# https://www.realtor.com/mrtg_handler/get_trends_data
#' Mortgage Rates
#'
#' Returns a variety of
#' interest rates for various
#' mortgage types.
#'
#' @param return_wide if \code{TRUE} widens data and removes duration and benchmark variables
#'
#' @return a \code{tibble}
#' @export
#' @family interest rates
#' @examples
#' mortgage_rates(return_wide = F)
mortgage_rates <-
function(return_wide = F) {
data <-
"https://www.realtor.com/mrtg_handler/get_trends_data" %>%
fromJSON(flatten = T, simplifyDataFrame = T) %>%
.$rate_trends %>%
dplyr::as_tibble()
df_types <-
tibble(
typeRate = c(
'pct30YearFixed',
'pct15YearFixed',
'pct5OneARM',
'pct20YearFixed',
'pct30YearFixedFHA',
'pct30YearFixedVA',
'pct10YearFixed',
'pct7OneARM'
),
durationLoanMonths = c(360,
15 * 12,
5 * 12,
20 * 12,
360,
360,
120,
84),
typeBenchmark = c(
'fixed',
'fixed',
'floating',
'fixed',
'fixed',
'fixed',
'fixed',
'floating'
)
)
data <-
data %>%
set_names(
c(
"year",
"month",
"date",
'pct30YearFixed',
'pct15YearFixed',
'pct5OneARM',
'pct20YearFixed',
'pct30YearFixedFHA',
'pct30YearFixedVA',
'pct10YearFixed',
'pct7OneARM'
)
) %>%
unite(dateData, year, month, date, sep = "-") %>%
mutate(dateData = dateData %>% lubridate::ymd()) %>%
gather(typeRate, value, -dateData) %>%
mutate(value = value / 100) %>%
arrange(dateData)
data <-
data %>%
left_join(df_types, by = "typeRate") %>%
select(dateData, typeRate, durationLoanMonths, typeBenchmark, value)
if (return_wide) {
data <-
data %>%
select(-one_of(c(
"durationLoanMonths", "typeBenchmark"
))) %>%
spread(typeRate, value)
}
data
}
# market data -------------------------------------------------------------
# https://www.realtor.com/median_prices?city=Bethesda&state_code=MD
parse_location <-
function(location_name = c("Bethesda, MD")) {
if (location_name %>% str_detect("\\,")) {
is_city_state <-
location_name %>% str_count("\\,") == 1
if (is_city_state) {
location_slugs <-
location_name %>% str_split("\\,") %>% flatten_chr() %>% str_trim()
data <-
tibble(
locationSearch = location_name,
citySearch = location_slugs[[1]],
stateSearch = location_slugs[[2]]
)
return(data)
} else {
location_slugs <-
location_name %>% str_split("\\,") %>% flatten_chr() %>% str_trim()
data <-
tibble(
locationSearch = location_name,
streetOrNeighborhoodSearch = location_slugs[[1]],
citySearch = location_slugs[[2]],
stateSearch = location_slugs[[3]]
)
return(data)
}
}
tibble(locationSearch = location_name, zipcodeSearch = location_name)
}
.generate_market_url <-
function(location_name = c("Bethesda, MD")) {
df_loc_slug <-
location_name %>%
as.character() %>%
str_trim() %>%
parse_location()
city <-
df_loc_slug$citySearch
city_slug <-
city %>% URLencode()
state <- df_loc_slug$stateSearch
if (state %>% nchar() > 4) {
stop("Has to be the 2 digit state slug buddy")
}
url <-
glue('https://www.realtor.com/median_prices?city={city_slug}&state_code={state}') %>%
as.character()
data <-
df_loc_slug %>%
mutate(urlAPI = url) %>%
select(locationSearch, everything())
data
}
.generate_market_urls <-
function(locations = c("Bethesda, MD")) {
.generate_market_url_safe <-
possibly(.generate_market_url, tibble())
locations %>%
map_dfr(function(location_name) {
.generate_market_url_safe(location_name = location_name)
})
}
.parse_market_data_url <-
function(url = "https://www.realtor.com/median_prices?city=Bethesda&state_code=MD") {
data <-
url %>%
.curl_json() %>%
fromJSON(
simplifyVector = T,
simplifyDataFrame = T,
flatten = T
)
data <-
data$data %>%
flatten_df() %>%
dplyr::select(-dplyr::matches("_display"))
df_names <- dictionary_realtor_names()
actual_names <-
names(data) %>%
map_chr(function(name) {
df_row <- df_names %>% filter(nameRealtor == name)
if (df_row %>% nrow() == 0) {
glue("Missing {name}") %>%
cat(fill = T)
return(name)
}
df_row %>% pull(nameActual)
})
data <-
data %>%
set_names(actual_names) %>%
mutate(urlAPI = url)
if (data %>% has_name("pricePerSF")) {
data <-
data %>%
dplyr::rename(pricePerSFMedian = pricePerSF)
}
data
}
.parse_market_data_urls <-
function(urls = "https://www.realtor.com/median_prices?city=Bethesda&state_code=MD",
return_message = TRUE) {
.parse_market_data_url_safe <-
possibly(.parse_market_data_url, tibble())
urls %>%
map_dfr(function(url) {
if (return_message) {
glue("Parsing {url %>% str_replace_all('https://www.realtor.com/', '')}") %>%
cat(fill = T)
}
.parse_market_data_url_safe(url = url)
})
}
#' Median market statistics
#'
#' Returns summary median market information for the specified
#' locations.
#'
#' The location names must be a city and/or neighborhood bounded
#' by commas.
#'
#' @param locations vector of location , location name must contain
#' a city name and a comma ie "Brooklyn, NY"
#' @param return_message if \code{TRUE} returns a message
#' @param ... extra parameters
#'
#' @return a \code{tibble}
#' @export
#' @family market information
#' @examples
#' median_prices(locations = c("Greenwich, CT", "New London, CT", "Woodside, CA", "Park Slope, Brooklyn, NY"))
median_prices <-
function(locations = NULL,
return_message = TRUE,
...) {
if (locations %>% is_null()) {
stop("Please enter location names!!")
}
.generate_market_urls_safe <-
possibly(.generate_market_urls, tibble())
df_urls <-
.generate_market_urls_safe(locations = locations)
if (df_urls %>% nrow() == 0) {
"No results" %>% cat(fill = T)
return(invisible())
}
.parse_market_data_urls_safe <-
possibly(.parse_market_data_urls, tibble())
all_data <-
.parse_market_data_urls_safe(urls = df_urls$urlAPI, return_message = return_message)
if (all_data %>% nrow() == 0) {
"No results" %>% cat(fill = T)
return(invisible())
}
if (all_data %>% has_name("priceRentMedian")) {
all_data <-
all_data %>%
mutate(pctRentYield = (priceRentMedian * 12) / priceListingMedian)
}
if (all_data %>% has_name("priceListingMedian")) {
all_data <-
all_data %>%
mutate(
areaPropertySFMedian = (priceListingMedian / pricePerSFMedian) %>% round(digits = 2)
)
}
all_data <-
all_data %>%
left_join(df_urls, by = "urlAPI") %>%
select(one_of(
c(
'locationSearch',
"streetOrNeighborhoodSearch",
'citySearch',
'stateSearch',
'pricePerSFMedian',
'priceRentMedian',
'pctRentYield',
'areaPropertySFMedian'
)
),
everything()) %>%
suppressWarnings() %>%
remove_na() %>%
remove_columns()
all_data
}
# validate -----------------------------------------------------------------
.parse_validation_url <-
function(url = "https://www.realtor.com/validate_geo?location=Easton%2C+MD&retain_secondary_facets=true&include_zip=false&search_controller=Search%3A%3APropertiesController") {
data <-
url %>%
.curl_json() %>%
fromJSON(flatten = T, simplifyDataFrame = T) %>%
flatten_df() %>%
as_tibble()
data <-
data %>%
dplyr::select(-one_of("postal_code")) %>%
suppressWarnings()
df_names <- dictionary_realtor_names()
actual_names <-
names(data) %>%
map_chr(function(name) {
df_row <- df_names %>% filter(nameRealtor == name)
if (df_row %>% nrow() == 0) {
glue("Missing {name}") %>%
cat(fill = T)
return(name)
}
df_row %>% pull(nameActual)
})
data <-
data %>%
set_names(actual_names) %>%
mutate(slugLDP = slugLDP %>% substr(2, nchar(slugLDP))) %>%
mutate(urlAPI = url)
data %>%
.munge_realtor()
}
.parse_validation_urls <-
function(urls = "https://www.realtor.com/validate_geo?location=Easton%2C+MD&retain_secondary_facets=true&include_zip=false&search_controller=Search%3A%3APropertiesController",
return_message = TRUE) {
.parse_validation_url_safe <-
possibly(.parse_validation_url, tibble())
urls %>%
map_dfr(function(url) {
if (return_message) {
glue("Parsing {url %>% str_replace_all('https://www.realtor.com/', '')}") %>%
cat(fill = T)
}
.parse_validation_url_safe(url = url)
})
}
.generate_market_validation_url <-
function(location_name = c("Bethesda, MD")) {
location_name <- as.character(location_name)
has_comma <- location_name %>% str_detect("\\,")
if (has_comma) {
df_loc_slug <-
location_name %>%
str_trim() %>%
parse_location()
if (df_loc_slug %>% has_name("streetOrNeighborhoodSearch")) {
street_slug <-
df_loc_slug$streetOrNeighborhoodSearch %>%
str_replace_all("\\ ", "%20")
street_slug <- street_slug %>% str_c("_")
} else {
street_slug <- ''
}
city <-
df_loc_slug$citySearch
city_slug <-
city %>% URLencode()
state <-
df_loc_slug$stateSearch
if (state %>% nchar() > 4) {
stop("Has to be the 2 digit state slug buddy")
}
location_slug <-
glue("{street_slug}{city_slug}_{state}")
url <-
glue(
"https://www.realtor.com/validate_geo?location={location_slug}&retain_secondary_facets=true&include_zip=false&search_controller=Search%3A%3APropertiesController"
) %>%
as.character()
data <-
df_loc_slug %>%
mutate(locationSearch = location_name) %>%
mutate(urlAPI = url) %>%
select(locationSearch, everything())
} else {
data <- parse_location(location_name = location_name)
url <-
glue(
"https://www.realtor.com/validate_geo?location={location_name}&retain_secondary_facets=true&include_zip=false&search_controller=Search%3A%3APropertiesController"
) %>%
as.character()
data <-
data %>%
mutate(urlAPI = url)
}
data
}
.generate_market_validation_urls <-
function(locations = c("Bethesda, MD", 20852)) {
locations %>%
map_dfr(function(location_name) {
.generate_market_validation_url(location_name = location_name)
}) %>%
select(one_of(
c(
"locationSearch",
"streetOrNeighborhoodSearch",
"citySearch",
"stateSearch",
"zipcodeSearch",
"urlAPI"
)
),
everything()) %>%
suppressWarnings()
}
validate_locations <-
function(locations = NULL,
return_message = TRUE,
...) {
if (locations %>% is_null()) {
stop("Please enter location names!!")
}
.generate_market_validation_urls_safe <-
possibly(.generate_market_validation_urls, tibble())
df_urls <-
.generate_market_validation_urls_safe(locations = locations)
if (df_urls %>% nrow() == 0) {
"No results" %>% cat(fill = T)
return(invisible())
}
.parse_validation_urls_safe <-
possibly(.parse_validation_urls, tibble())
all_data <-
.parse_validation_urls_safe(urls = df_urls$urlAPI, return_message = return_message)
if (all_data %>% nrow() == 0) {
"No results" %>% cat(fill = T)
return(invisible())
}
all_data <-
all_data %>%
left_join(
df_urls %>% select(one_of(c(
"locationSearch", "urlAPI"
))), by = "urlAPI") %>%
select(one_of(
c(
"locationSearch",
"nameCity",
"stateSearch",
"zipcodeSearch"
)
),
everything()) %>%
suppressMessages() %>%
suppressWarnings() %>%
remove_na()
all_data
}
# market_vitality ---------------------------------------------------------
.parse_market_vitality_url <-
function(url = "https://www.realtor.com/home_page/vitality?location=Bethesda%2C+MD") {
json_data <-
url %>%
.curl_json() %>%
fromJSON(flatten = T, simplifyDataFrame = T)
df_names <- dictionary_realtor_names()
data <- json_data$data
data_listings <-
data$new_listings %>%
data.frame(stringsAsFactors = F) %>%
as_tibble()
actual_names <-
names(data_listings) %>%
map_chr(function(name) {
df_row <- df_names %>% filter(nameRealtor == name)
if (df_row %>% nrow() == 0) {
glue("Missing {name}") %>%
cat(fill = T)
return(name)
}
df_row %>% pull(nameActual)
})
data_listings <-
data_listings %>%
set_names(actual_names)
if (!data_listings %>% has_name("areaPropertySF") &&
data_listings %>% has_name("areaPropertySFDisplay")) {
data_listings <-
data_listings %>%
rename(areaPropertySF = areaPropertySFDisplay) %>%
mutate(areaPropertySF = parse_number(areaPropertySF))
}
data_listings <-
data_listings %>%
mutate(
priceListing = priceListing %>% as.character() %>% parse_number(),
areaPropertySF = areaPropertySF %>% as.character() %>% parse_number(),
urlListing = glue("https://www.realtor.com/{slugLDP}") %>% as.character()
) %>%
select(idListing, addressDisplay, statusListing,
everything()) %>%
remove_na() %>%
select(-one_of("slugLDP")) %>%
suppressWarnings() %>% suppressMessages()
data <-
data[c(
"for_sale_count",
"new_listing_count",
"open_house_count",
"recently_sold_count",
"foreclosure_count",
"price_reduced_count"
)] %>%
as_tibble()
actual_names <-
names(data) %>%
map_chr(function(name) {
df_row <- df_names %>% filter(nameRealtor == name)
if (df_row %>% nrow() == 0) {
glue("Missing {name}") %>%
cat(fill = T)
return(name)
}
df_row %>% pull(nameActual)
})
data <-
data %>%
set_names(actual_names) %>%
mutate(urlAPI = url,
dataListingsRecent = list(data_listings))
data
}
.parse_market_vitality_urls <-
function(urls = "https://www.realtor.com/home_page/vitality?location=Bethesda%2C+MD",
return_message = T) {
.parse_market_vitality_url_safe <-
possibly(.parse_market_vitality_url, tibble())
urls %>%
map_dfr(function(url) {
if (return_message) {
glue("Parsing {url %>% str_replace_all('https://www.realtor.com/', '')}") %>%
message()
}
.parse_market_vitality_url_safe(url = url)
})
}
.generate_market_vitality_url <-
function(location_name = c("Bethesda, MD")) {
if (location_name %>% str_detect("\\,")) {
df_loc_slug <-
location_name %>%
str_trim() %>%
parse_location()
city <-
df_loc_slug$citySearch
location_slug <-
location_name %>%
str_replace_all("\\,", "") %>%
URLencode()
state <- df_loc_slug$stateSearch
if (state %>% nchar() > 4) {
stop("Has to be the 2 digit state slug buddy")
}
url <-
glue('https://www.realtor.com/home_page/vitality?location={location_slug}') %>%
as.character()
data <-
df_loc_slug %>%
mutate(locationSearch = location_name) %>%
mutate(urlAPI = url) %>%
select(locationSearch, everything())
data <-
df_loc_slug %>%
mutate(urlAPI = url) %>%
select(locationSearch, everything())
data
} else {
data <- parse_location(location_name = location_name)
url <-
glue('https://www.realtor.com/home_page/vitality?location={location_name}') %>%
as.character()
data <-
data %>%
mutate(urlAPI = url)
}
data
}
.generate_market_vitality_urls <-
function(locations = c("Bethesda, MD")) {
.generate_market_vitality_url_safe <-
possibly(.generate_market_vitality_url, tibble())
locations %>%
map_dfr(function(location_name) {
.generate_market_vitality_url_safe(location_name = location_name)
})
}
#' Market vitality
#'
#' Acquires meta level information
#' for specified locations as of the
#' most recent time period
#'
#' @param locations vector of location names, location name must contain
#' a city name and a comma ie "Brooklyn, NY" or a zipcode
#'
#' @param return_message if \code{TRUE} returns a message
#' @param ... extra parameters
#'
#' @return a \code{tibble}
#' @export
#'
#' @examples
#' vitality(locations = c("La Jolla, CA", "Manhattan, NY", "Bethany, DE", "10016"))
vitality <-
function(locations = NULL,
return_message = TRUE,
...) {
if (locations %>% is_null()) {
stop("Please enter location names!!")
}
.generate_market_vitality_urls_safef <-
possibly(.generate_market_vitality_urls, tibble())
df_urls <-
.generate_market_vitality_urls_safef(locations = locations)
if (df_urls %>% nrow() == 0) {
"No results" %>% cat(fill = T)
return(invisible())
}
.parse_market_vitality_urls_safe <-
possibly(.parse_market_vitality_urls, tibble())
all_data <-
.parse_market_vitality_urls_safe(urls = df_urls$urlAPI, return_message = return_message)
if (all_data %>% nrow() == 0) {
"No results" %>% cat(fill = T)
return(invisible())
}
all_data <-
all_data %>%
left_join(df_urls %>% select(one_of(c(
"locationSearch", "urlAPI"
))),
by = "urlAPI"
) %>%
mutate(dateData = Sys.Date()) %>%
select(one_of(
c(
"dateData",
"locationSearch",
"nameCity",
"stateSearch",
"zipcodeSearch"
)
), everything()) %>%
suppressMessages() %>%
suppressWarnings() %>%
remove_na() %>%
remove_columns()
all_data
}
# api ---------------------------------------------------------------------
## https://www.realtor.com/browse_modules/homes_near_street?price=239000&postal_code=20816&median_price=1132000&type=homes_near_street&vis_id=6b0d44ae-f4d6-41f0-8feb-e6491ab43fe9&mcm_id=03714656198478469204855792545062287725&address=5301+Westbard+Cir+Apt+323&user_id=&city=Bethesda&coordinates=38.964595%2C-77.109017
.generate_near_url <-
function(latitude = 39.03665,
longitude = -77.11804) {
base_url <-
'https://www.realtor.com/browse_modules/homes_near_street?'
geo_slug <-
generate_coordinate_slug(latitude = latitude, longitude = longitude)
url <-
glue("{base_url}&coordinates={geo_slug}") %>% URLencode()
url
}
generate_coordinate_slug <-
function(latitude = 38.964419 ,
longitude = -77.113659) {
glue("{latitude},{longitude}") %>%
as.character()
}
.parse_realtor_api_near_url <-
function(url = "https://www.realtor.com/browse_modules/homes_near_street?postal_code=20816vis_id=6b0d44ae-f4d6-41f0-8feb-e6491ab43fe9&mcm_id=03714656198478469204855792545062287725&city=New+York&coordinates=40.74516%2C-73.97852") {
data <-
url %>%
.curl_json() %>%
fromJSON(
simplifyVector = T,
simplifyDataFrame = T,
flatten = T
)
df_properties <-
data$properties %>%
as_tibble()
df_names <-
dictionary_realtor_names()
actual_names <-
names(df_properties) %>%
map_chr(function(name) {
df_row <- df_names %>% filter(nameRealtor == name)
if (df_row %>% nrow() == 0) {
glue("Missing {name}") %>%
cat(fill = T)
return(name)
}
df_row %>% pull(nameActual)
})
df_properties <-
df_properties %>%
set_names(actual_names) %>%
dplyr::select(-dplyr::matches("remove")) %>%
suppressMessages()
df_properties <-
df_properties %>%
.munge_realtor() %>%
mutate(urlAPI = url)
df_properties
}
property_near <-
function(location = "5836 Mossrock Drive, North Bethesda, MD",
return_message = T) {
df_loc <-
geocode(locations = location,
return_message = return_message) %>%
filter(!is.na(latitudeLocation)) %>%
dplyr::slice(1) %>%
remove_na()
url <-
.generate_near_url(latitude = df_loc$latitudeLocation,
longitude = df_loc$longitudeLocation)
df_loc <-
df_loc %>%
mutate(urlAPI = url)
data <-
.parse_realtor_api_near_url(url = url)
data <-
data %>%
left_join(df_loc, by = "urlAPI") %>%
select(one_of(names(df_loc)), everything()) %>%
select(-urlAPI) %>%
suppressMessages() %>%
.munge_realtor()
data$urlListing <-
data$urlListing %>% str_replace_all("https://www.realtor.com//", "https://www.realtor.com/")
data
}
#' Properties near a location
#'
#' This function returns 50
#' properties near a specified location.
#'
#' The location can be an exact address, zipcode
#' city or a neighborhood.
#'
#' @param locations a vector of locations
#' @param return_message if \code{TRUE} returns a message
#'
#' @return
#' @export
#'
#' @examples
#' locations <- c("2449 Tracy Place, NW, Washington DC", "Sunset Island, Miami Beach, FL" )
#' properties_near(locations = locations)
properties_near <-
function(locations = NULL,
return_message = TRUE) {
if (locations %>% is_null()) {
stop("Enter locations")
}
property_near_safe <-
possibly(property_near, tibble())
all_data <-
locations %>%
map_dfr(function(location){
property_near_safe(location = location,
return_message = return_message)
})
all_data
}
# seach_results -----------------------------------------------------------
.calculate_pages <- function(page) {
page %>%
html_nodes('.pagination') %>%
html_text() %>%
str_split("\n") %>%
flatten_chr() %>%
str_trim() %>%
as.character() %>%
parse_number() %>%
max(na.rm = T) %>%
suppressWarnings() %>% suppressMessages()
}
.generate_search_urls <-
function(url = "https://www.realtor.com/realestateandhomes-search/Bethesda_MD",
radius = NULL) {
page <- url %>% read_html()
pages <- page %>% .calculate_pages()
1:pages %>%
map_dfr(function(x) {
if (!radius %>% is_null()) {
radius_slug <-
glue('/radius-{radius}') %>% as.character()
hasRadius <- T
} else {
radius_slug <- ''
hasRadius <- F
}
tibble(
idPage = x,
hasRadius,
urlListingPage = glue("{url}/pg-{x}{radius_slug}") %>% as.character()
)
})
}
.parse_search_page <-
function(url = "https://www.realtor.com/realestateandhomes-search/Bethesda_MD/pg-3") {
page <-
.curl_page(url = url)
result_nodes <-
page %>%
html_nodes('.js-save-alert-wrapper')
parse_css_name_safe <-
possibly(parse_css_name, tibble())
all_data <-
seq_along(result_nodes) %>%
map_dfr(function(x) {
page_node <-
result_nodes[[x]]
df <-
list(
parse_css_name_safe(
page = page_node,
css = ".data-price-display",
actual_name = "priceDisplay",
is_numeric = F
),
parse_css_name_safe(
page = page_node,
css = ".label-green",
actual_name = "typeListing"
),
parse_css_name_safe(
page = page_node,
css = ".srp-property-type",
actual_name = "typeProperty"
) %>% mutate(value = value %>% str_replace_all("for Sale", "") %>% str_trim()),
#
parse_css_name_safe(
page = page_node,
css = ".font-bold",
actual_name = "nameBrokerage"
),
parse_css_name_safe(
page = page_node,
css = ".meta-beds",
actual_name = "countBeds",
is_numeric = F
),
parse_css_name_safe(
page = page_node,
css = "li:nth-child(2) .data-value",
actual_name = "countBaths",
is_numeric = F
),
parse_css_name_safe(
page = page_node,
css = "li:nth-child(3) .data-value",
actual_name = "areaPropertySF",
is_numeric = F
),
parse_css_name_safe(
page = page_node,
css = "li:nth-child(3) .data-value",
actual_name = "areaLot",
is_numeric = F
)
) %>%
reduce(bind_rows) %>%
mutate(value = value %>% str_to_title()) %>%
spread(nameActual, value)
property_address <-
page_node %>% html_nodes('.js-srp-listing-photos') %>% html_attr("alt") %>%
.[[1]]
url_image <-
page_node %>% html_nodes('.js-srp-listing-photos') %>% html_attr('src') %>% .[[1]]
url_listing <-
page_node %>%
html_nodes('.srp-item-photo a') %>%
html_attr('href') %>%
str_c("https://www.realtor.com", .)
df <-
df %>%
.munge_realtor() %>%
suppressMessages() %>%
suppressWarnings()
df %>%
mutate(
numberListingPage = x,
addressProperty = property_address,
urlListing = url_listing,
urlImage = url_image
) %>%
select(numberListingPage, addressProperty, everything())
}) %>%
mutate(urlListingPage = url)
parse_address_safe <-
possibly(parse_address, tibble())
df_address <-
all_data$addressProperty %>%
map_dfr(function(address) {
parse_address_safe(address = address)
})
all_data <-
all_data %>%
left_join(df_address) %>%
select(numberListingPage,
one_of(names(df_address)),
everything()) %>%
suppressMessages()
all_data
}
.parse_search_pages <-
function(urls = "https://www.realtor.com/realestateandhomes-search/Bethesda_MD/pg-10",
return_message = TRUE) {
.parse_search_page_safe <-
possibly(.parse_search_page, tibble())
urls %>%
map_dfr(function(url) {
if (return_message) {
glue("Parsing {url %>% str_replace_all('https://www.realtor.com/', '')}") %>%
message()
}
.parse_search_page_safe(url = url)
})
}
location_listings <-
function(location_name = c("Bethesda, MD"),
include_features = F,
radius = NULL,
parse_property_details = F,
return_message = T) {
df_val <- validate_locations(locations = location_name)
df_urls <-
.generate_search_urls(url = df_val$urlListing, radius = radius)
df_urls$urlListingPage[[1]] <-
df_urls$urlListingPage[[1]] %>% str_replace_all("\\/pg-1", "")
all_data <-
.parse_search_pages(urls = df_urls$urlListingPage)
all_data <-
all_data %>%
left_join(df_urls) %>%
select(-one_of("hasRadius")) %>%
select(idPage, everything()) %>%
suppressMessages() %>%
arrange(idPage)
if (parse_property_details) {
"Parsing indvidual property listings" %>% cat(fill = T)
parse_listing_urls_safe <-
possibly(parse_listing_urls, tibble())
urls <-
all_data$urlListing
all_listing <-
parse_listing_urls_safe(
urls = urls,
include_features = include_features,
return_message = return_message
)
all_data <-
all_data %>%
left_join(all_listing) %>%
suppressMessages()
}
all_data %>%
.munge_realtor() %>%
remove_columns()
}
#' Property listing table
#'
#' Returns listing table. Slower and not advised
#' unless \link{listings()} does not work for your search.
#'
#' @param locations vector of table listings
#' @param include_features if \code{TRUE} includes property features from pages
#' @param radius if not \code{NULL} additional search radius
#' @param parse_property_details if \code{TRUE}
#' @param return_message if \code{TRUE} returns messages
#' @param ... extra parameters
#'
#' @return a \code{tibble}
#' @export
#'
#' @examples
#' table_listings(locations = "Marietta, GA")
table_listings <-
function(locations,
include_features = F,
radius = NULL,
parse_property_details = F,
return_message = T,
...) {
location_listings_safe <-
possibly(location_listings, tibble())
all_data <-
locations %>%
map_dfr(function(location) {
location_listings_safe(
location_name = location,
include_features = include_features,
radius = radius,
parse_property_details = parse_property_details,
return_message = return_message,
...
)
})
all_data
}
# listings ----------------------------------------------------------------
.parse_listing_url <-
function(url = "https://www.realtor.com/realestateandhomes-detail/5301-Westbard-Cir-Apt-323_Bethesda_MD_20816_M63437-59115",
include_features = T,
sleep_time = NULL) {
options(warn = 0)
page <-
read_html(x = url)
# .curl_page(url = url)
fact_nodes <-
page %>%
html_nodes(".ldp-key-fact-item")
data <-
seq_along(fact_nodes) %>%
map_dfr(function(x) {
fact_node <-
fact_nodes[[x]]
id <-
fact_node %>% html_node('i') %>%
html_attr('class')
value <-
fact_node %>%
html_node('.ellipsis') %>%
html_attr('title')
if (value %>% is.na()) {
div_text <-
fact_node %>% html_nodes("div") %>% html_text()
if (x %in% c(3, 5)) {
value <-
parse_number(as.character(div_text[[2]])) %>%
suppressWarnings()
}
}
tibble(numberItem = x,
id,
value = value %>% as.character())
})
data <-
data %>%
left_join(dictionary_css_page(), by = "id") %>%
select(-c(numberItem, id)) %>%
spread(nameActual, value) %>%
.munge_realtor()
photo_urls <-
page %>% html_nodes('#ldpPhotoshero img') %>% html_attr('data-src')
if (photo_urls %>% length() > 0) {
photo_urls <- photo_urls[!photo_urls %>% is.na()]
photo_urls <-
photo_urls[!photo_urls %>% str_detect("w60_h60")]
if (photo_urls %>% length() > 0) {
data <-
data %>%
mutate(
urlImage = photo_urls %>% sample(1),
dataPhotos = list(tibble(urlPhotoProperty = photo_urls)),
countPhotos = dataPhotos %>% map_dbl(nrow)
)
}
}
address_nodes <-
page %>%
html_nodes('#ldp-address span')
if (address_nodes %>% length() > 0) {
df_address <-
seq_along(address_nodes) %>%
map_dfr(function(x) {
attributes <-
address_nodes[[x]] %>% html_attrs()
value <-
address_nodes[x] %>% html_text() %>% as.character() %>% str_trim()
df <-
tibble(item = attributes %>% names(),
id = attributes %>% as.character(),
value) %>%
mutate(idRow = x)
df
}) %>%
filter(!item == 'class') %>%
filter(!value == '') %>%
left_join(dictionary_css_page(), by = "id") %>%
mutate(value = value %>% str_replace_all("\\,", "")) %>%
select(nameActual, value) %>%
spread(nameActual, value)
data <-
data %>%
mutate(id = 1) %>%
left_join(df_address %>% mutate(id = 1), by = "id") %>%
select(-id)
}
other_agent <- page %>% html_nodes('#ldp-branding .font-bold')
if (other_agent %>% length() > 0) {
df_broker_1 <-
seq_along(other_agent) %>%
map_dfr(function(x) {
attributes <-
other_agent[[x]] %>% html_attrs()
value <-
other_agent[x] %>% html_text() %>% as.character()
df <-
tibble(item = attributes %>% names(),
id = attributes %>% as.character(),
value) %>%
mutate(idRow = x) %>%
filter(item == 'data-label')
df
}) %>%
left_join(dictionary_css_page()) %>%
select(nameActual, value) %>%
distinct() %>%
spread(nameActual, value) %>%
.munge_realtor() %>%
suppressMessages()
data <-
data %>%
mutate(id = 1) %>%
left_join(df_broker_1 %>% mutate(id = 1)) %>%
select(-id) %>%
suppressMessages()
}
listing <-
page %>% html_nodes('#ldp-pricewrap span') %>% html_text()
if (listing %>% length() > 0) {
listing <- listing %>% as.character() %>% parse_number() %>%
suppressWarnings() %>% suppressMessages()
listing <- listing[!listing %>% is.na()]
data <-
data %>%
mutate(priceListing = listing[[1]])
}
hood_detail <-
page %>% html_nodes("#ldp-neighborhood-section .padding-bottom")
if (hood_detail %>% length() > 0) {
hood_details <-
hood_detail %>%
html_nodes('a') %>% html_text() %>% str_trim()
urls <-
hood_detail %>%
html_nodes('a') %>% html_attr('href') %>%
str_c("https://www.realtor.com/", .)
hood_stats <-
page %>%
html_nodes('.neighborhood-local-item p') %>%
html_text() %>%
as.character() %>%
parse_number() %>%
suppressMessages() %>%
suppressWarnings()
if (length(hood_details) >= 2 && length(hood_stats) == 4) {
df_hood <-
tibble(
item = c(
"priceListingMedianNeighborhood",
"priceSaleMedianNeighborHood",
"countDaysOnMarketMedianNeighborhood",
"pricePerSFSaleMedianNeighborhood"
),
value = hood_stats
) %>%
spread(item, value) %>%
mutate(
nameNeighborhood = hood_details[[1]],
nameCity = hood_details[[2]],
urlNeighborhood = urls[[1]],
urlCity = urls[[2]]
) %>%
select(nameCity, nameNeighborhood, everything())
data <-
data %>%
mutate(dataNeighborhood = list(df_hood))
}
if (length(hood_details) >= 2 && length(hood_stats) == 2) {
df_hood <-
tibble(
item = c(
"priceListingMedianNeighborhood",
"pricePerSFSaleMedianNeighborhood"
),
value = hood_stats
) %>%
spread(item, value) %>%
mutate(
nameNeighborhood = hood_details[[1]],
nameCity = hood_details[[2]],
urlNeighborhood = urls[[1]],
urlCity = urls[[2]]
) %>%
select(nameCity, nameNeighborhood, everything())
data <-
data %>%
mutate(dataNeighborhood = list(df_hood))
}
}
hood <- page %>% html_nodes('#local-name') %>% html_text()
if (hood %>% length() > 0) {
data <-
data %>%
mutate(nameNeighborhoodProperty = hood)
}
descriptionText <-
page %>% html_nodes("#ldp-detail-romance") %>% html_text()
feature_nodes <-
page %>% html_nodes('.title-subsection-sm+ .row li')
if (descriptionText %>% length() > 0) {
data <-
data %>%
mutate(descriptionText = descriptionText %>% str_to_title())
}
broker_name_contact <-
page %>% html_nodes('.ellipsis.link-secondary span')
if (broker_name_contact %>% length() > 0) {
values <-
broker_name_contact %>% html_text() %>% str_trim()
values <- values[!values %>% str_to_lower() == "by"]
df_agent <-
seq_along(broker_name_contact) %>%
map_dfr(function(x) {
attributes <- broker_name_contact[[x]] %>% html_attrs()
df <-
tibble(item = attributes %>% names(),
id = attributes %>% as.character()) %>%
mutate(idRow = x)
if (nrow(df) >= 2) {
df <-
df %>%
filter(item == "data-label")
return(df)
}
df %>%
filter(item == "class")
}) %>%
mutate(value = values) %>%
left_join(dictionary_css_page()) %>%
select(nameActual, value) %>%
filter(!is.na(nameActual)) %>%
spread(nameActual, value) %>%
suppressMessages()
data <-
data %>%
mutate(id = 1) %>%
left_join(df_agent %>% mutate(id = 1)) %>%
select(-id) %>%
suppressMessages()
}
brokerage_name_contact <-
page %>% html_nodes('.link-secondary .ellipsis span')
if (brokerage_name_contact %>% length() > 0) {
values <-
brokerage_name_contact %>% html_text() %>% str_trim()
df_broker <-
seq_along(brokerage_name_contact) %>%
map_dfr(function(x) {
attributes <-
brokerage_name_contact[[x]] %>% html_attrs()
df <-
tibble(item = attributes %>% names(),
id = attributes %>% as.character()) %>%
mutate(idRow = x)
df <-
df %>%
filter(item == "data-label")
df
}) %>%
mutate(value = values) %>%
left_join(dictionary_css_page()) %>%
select(nameActual, value) %>%
filter(!is.na(nameActual)) %>%
spread(nameActual, value) %>%
suppressMessages()
data <-
data %>%
mutate(id = 1) %>%
left_join(df_broker %>% mutate(id = 1)) %>%
select(-id) %>%
suppressMessages()
}
if (feature_nodes %>% length() > 0 &&
include_features) {
features <- feature_nodes %>% html_text()
df_features <- tibble(descriptionFeature = features)
data <-
data %>%
mutate(dataFeatures = list(df_features))
}
url_vr <-
page %>% html_nodes('#hero-view-tour') %>% html_attr('href')
if (url_vr %>% length() > 0) {
data <-
data %>%
mutate(urlVRTour = url_vr)
}
property_meta <-
page %>%
html_nodes('#ldp-property-meta li')
if (property_meta %>% length() > 0) {
df_property_df <-
seq_along(property_meta) %>%
map_dfr(function(x) {
attributes <-
property_meta[[x]] %>% html_attrs()
value <-
property_meta[x] %>% html_text() %>% as.character() %>% parse_number() %>%
suppressWarnings() %>% suppressMessages() %>%
as.character()
df <-
tibble(item = attributes %>% names(),
id = attributes %>% as.character(),
value) %>%
mutate(idRow = x)
df
}) %>%
left_join(dictionary_css_page()) %>%
select(nameActual, value) %>%
distinct() %>%
spread(nameActual, value) %>%
.munge_realtor() %>%
suppressMessages()
data <-
data %>%
mutate(id = 1) %>%
left_join(df_property_df %>% mutate(id = 1)) %>%
select(-id) %>%
suppressMessages()
}
url3d <-
page %>% html_nodes('.ldp-matterport-label .margin-right-sm') %>% html_attr('href')
if (url3d %>% length() > 0) {
data <-
data %>%
mutate(url3DTour = url3d)
}
listing_history <-
page %>% html_nodes("#ldp-history-price td") %>% html_text()
if (listing_history %>% length() > 0) {
times <- length(listing_history) %/% 5
items <-
rep(
c(
"dateEvent",
"descriptionEvent",
"priceEvent",
"amountPSFEvent",
"sourceEvent"
),
times
)
df_listings <-
tibble(item = items, value = listing_history) %>%
group_by(item) %>%
mutate(idEvent = 1:n()) %>%
ungroup() %>%
spread(item, value) %>%
mutate(
dateEvent = dateEvent %>% lubridate::mdy(),
dateEvent = case_when(descriptionEvent == "Estimated" ~ Sys.Date(),
TRUE ~ dateEvent),
priceEvent = priceEvent %>% as.character() %>% parse_number(),
sourceEvent = case_when(sourceEvent == "" ~ NA_character_, TRUE ~ sourceEvent),
amountPSFEvent = amountPSFEvent %>% parse_number()
) %>%
arrange((dateEvent)) %>%
mutate(numberListing = 1:n()) %>%
select(-idEvent) %>%
select(numberListing, one_of(c(
"dateEvent",
"descriptionEvent",
"priceEvent"
)),
everything()) %>%
arrange(desc(dateEvent)) %>%
suppressWarnings()
data <-
data %>%
mutate(dataListingHistory = list(df_listings)) %>%
mutate(countListings = dataListingHistory %>% map_dbl(nrow))
}
comps <-
page %>% html_nodes('.col-xxs-3') %>% html_text() %>% str_trim()
if (length(comps) > 0) {
if (length(comps) > 20) {
homes <-
page %>% html_nodes("#ldp-home-values .ellipsis") %>% html_text() %>% str_trim() %>% str_split("\\:|\\,")
addresses <-
seq_along(homes) %>%
map_chr(function(x) {
homes[[x]] %>% str_trim() %>% str_split("\\ ") %>% flatten_chr() %>% discard(~
.x == "") %>%
str_c(collapse = " ") %>% str_replace_all("\\This Home ", "")
})
price_estimate <-
page %>% html_nodes('.col-xxs-3') %>% html_text()
price_estimate <-
price_estimate %>% as.character() %>%
str_remove_all("Est. ") %>%
parse_number() %>%
suppressWarnings() %>%
discard(function(x) {
is.na(x)
})
area_sf <-
page %>% html_nodes('.col-sm-1') %>% html_text() %>% as.character() %>% parse_number() %>%
suppressWarnings() %>% suppressMessages()
df_comps <-
tibble(addresseComp = addresses,
priceEstimate = price_estimate)
if (area_sf %>% length() - 1 == nrow(df_comps)) {
area_sf <-
area_sf[2:length(area_sf)] %>% as.character() %>% parse_number() %>%
suppressWarnings() %>% suppressMessages()
df_comps <-
df_comps %>%
mutate(areaPropertySF = area_sf)
}
if ((area_sf %>% length() - 3) %/% 3 == nrow(df_comps)) {
area_sf <-
area_sf[4:length(area_sf)]
items <-
rep(c("countBeds", "countBathrooms", "areaPropertySF"),
times = nrow(df_comps))
items <-
rep(c("countBeds", "countBathrooms", "areaPropertySF"),
times = nrow(df_comps))
df_bed_bath <-
tibble(
idItem = rep(1:3, times = nrow(df_comps)),
item = items[seq_along(area_sf)],
value = area_sf
) %>%
group_by(item) %>%
mutate(numberProperty = 1:n()) %>%
select(numberProperty, item, value) %>%
ungroup() %>%
mutate(value = value %>% as.character() %>% parse_number()) %>%
suppressWarnings() %>% suppressMessages() %>%
spread(item, value)
df_comps <-
df_comps %>%
mutate(numberProperty = 1:n()) %>%
left_join(df_bed_bath) %>%
suppressMessages() %>%
select(-numberProperty)
}
links <-
page %>% html_nodes("#ldp-home-values a") %>% html_attr('href')
links <-
links[!links %>% is.na()] %>% str_c("https://www.realtor.com/realestateandhomes-detail", .)
listing_comps <- c(url, links)
if (length(listing_comps) == nrow(df_comps)) {
df_comps <-
df_comps %>%
mutate(urlListingComp = listing_comps)
}
lot_nodes <-
page %>% html_nodes(".hidden-xxs.col-sm-2") %>% html_text()
if (length(lot_nodes) - 1 == nrow(df_comps)) {
lots <-
lot_nodes[2:length(lot_nodes)] %>% as.character() %>% parse_number() %>%
suppressWarnings() %>% suppressMessages()
df_comps <-
df_comps %>%
mutate(areaLotSF = lots)
}
data <-
data %>%
mutate(dataComps = list(df_comps),
countComps = dataComps %>% map_dbl(nrow))
}
}
taxes <-
page %>%
html_nodes("#ldp-history-taxes td") %>% html_text() %>% as.character() %>%
suppressWarnings() %>% suppressMessages()
if (length(taxes) > 0) {
tax_data <-
page %>%
html_nodes("#ldp-history-taxes td") %>% html_text() %>% as.character()
times <-
length(tax_data) %/% 6
items <-
rep(c("yearTaxes", "amountTaxes", "amountValueLand", "symbol1", "amountValueAdditions", "symbol2", "amountTaxableValue"), times)
if (length(items) > length(tax_data)) {
items <- items[1:length(tax_data)]
}
df_taxes <-
tibble(item = items, value = tax_data) %>%
group_by(item) %>%
mutate(idEvent = 1:n()) %>%
ungroup() %>%
spread(item, value) %>%
select(-matches("symbol")) %>%
select(yearTaxes, everything()) %>%
mutate_all(function(x) {
x %>% as.character() %>% parse_number()
}) %>%
suppressMessages()
data <-
data %>%
mutate(dataTaxes = list(df_taxes))
}
schools <-
page %>% html_nodes('#load-more-schools td') %>% html_text() %>% str_trim()
if (schools %>% length() > 0) {
value <-
page %>% html_nodes('#load-more-schools td') %>% html_text() %>% str_trim()
repeats <- length(value) %/% 4
item <-
rep(c("ratingSchool", "nameSchool", "gradesSchool","distanceSchoolMiles"), repeats)
df_school <-
tibble(item, value) %>%
group_by(item) %>%
mutate(numberSchool = 1:n()) %>%
ungroup() %>%
spread(item, value) %>%
mutate_at(c("distanceSchoolMiles", "ratingSchool"),
list(function(x){
x %>% parse_number()
})) %>%
suppressWarnings()
urlSchool <-
page %>%
html_nodes('#load-more-schools .text-align-center+ td a') %>%
html_attr('href') %>%
str_c("https://www.realtor.com/local/schools/montauk-school_0751452801",
.)
df_school <-
df_school %>%
mutate(urlSchool)
data <-
data %>%
mutate(dataSchool = list(df_school))
}
data <-
data %>%
mutate(urlListing = url)
if (!sleep_time %>% is_null()) {
Sys.sleep(time = sleep_time)
}
data
}
#' Parse html listings
#'
#' Parse vector of listing html listing
#' urls.
#'
#' This function will likely result in DDOS so use carefully.
#' When possible find the JSON api data for the listings.
#'
#' @param urls vector of html listings
#' @param include_features if \code{TRUE} includes features
#' @param sleep_time if not \code{NULL} sleep time between scrapes
#' @param return_message if \code{TRUE} returns message
#'
#' @return
#' @export
#'
#' @examples
#' html_listing_urls(urls = "https://www.realtor.com/realestateandhomes-detail/5301-Westbard-Cir-Apt-323_Bethesda_MD_20816_M63437-59115")
html_listing_urls <-
function(urls = NULL,
include_features = F,
sleep_time = 1,
return_message = TRUE) {
.parse_listing_url_safe <-
possibly(.parse_listing_url, tibble())
all_data <-
urls %>%
map_dfr(function(url) {
if (return_message) {
glue("Parsing {url %>% str_replace_all('https://www.realtor.com/', '')}") %>%
message()
}
.parse_listing_url_safe(url = url,
include_features = include_features,
sleep_time = sleep_time) %>%
suppressWarnings()
})
all_data <-
all_data %>%
.munge_realtor() %>%
mutate(dateData = Sys.Date()) %>%
select(dateData, everything())
if (all_data %>% has_name("dataComps")) {
all_data <-
all_data %>%
mutate(hasComps = dataComps %>% map_dbl(length) > 0)
}
if (all_data %>% has_name("dataTaxes")) {
all_data <-
all_data %>%
mutate(hasTaxes = dataTaxes %>% map_dbl(length) > 0)
}
if (all_data %>% has_name("dataPhotos")) {
all_data <-
all_data %>%
mutate(hasPhotos = dataPhotos %>% map_dbl(length) > 0)
}
if (all_data %>% has_name("dataSchool")) {
all_data <-
all_data %>%
mutate(hasSchools = dataSchool %>% map_dbl(length) > 0)
}
if (all_data %>% has_name("dataNeighborhood")) {
all_data <-
all_data %>%
mutate(hasNeighborhood = dataNeighborhood %>% map_dbl(length) > 0)
}
if (all_data %>% has_name("dataListingHistory")) {
all_data <-
all_data %>%
mutate(hasListingHistory = dataListingHistory %>% map_dbl(length) > 0)
}
if (all_data %>% has_name("nameAgent")) {
all_data <-
all_data %>%
mutate(nameAgent = nameAgent %>% str_replace_all("\\, Agent|\\, Broker", "") %>% str_trim() %>%
str_to_upper())
}
if (all_data %>% has_name("statusListing")) {
all_data <-
all_data %>%
mutate(statusListingDetail = statusListing)
}
all_data
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.