The package at present scrapes and extracts information about real estate adds from a French website.
After installation, the package can be attached in the regular way
library(estate)
The website allows displaying at maximum 40 ads per page. The URLs of the first 21 pages are irregular:
page = 1
: the page number does not appear at the end of the URL.page < 22
: variable component after "annonce/".page > 1
: page number in URL.createUrl(page = 1, provider = "papfr", type = "vente") createUrl(page = 2, provider = "papfr", type = "vente") createUrl(page = 22, provider = "papfr", type = "vente")
The various low-level extraction functions make use of the XML
package to access nodes in the HTML page structure (elements like <div>
, <p>
).
url <- system.file("extdata", "papfr-40.html", package = "estate") doc <- XML::htmlParse(url, encoding = "utf-8") nobs <- 2 estate:::extractLinkVignette(doc)[1:nobs] estate:::extractLinkPhoto(doc)[1:nobs] (description <- estate:::extractDescription(doc)[1:nobs]) estate:::extractCP(description) estate:::extractSummaryTable(doc)[1:nobs,] estate:::extractPrice(doc)[1:nobs] estate:::extractDate(doc)[1:nobs]
estate_table <- extractList(url) knitr::kable(estate_table[1:nobs, ])
In order to achieve good download performance, the curl
package is used. This package is designed to make parallel calls to the same host. This way, we can retrieve the maximum number of 50 HTML pages efficiently. The download location "../inst/extdata/vente" assumes you have extracted the package source and attempt to build the vignette using devtools::build_vignettes()
. Create the subfolder "vente" in case it does not exist. In order to make the files visible to the system.file
function, a re-install is required devtools::install()
.
type <- "vente" ## type <- "location" htmldir <- system.file("extdata", type, package = "estate") npages <- 50
## set eval=TRUE to activate download ## htmldir_temp <- file.path("../inst/extdata", type) ## setwd("vignettes") htmldir_temp <- file.path("../inst/extdata", type) # run from /docs/articles folder unlink(file.path(htmldir_temp, list.files(htmldir_temp))) downloadHtml(type = type, pages = c(1:npages), htmldir = htmldir_temp) devtools::install() # reads data from install dir
combineEstate <- function(type, htmldir, pages=1) { htmlfiles <- list.files(htmldir)[1:pages] estate_list <- lapply(file.path(htmldir, htmlfiles), extractList) estate_df <- do.call("rbind", estate_list) return(estate_df) } estatedf <- combineEstate(type = type, htmldir = htmldir, pages = npages)
datenow <- format(Sys.Date(), format = "%Y-%m-%d") ## datenow <- "2017-01-30" exportfile <- file.path("/tmp", paste0(datenow, "-estate-", type, ".tsv")) write.table(x = estatedf, file = exportfile, row.names = FALSE, sep = "\t")
if (type=="vente") { price_min <- 2*10^5; price_max <- 5*10^5 } else if (type=="location") { price_min <- 9*10^2; price_max <- 16*10^2 } estatedf <- subset(estatedf, price < price_max & price > price_min) ## estatedf <- subset(estatedf, price < price_max & price > price_min & date > "2017-09-09") nrow(estatedf) estatedf$price_per_sqm <- estatedf$price / estatedf$size intcols <- names(estatedf)[lapply(estatedf, class)%in%c("integer", "numeric")] summary(estatedf[,colnames(estatedf) %in% intcols])
library(ggplot2) library(ggiraph) # create an 'onclick' column estatedf$onclick <- sprintf("window.open(\"%s\")", estatedf$link) estatedf$tooltip <- sprintf("<img src=\"%s\"/>", estatedf$photo) gg_base <- ggplot(estatedf, aes( x = price, y = size, color = factor(location)) ) + ## scale_colour_hue(h = c(0, 90)) + theme_minimal() gg_interactive <- gg_base + ## geom_smooth(method = "lm") + geom_point_interactive(aes(tooltip = tooltip, onclick = onclick), size = 2) ggiraph(code = print(gg_interactive), width = 1, width_svg = 7) #, zoom_max = 5)
dataset = iris dataset$tooltip = dataset$Species dataset$clickjs = paste0("alert(\"",dataset$Species, "\")" ) # plots gg_point = ggplot(dataset, aes(x = Sepal.Length, y = Petal.Width, color = Species, tooltip = tooltip, onclick = clickjs) ) + geom_point_interactive() ggiraph(code = {print(gg_point)})
gg_jitter <- gg_base + geom_point() + geom_jitter(height = 0.5, width = 500) gg_jitter
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.