The package at present scrapes and extracts information about real estate adds from a French website.

After installation, the package can be attached in the regular way

library(estate)

Create URLs

The website allows displaying at maximum 40 ads per page. The URLs of the first 21 pages are irregular:

createUrl(page = 1, provider = "papfr", type = "vente")
createUrl(page = 2, provider = "papfr", type = "vente")
createUrl(page = 22, provider = "papfr", type = "vente")

Extract from HTML files

The various low-level extraction functions make use of the XML package to access nodes in the HTML page structure (elements like <div>, <p>).

url <- system.file("extdata", "papfr-40.html", package = "estate")
doc <- XML::htmlParse(url, encoding = "utf-8")
nobs <- 2
estate:::extractLinkVignette(doc)[1:nobs]
estate:::extractLinkPhoto(doc)[1:nobs]
(description <- estate:::extractDescription(doc)[1:nobs])
estate:::extractCP(description)
estate:::extractSummaryTable(doc)[1:nobs,]
estate:::extractPrice(doc)[1:nobs]
estate:::extractDate(doc)[1:nobs]
estate_table <- extractList(url)
knitr::kable(estate_table[1:nobs, ])

Pooled download HTML files

In order to achieve good download performance, the curl package is used. This package is designed to make parallel calls to the same host. This way, we can retrieve the maximum number of 50 HTML pages efficiently. The download location "../inst/extdata/vente" assumes you have extracted the package source and attempt to build the vignette using devtools::build_vignettes(). Create the subfolder "vente" in case it does not exist. In order to make the files visible to the system.file function, a re-install is required devtools::install().

type <- "vente"
## type <- "location"
htmldir <- system.file("extdata", type, package = "estate")
npages <- 50
## set eval=TRUE to activate download
## htmldir_temp <- file.path("../inst/extdata", type)
## setwd("vignettes")
htmldir_temp <- file.path("../inst/extdata", type) # run from /docs/articles folder
unlink(file.path(htmldir_temp, list.files(htmldir_temp)))
downloadHtml(type = type, pages = c(1:npages), htmldir = htmldir_temp)
devtools::install() # reads data from install dir

Vectorized extraction from HTML

combineEstate <- function(type, htmldir, pages=1) {
  htmlfiles <- list.files(htmldir)[1:pages]
  estate_list <-
    lapply(file.path(htmldir, htmlfiles), extractList)
  estate_df <- do.call("rbind", estate_list)
  return(estate_df)
}

estatedf <- combineEstate(type = type,
                          htmldir = htmldir,
                          pages = npages)
datenow <- format(Sys.Date(), format = "%Y-%m-%d")
## datenow <- "2017-01-30"
exportfile <- file.path("/tmp", paste0(datenow, "-estate-", type, ".tsv"))
write.table(x = estatedf, file = exportfile, row.names = FALSE, sep = "\t")
if (type=="vente") {
  price_min <-  2*10^5; price_max <- 5*10^5
} else if (type=="location") {
  price_min <-  9*10^2; price_max <- 16*10^2
}
estatedf <- subset(estatedf, price < price_max & price > price_min)
## estatedf <- subset(estatedf, price < price_max & price > price_min & date > "2017-09-09")
nrow(estatedf)

estatedf$price_per_sqm <- estatedf$price / estatedf$size

intcols <- names(estatedf)[lapply(estatedf, class)%in%c("integer", "numeric")]
summary(estatedf[,colnames(estatedf) %in% intcols])
library(ggplot2)
library(ggiraph)

# create an 'onclick' column
estatedf$onclick <- sprintf("window.open(\"%s\")", estatedf$link)
estatedf$tooltip <- sprintf("<img src=\"%s\"/>", estatedf$photo)

gg_base <-
  ggplot(estatedf, aes( x = price, y = size, color = factor(location)) ) +
  ## scale_colour_hue(h = c(0, 90)) +
  theme_minimal()

gg_interactive <-
  gg_base +
  ## geom_smooth(method = "lm") +
  geom_point_interactive(aes(tooltip = tooltip, onclick = onclick), size = 2)

ggiraph(code = print(gg_interactive), width = 1, width_svg = 7) #, zoom_max = 5)
dataset = iris
dataset$tooltip = dataset$Species
dataset$clickjs = paste0("alert(\"",dataset$Species, "\")" )

                                        # plots
gg_point = ggplot(dataset,
                  aes(x = Sepal.Length, y = Petal.Width,
                      color = Species, tooltip = tooltip,
                      onclick = clickjs) ) +
  geom_point_interactive()

ggiraph(code = {print(gg_point)})
gg_jitter <-
  gg_base +
  geom_point() + geom_jitter(height = 0.5, width = 500)
gg_jitter


bowerth/estate documentation built on May 13, 2019, 12:37 a.m.