library(RCurl)
library(rvest)
library(magrittr)
library(curl)
library(estate)
library(doMC)
registerDoMC(detectCores(logical = TRUE) - 1)
library(ggplot2)
library(dplyr)

htmldir <- "inst/extdata/enalquiler"
## get number of offers
num_per_page <- 15
baseurl <- "http://www.enalquiler.com/search?provincia=43&poblacion=46424&tipo=2"
tt <- getURL(baseurl)
doc <- read_html(tt)

property_num <-
  doc %>% html_node(".property-list-title-count") %>% html_text() %>%
  sub("[ a-z]+", "", .) %>%
  gsub("[()]", "", .) %>%
  as.integer()

npages <- ceiling(property_num / num_per_page)

estate::downloadHtml(pages = seq(1, npages),
             htmldir = htmldir,
             provider = "enalquiler")
extractEnalquiler <- function(item) {

  title <-
    item %>% html_nodes(".property-title") %>% html_text() %>%
    gsub("\n", "", .) %>%               # remove line breaks
    sub("^[ ]+", "", .) %>%             # remove leading whitespace
    sub("[ ]+$", "", .)                 # remove trailing  whitespace

  link <-
    item %>% html_nodes(".property-title") %>% html_attr("href")

  price <-
    item %>% html_nodes(".property-price") %>% html_text() %>%
    gsub("\n", "", .) %>%               # remove line breaks
    sub("[ ]+", "", .) %>%              # remove whitespace
    sub("\u0080", "", .) %>%            # remove euro symbol
    sub("[.]", "", .)                   # remove thousands seperator

  size <-
    item %>% html_nodes(".detail-m2") %>% html_text() %>%
    gsub("\n", "", .) %>%               # remove line breaks
    sub("^[ ]+", "", .) %>%             # remove leading whitespace
    sub("m2", "", .)

  bath <-
    item %>% html_nodes(".detail-bathrooms") %>% html_text() %>%
    gsub("\n", "", .) %>%               # remove line breaks
    sub("^[ ]+", "", .) %>%             # remove leading whitespace
    sub("[a-zA-Zñ]+", "", .) %>%        # remove "Baño(s)"
    sub("[ ]+$", "", .)                 # remove trailing  whitespace

  location <-
    item %>% html_nodes(".property-location") %>% html_text() %>%
    gsub("\n", "", .) %>%               # remove line breaks
    sub("^[ ]+", "", .) %>%             # remove leading whitespace
    sub("[ ]+$", "", .)                 # remove trailing  whitespace

  list_out <-
    list(price = as.integer(price),
         size = as.integer(size),
         bath = as.integer(bath),
         title = title,
         location = location,
         link = link)
  return(list_out)
}
processEnalquiler <- function(url) {
  ## doc <- XML::htmlParse(url, encoding = "utf-8")
  doc <- xml2::read_html(url)
  proplist <- doc %>% html_nodes(".property-list") %>% html_nodes(".property-item")
  res_ls <- lapply(proplist, extractEnalquiler)
  res_df <- do.call(rbind.data.frame, res_ls)
  return(res_df)
}
## processEnalquiler("inst/extdata/enalquiler/file21d6ab837ba.html")

Vectorized extraction from HTML

combineEstate <- function(htmldir, pages=1) {
  htmlfiles <- list.files(htmldir)[1:pages]

  ## foreach(cou = namecou) %dopar% splitSdmxRead(loc = cou, codelist = codelist, destdir = file.path(origdir, src_id))
  ## estate_list <-
  ##   lapply(file.path(htmldir, htmlfiles), processEnalquiler)
  estate_list <-
    foreach(file = file.path(htmldir, htmlfiles)) %dopar% processEnalquiler(url = file)
  estate_df <- do.call("rbind", estate_list)
  return(estate_df)
}

estatedf <- combineEstate(htmldir = htmldir, pages = npages)
knitr::kable(head(estatedf))
## head(estatedf)
write.table(estatedf, file = "../inst/extdata/enalquiler-2017-10-04.tsv", row.names = FALSE, sep = "\t")
## saveRDS(estatedf, file = "inst/extdata/enalquiler.rds")
## save(estatedf, file = "inst/extdata/enalquiler.rda")
estatedf <- read.table(file = "/home/xps13/src/R/estate/inst/extdata/enalquiler-2017-10-04.tsv", header = TRUE, sep = "\t")

data_plot <-
  estatedf %>%
  filter(price < 1000 &
         bath < 3)

ggplot(data = data_plot,  aes(x = price, y = size, color = factor(bath))) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("size: m^2, price: euro") +

  theme(
    legend.position = "top"
  )
str(estatedf)
mod1 <- lm(price ~ size * bath, data = data_plot)
summary(mod1)
knitr::kable(head(estatedf))


bowerth/estate documentation built on May 13, 2019, 12:37 a.m.