library(RCurl) library(rvest) library(magrittr) library(curl) library(estate) library(doMC) registerDoMC(detectCores(logical = TRUE) - 1) library(ggplot2) library(dplyr) htmldir <- "inst/extdata/enalquiler"
## get number of offers num_per_page <- 15 baseurl <- "http://www.enalquiler.com/search?provincia=43&poblacion=46424&tipo=2" tt <- getURL(baseurl) doc <- read_html(tt) property_num <- doc %>% html_node(".property-list-title-count") %>% html_text() %>% sub("[ a-z]+", "", .) %>% gsub("[()]", "", .) %>% as.integer() npages <- ceiling(property_num / num_per_page) estate::downloadHtml(pages = seq(1, npages), htmldir = htmldir, provider = "enalquiler")
extractEnalquiler <- function(item) { title <- item %>% html_nodes(".property-title") %>% html_text() %>% gsub("\n", "", .) %>% # remove line breaks sub("^[ ]+", "", .) %>% # remove leading whitespace sub("[ ]+$", "", .) # remove trailing whitespace link <- item %>% html_nodes(".property-title") %>% html_attr("href") price <- item %>% html_nodes(".property-price") %>% html_text() %>% gsub("\n", "", .) %>% # remove line breaks sub("[ ]+", "", .) %>% # remove whitespace sub("\u0080", "", .) %>% # remove euro symbol sub("[.]", "", .) # remove thousands seperator size <- item %>% html_nodes(".detail-m2") %>% html_text() %>% gsub("\n", "", .) %>% # remove line breaks sub("^[ ]+", "", .) %>% # remove leading whitespace sub("m2", "", .) bath <- item %>% html_nodes(".detail-bathrooms") %>% html_text() %>% gsub("\n", "", .) %>% # remove line breaks sub("^[ ]+", "", .) %>% # remove leading whitespace sub("[a-zA-Zñ]+", "", .) %>% # remove "Baño(s)" sub("[ ]+$", "", .) # remove trailing whitespace location <- item %>% html_nodes(".property-location") %>% html_text() %>% gsub("\n", "", .) %>% # remove line breaks sub("^[ ]+", "", .) %>% # remove leading whitespace sub("[ ]+$", "", .) # remove trailing whitespace list_out <- list(price = as.integer(price), size = as.integer(size), bath = as.integer(bath), title = title, location = location, link = link) return(list_out) }
processEnalquiler <- function(url) { ## doc <- XML::htmlParse(url, encoding = "utf-8") doc <- xml2::read_html(url) proplist <- doc %>% html_nodes(".property-list") %>% html_nodes(".property-item") res_ls <- lapply(proplist, extractEnalquiler) res_df <- do.call(rbind.data.frame, res_ls) return(res_df) } ## processEnalquiler("inst/extdata/enalquiler/file21d6ab837ba.html")
combineEstate <- function(htmldir, pages=1) { htmlfiles <- list.files(htmldir)[1:pages] ## foreach(cou = namecou) %dopar% splitSdmxRead(loc = cou, codelist = codelist, destdir = file.path(origdir, src_id)) ## estate_list <- ## lapply(file.path(htmldir, htmlfiles), processEnalquiler) estate_list <- foreach(file = file.path(htmldir, htmlfiles)) %dopar% processEnalquiler(url = file) estate_df <- do.call("rbind", estate_list) return(estate_df) } estatedf <- combineEstate(htmldir = htmldir, pages = npages) knitr::kable(head(estatedf)) ## head(estatedf) write.table(estatedf, file = "../inst/extdata/enalquiler-2017-10-04.tsv", row.names = FALSE, sep = "\t") ## saveRDS(estatedf, file = "inst/extdata/enalquiler.rds") ## save(estatedf, file = "inst/extdata/enalquiler.rda")
estatedf <- read.table(file = "/home/xps13/src/R/estate/inst/extdata/enalquiler-2017-10-04.tsv", header = TRUE, sep = "\t") data_plot <- estatedf %>% filter(price < 1000 & bath < 3) ggplot(data = data_plot, aes(x = price, y = size, color = factor(bath))) + geom_point() + geom_smooth(method = "lm") + ggtitle("size: m^2, price: euro") + theme( legend.position = "top" )
str(estatedf) mod1 <- lm(price ~ size * bath, data = data_plot) summary(mod1)
knitr::kable(head(estatedf))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.