idealisto: Web scraper for idealista.com

#' Scrape the idealista website.
#'
#' This function scrapes idealista (a Spanish real estate website) and
#' downloads a maximum of 1,800 rent or sale ads in the given province, city,
#' district or neighborhood. So if you want to scrape an area that has more
#' than 1,800 ads use the idealisto function.
#'
#' The function first walks the result pages of the area (it assumes 30 ads
#' per result page) to collect the link of every ad, then visits each ad and
#' appends one row per ad to the csv file. Requests are throttled with random
#' pauses. Missing packages among stringr, rvest and httr are installed on
#' the fly.
#'
#' @param url An idealista website url that links to the area you want to
#'   scrape, e.g. 'https://www.idealista.com/alquiler-viviendas/madrid/arganzuela/'.
#'   A trailing slash is appended when it is missing.
#' @param ads character. Specify if the url links to rent ads or for sale ads.
#'   The argument accepts the following strings: "rent" or "sale".
#' @param ruta A valid path in your computer where you want to create the csv
#'   file.
#' @param silent logical. If `FALSE` (the default) every scraped ad and the
#'   download progress are printed; if `TRUE` only the main status messages
#'   are printed.
#' @return Invisibly, the total elapsed time as a `difftime`. The function is
#'   called for its side effects: the csv file written to `ruta`, and the
#'   vector of ad links, which is also assigned to `links_anuncios_tot` with
#'   `<<-` (normally ending up in the global environment).
#' @export
get_fast <- function(url, ads, ruta = "~/idealisto_fast.csv", silent = FALSE) {
  # Validate the arguments first, so that a bad call fails before any package
  # is installed or any request is sent.
  if (!is.character(url) || length(url) != 1L || is.na(url) || !nzchar(url)) {
    stop("`url` must be a single non-empty string.", call. = FALSE)
  }
  if (!is.character(ads) || length(ads) != 1L || !ads %in% c("rent", "sale")) {
    stop("`ads` must be either \"rent\" or \"sale\".", call. = FALSE)
  }
  if (!is.character(ruta) || length(ruta) != 1L || is.na(ruta) ||
      !nzchar(ruta)) {
    stop("`ruta` must be a single non-empty file path.", call. = FALSE)
  }
  if (!is.logical(silent) || length(silent) != 1L || is.na(silent)) {
    stop("`silent` must be TRUE or FALSE.", call. = FALSE)
  }

  start <- Sys.time()

  # Install whatever is missing and attach the scraping packages (behaviour
  # kept from the original implementation).
  list.of.packages <- c("stringr", "rvest", "httr")
  new.packages <- list.of.packages[
    !(list.of.packages %in% installed.packages()[, "Package"])
  ]
  if (length(new.packages) > 0) {
    install.packages(new.packages)
  }

  library(stringr)
  library(rvest)
  library(httr)

  # Pool of user agents; one is picked at random for every request.
  desktop_agents <-  c('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
                       'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
                       'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
                       'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14',
                       'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
                       'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
                       'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
                       'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
                       'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
                       'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0')

  # Request a page using a randomly chosen user agent.
  fetch <- function(target) {
    GET(target, add_headers("user-agent" = sample(desktop_agents, 1)))
  }

  # Text of every node that matches a CSS selector.
  node_text <- function(page, css) {
    html_text(html_nodes(page, css))
  }

  # Turn the raw price text into an integer. Rent prices carry a "/mes"
  # suffix, sale prices do not. Text without a known currency marker is
  # returned untouched so it still reaches the csv.
  clean_price <- function(raw) {
    if (length(raw) == 0 || is.na(raw[1])) {
      return(NA)
    }
    raw <- raw[1]
    suffix <- if (ads == "rent") "/mes" else ""
    if (str_detect(raw, "eur")) {
      pattern <- paste0(" eur", suffix, "|\\.")
    } else if (str_detect(raw, "€")) {
      pattern <- paste0(" €", suffix, "|\\.")
    } else {
      return(raw)
    }
    as.integer(str_replace_all(string = raw, pattern = pattern,
                               replacement = ""))
  }

  # Build the one-row data frame that describes a single ad page.
  parse_ad <- function(page, ad_url) {
    titulo <- node_text(page, ".main-info__title-main")
    precio <- node_text(page, ".h1-simulated")
    ubi <- node_text(page, "#headerMap li")
    anunciante <- node_text(page, ".name")
    agencia <- node_text(page, ".about-advertiser-name")
    info <- node_text(page, ".info-features")
    descrip <- node_text(page, ".expandable")
    alta <- node_text(page, "#stats > p")

    # The features arrive as one string separated by runs of three spaces.
    info <- unlist(str_split(str_trim(info, "both"), "   "))
    if (is.null(info)) {
      info <- character(0)
    }

    metros <- info[str_detect(info, "m²")]
    metros <- as.numeric(
      str_replace_all(string = metros, pattern = " m²| |\\.", replacement = "")
    )

    habit <- info[str_detect(info, "hab\\.")]
    habit <- as.integer(str_remove(habit, "hab\\."))

    # Double quotes are stripped from the description before it is written.
    descrip <- str_replace_all(descrip, pattern = "\"", "")

    precio <- clean_price(precio)
    precio_m2 <- if (is.numeric(precio)) precio / metros[1] else NA

    if (length(titulo) == 0) {
      titulo <- "Sin título"
    }
    if (length(descrip) == 0) {
      descrip <- "Sin descripción"
    }
    if (length(agencia) == 0) {
      agencia <- NA
    }
    if (length(anunciante) == 0 || isTRUE(anunciante == " ")) {
      anunciante <- "Particular"
    }

    # Location: the first item is used as the street; district and
    # neighborhood are recognised by their "Distrito" / "Barrio " prefixes.
    calle <- ubi[1]
    barrio <- as.character(ubi[str_detect(ubi, pattern = "Barrio ")])
    distrito <- as.character(ubi[str_detect(ubi, pattern = "Distrito")])
    distrito <- str_replace_all(string = distrito, pattern = "Distrito ",
                                replacement = "")
    if (length(distrito) == 0) {
      distrito <- NA
    }
    if (length(barrio) == 0) {
      barrio <- NA
    }

    data.frame(
      Titulo = titulo[1], Distrito = distrito[1], Barrio = barrio[1],
      calle = calle[1], Precio = precio, Precio_m2 = precio_m2,
      Superficie = metros[1], Habitaciones = habit[1],
      Descripcion = descrip[1], Anunciante = anunciante[1],
      Agencia = agencia[1], Url = ad_url, Alta = alta[1],
      fecha = Sys.Date()
    )
  }

  # Base url; the page suffix is appended to it, so it must end with "/".
  url_base <- url
  if (!endsWith(url_base, "/")) {
    url_base <- paste0(url_base, "/")
  }

  #### WORK OUT HOW MANY RESULT PAGES THE REQUESTED AREA HAS
  anuncios <- node_text(read_html(fetch(url_base)), ".h1-simulated")
  total_anuncios <- NA_real_
  if (length(anuncios) > 0) {
    # Take the first number in the heading; dots are thousands separators.
    total_anuncios <- as.numeric(
      str_remove_all(str_extract(anuncios[1], "[0-9][0-9.]*"), "\\.")
    )
  }
  if (is.na(total_anuncios)) {
    stop("Could not read the number of ads from ", url_base, call. = FALSE)
  }
  # Round up so the last, partially filled page is not lost.
  n_pages <- max(1, ceiling(total_anuncios / 30))

  #### BUILD THE INDEX OF RESULT PAGES, ONE URL PER PAGE
  page_urls <- paste0(url_base, "pagina-", seq_len(n_pages), ".htm")
  if (!silent) {
    print(page_urls)
  }

  #### COLLECT THE LINKS OF EVERY AD IN THE AREA
  links_by_page <- vector("list", length(page_urls))
  for (p in seq_along(page_urls)) {
    page <- read_html(fetch(page_urls[p]))
    links <- html_attr(html_nodes(page, ".item-link"), name = "href",
                       default = NA)
    links <- links[!is.na(links)]
    # paste0() on an empty vector would yield the bare host as a fake link.
    if (length(links) > 0) {
      links_by_page[[p]] <- paste0("https://www.idealista.com", links)
    }

    if (!silent) {
      print(links_by_page[[p]])
    }
    print("Capturando los links a todos los anuncios...")
  }

  links_anuncios_tot <- unique(unlist(links_by_page))
  if (is.null(links_anuncios_tot)) {
    links_anuncios_tot <- character(0)
  }

  # NOTE(review): exporting the links with `<<-` is a side effect kept only
  # for backward compatibility with code that reads this variable.
  links_anuncios_tot <<- links_anuncios_tot

  # Header row; it lists the same 14 columns that parse_ad() produces.
  csv_header <- c("Titulo", "Distrito", "Barrio", "calle", "Precio",
                  "Precio_m2", "Superficie", "Habitaciones", "Descripcion",
                  "Anunciante", "Agencia", "Url", "Alta", "fecha")
  write.table(matrix(csv_header, nrow = 1), file = ruta, sep = ",",
              quote = FALSE, col.names = FALSE, row.names = FALSE, na = "")

  n <- length(links_anuncios_tot)

  print(paste("Idealisto ha extraido las urls de", n, "anuncios."))
  if (n == 0) {
    warning("No ad links were found at ", url_base, call. = FALSE)
    return(invisible(difftime(Sys.time(), start, units = "auto")))
  }
  print(paste("Ahora Idealisto comenzará a extraer la información de esos", n, "anuncios. Pero antes vamos a descansar durante 30 segundos"))

  Sys.sleep(30)

  # Reference time for the long pause taken every 420 seconds.
  start_2 <- Sys.time()

  ######  Start scraping each individual ad

  if (silent) {
    print("Comienza la extracción de los anuncios...")
  }

  for (p in seq_along(links_anuncios_tot)) {
    ad_url <- links_anuncios_tot[p]
    page <- read_html(fetch(ad_url))

    # An ad that cannot be parsed is reported and skipped, so a row from a
    # previous ad is never written again in its place.
    line <- tryCatch(
      parse_ad(page, ad_url),
      error = function(e) {
        warning("Skipping ", ad_url, ": ", conditionMessage(e), call. = FALSE)
        NULL
      }
    )

    if (!is.null(line)) {
      if (!silent) {
        print(line)
      }
      tryCatch(
        write.table(line, file = ruta, sep = ",", append = TRUE, quote = TRUE,
                    col.names = FALSE, row.names = FALSE, na = ""),
        error = function(e) {
          warning("Could not write ", ad_url, " to ", ruta, ": ",
                  conditionMessage(e), call. = FALSE)
        }
      )
    }

    if (!silent) {
      process <- 100 * p / n
      print(paste0("Idealisto lleva descargados el ", round(process, digits = 1),"% de los anuncios."))
    }

    # After 420 seconds of continuous scraping, rest for 100-120 seconds.
    if (Sys.time() > start_2 + 420) {
      stop_t <- sample(x = 100:120, size = 1)
      print(paste("Para que no se nos cabree Idealista vamos a parar la máquina durante", stop_t, "segundos."))
      Sys.sleep(time = stop_t)
      start_2 <- Sys.time()
    }

    # Short random pause between two consecutive ads.
    Sys.sleep(sample(x = 1:3, size = 1))
  }

  end <- Sys.time()
  elapsed <- difftime(end, start, units = "auto")
  print(paste("Idealisto ha descargado los", length(links_anuncios_tot), "anuncios."))
  print(elapsed)
}