# R/search_for_sale_batch.R

#' @title Get search results for 'satilik' (for sale) listings
#' @description Runs \code{search_for_sale} over a batch of districts/quarters,
#'   following pagination within each quarter and collecting per-page logs.
#' @param address_town id of the town to search. Ignored if \code{districts}
#'   is provided.
#' @param districts data frame of districts/quarters to search. See
#'   \link{get_districts} for details.
#' @param write_dir directory under which HTML files and logs are written, in
#'   a subdirectory with a random name. If NA, nothing is written to disk.
#' @param sleep how long to wait before requesting the next page of results,
#'   in seconds.
#' @return list with the following elements: \code{logs}, a tibble with one
#'   row of page metadata per request, and \code{contents}, a list of the
#'   fetched HTML documents.
#' @export
#' @importFrom utils "write.csv"
#' @examples
#' \dontrun{
#' d  <- get_districts(address_town = 421)
#' s <- search_for_sale_batch(districts = d[1,], write_dir = NA, sleep = 5)
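#' # write fetched pages and logs under a random subdirectory of tempdir()
#' s2 <- search_for_sale_batch(districts = d[1,], write_dir = tempdir())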
#' } 
search_for_sale_batch <- function(
  address_town = 421,
  districts,
  sleep = 20,
  write_dir = getwd()
  ){
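  # Random subdirectory name keeps separate runs from overwriting each other;
  # the start time feeds the ETA estimate printed while crawling.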
  temp_dir  <- stringi::stri_rand_strings(n = 1, length = 10)
  sts <- Sys.time()
  
  if (missing(districts)) {
    districts <- get_districts(address_town = address_town)
  }
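  # Crawl state: `next_url` is the paginated URL within the current quarter,
  # `idx_row` walks the districts table, `idx_page` counts all fetched pages.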
  next_url <- NA
  idx_row <- 1
  idx_page <- 0
  logs <- dplyr::tibble()
  contents <- list()
  
  cat(paste("Getting searches from", paste(unique(districts$town_city_name),unique(districts$town_name),sep = " ", collapse =  ", "), "\n"))
  while (!is.na(next_url) || idx_row <= nrow(districts)) {
    idx_page <- idx_page + 1
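    # First page of a quarter is requested by quarter id; subsequent pages via
    # the pagination URL captured on the previous iteration.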
    if (is.na(next_url)) {
      s <- search_for_sale(address_quarter = districts$quarter_id[idx_row])
    } else {
      s <- search_for_sale(page_url = next_url)
    }
    
    # Persist the raw results page, named by the hash of its URL.
    if (!is.na(write_dir)) {
      dir.create(file.path(write_dir, temp_dir), showWarnings = FALSE)
      xml2::write_html(x = s$content, file = file.path(write_dir, temp_dir, paste0(s$hashed_url, ".html")))
    }
    
    # Progress estimate: assume quarters not yet visited need about as many
    # pages as the average observed so far.
    avg_page_per_quarter <- idx_page / idx_row
    remaining_pages <- avg_page_per_quarter * (nrow(districts) - idx_row) +
      (max(s$meta$pages, 1, na.rm = TRUE) - max(s$meta$current_page, 1, na.rm = TRUE))

    total_pages <- remaining_pages + idx_page
    avg_second_per_page <- as.numeric(difftime(Sys.time(), sts, units = "secs")) / idx_page
    
    text_to_print <-
      paste(
        "Overall:",
        paste0(round(idx_page / total_pages * 100, 2), "% completed,"),
        "ETA:", Sys.time() + (avg_second_per_page * remaining_pages + sleep),
        ">> Quarter", idx_row, "/", nrow(districts), "#",
          districts$town_name[idx_row], "-", districts$quarter_name[idx_row],
          "Page:",
          max(s$meta$current_page, 1, na.rm = TRUE), "/", max(s$meta$pages, 1, na.rm = TRUE),
          sep = " ")
    # Terminal width via `tput cols` where available; fall back to
    # getOption("width") on systems without tput (e.g. Windows).
    width <- tryCatch(
      max(as.integer(system("tput cols", intern = TRUE)), getOption("width")),
      error = function(e) getOption("width"), warning = function(w) getOption("width")
    ) - 1
    cat("\r", strrep(".", width))  # blank out the previous status line
    cat("\r", substr(text_to_print, 1, width))

    # No next page: this quarter is exhausted, move to the next districts row.
    if (is.na(s$next_page_url)) {
      idx_row <- idx_row + 1
      next_url <- NA
    } else {
      next_url <- s$next_page_url
    }
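    # One log row per fetched page: URLs plus the parsed page metadata.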
    l <- dplyr::tibble(
      timestamp = Sys.time(),
      hashed_url = s$hashed_url,
      url = s$url,
      prev_page_url = s$prev_page_url,
      next_page_url = s$next_page_url,
      title = s$meta$title,
      description = s$meta$description,
      paging_size = s$meta$paging_size,
      pages = s$meta$pages,
      current_page = s$meta$current_page,
      results = s$meta$results,
      rank_last_result = s$meta$rank_last_result
    )
    logs <- dplyr::bind_rows(logs, l)
    
    # Rewrite the full log after every page so progress survives interruption.
    if (!is.na(write_dir)) {
      dir.create(file.path(write_dir, temp_dir), showWarnings = FALSE)
      write.csv(x = logs, file = file.path(write_dir, temp_dir, "logs.csv"), na = "", row.names = FALSE)
    }
    
    remove(l)
    contents[[length(contents) + 1]] <- s$content
    # Be polite to the server between requests.
    Sys.sleep(sleep)
  }
  cat("\n")
  if (!is.na(write_dir)) {
    cat(paste0("HTML files and logs can be found in ", file.path(write_dir,  temp_dir), "\n"))
  }
  
  return(list(logs = logs, contents = contents))
}