# R/parse_classifieds.R

#' @title Parse classifieds in tabular form from html of classifieds
#' @description Reads the search results table (`#searchResultsTable`) from
#'   the html of a classifieds page and returns one row per classified.
#'   Column names are taken from the table header cells, lower-cased,
#'   stripped of diacritics and made syntactically valid with `_` as the
#'   separator. Three extra columns are appended to every row:
#'   `classified_url`, `store_url` and `id`.
#' @details Every `<br>` inside a result row gets a sibling node holding a
#'   newline, so that text on both sides of the break does not run together
#'   when the cell text is extracted. This modifies the document behind `x`.
#' @param x html of classifieds
#' @return data frame of classifieds, one row per `tr.searchResultsItem`
#'   element carrying a `data-id` attribute. `classified_url` and `store_url`
#'   are character, with `NA` where the row has no such link.
#' @importFrom magrittr "%>%"
#' @export
parse_classifieds <- function(x) {
  # Look the results table up once; header and rows are both read from it.
  results_table <- x %>%
    rvest::html_node(css = "#searchResultsTable")

  # first row
  header <- results_table %>%
    rvest::html_node("thead") %>%
    rvest::html_nodes("td") %>%
    rvest::html_text() %>%
    stringi::stri_trim_both()

  header_names <- header %>%
    # NOTE(review): the upper -> lower round trip is kept as in the original;
    # presumably it folds locale-specific letters (e.g. dotless i) - confirm.
    toupper() %>%
    tolower() %>%
    stringi::stri_replace_all(replacement = " ", regex = "\\W+") %>%
    stringi::stri_trans_general("nfd; [:nonspacing mark:] remove; nfc") %>%
    stringi::stri_trim_both() %>%
    make.names(unique = TRUE) %>%
    # make.names() separates with "."; the column names use "_" instead.
    stringi::stri_replace_all(replacement = "_", regex = "\\.") %>%
    tolower()

  rows <- results_table %>%
    rvest::html_nodes("tr.searchResultsItem[data-id]")

  base_url <- "https://www.sahibinden.com"

  # Convert a single result row (<tr>) into a one-row data frame.
  row_to_table <- function(row) {
    df <- row %>%
      rvest::html_nodes("td") %>%
      rvest::html_text(trim = TRUE) %>%
      # Collapse any run of whitespace (including the injected newlines).
      stringi::stri_replace_all(replacement = " ", regex = "\\s+") %>%
      as.list() %>%
      as.data.frame(col.names = header_names, stringsAsFactors = FALSE)

    classified_path <- row %>%
      rvest::html_node("td.searchResultsLargeThumbnail a") %>%
      rvest::html_attr("href")
    # Keep the column character even when the link is missing, so its type
    # does not depend on the data.
    df$classified_url <- if (is.na(classified_path)) {
      NA_character_
    } else {
      paste0(base_url, classified_path)
    }

    # html_attr() already yields a character NA when the node or attribute
    # is absent, so the value can be stored as is.
    df$store_url <- row %>%
      rvest::html_node(".titleIcon.store-icon") %>%
      rvest::html_attr("href")

    df$id <- row %>% rvest::html_attr("data-id")
    df
  }

  # Must run before the rows are converted: gives each <br> a whitespace
  # sibling so html_text() does not glue neighbouring lines together.
  br_nodes <- xml2::xml_find_all(rows, ".//br")
  xml2::xml_add_sibling(br_nodes, "p", "\n")

  lapply(rows, row_to_table) %>% dplyr::bind_rows()
}
# bhakyuz/sahibinden documentation built on June 12, 2019, 2:28 p.m.