R/parse_meta.R

#' @title Parse metadata from html of classifieds
#' @description Get useful metadata from the parsed html of a classifieds
#'   listing page: title, description, pagination links, canonical url,
#'   number of results and paging information.
#' @param x html of classifieds, i.e. a parsed document that can be queried
#'   with \code{rvest::html_node()} / \code{rvest::html_nodes()}.
#' @return A named list of metadata with the elements:
#'   \describe{
#'     \item{title}{Text of the \code{<title>} element.}
#'     \item{description}{\code{content} of the description meta tag.}
#'     \item{prev_path, next_path}{\code{href} of the \code{rel=prev} /
#'       \code{rel=next} links, \code{NA} when absent.}
#'     \item{canonical_path}{\code{content} of the \code{x-canonical-url}
#'       meta tag.}
#'     \item{results}{Number of results; \code{0} when the page states that
#'       no classified was found, \code{NA} when it cannot be determined.}
#'     \item{pages}{First number found in the \code{p.mbdef} element.}
#'     \item{paging_size}{Numeric value(s) read from the sort-size menu,
#'       \code{NA} when the menu is absent.}
#'     \item{current_page}{Numeric value of the \code{.currentPage} element.}
#'     \item{rank_first_result, rank_last_result}{Rank of the first / last
#'       result shown on this page, capped by \code{results}; \code{NA} when
#'       neither paging information nor the result count is available.}
#'   }
#' @importFrom magrittr "%>%"
#' @export
parse_meta <- function(x) {

  title <- x %>%
    rvest::html_node("head title") %>%
    rvest::html_text()
  description <- x %>%
    rvest::html_nodes("head meta[name=description]") %>%
    rvest::html_attr("content")

  prev_path <- x %>%
    rvest::html_nodes("head link[rel=prev]") %>%
    rvest::html_attr("href")
  if (length(prev_path) == 0) prev_path <- NA_character_
  next_path <- x %>%
    rvest::html_nodes("head link[rel=next]") %>%
    rvest::html_attr("href")
  if (length(next_path) == 0) next_path <- NA_character_
  canonical_path <- x %>%
    rvest::html_nodes("head meta[name=x-canonical-url]") %>%
    rvest::html_attr("content")

  result_nodes <- x %>% rvest::html_nodes("div.result-text")

  # The result count is the digits of the untitled <span> inside the
  # result-text block; every non-digit character is stripped before parsing.
  results <- result_nodes %>%
    xml2::xml_find_all(".//span[not(@title)]") %>%
    rvest::html_text() %>%
    stringi::stri_replace_all_regex(pattern = "\\D", replacement = "") %>%
    as.numeric()

  # grepl() yields one value per result-text node, i.e. logical(0) when the
  # block is missing and a longer vector when it occurs several times. any()
  # collapses that to the single TRUE/FALSE the conditions below require.
  has_no_result <- any(
    grepl(pattern = "ilan bulunamad", rvest::html_text(result_nodes))
  )

  if (length(results) == 0) {
    results <- if (has_no_result) 0 else NA_real_
  }

  pages <- x %>%
    rvest::html_node("p.mbdef") %>%
    rvest::html_text() %>%
    stringi::stri_extract_first_regex("\\d+") %>%
    as.numeric()
  # Fallback to a single page is currently disabled:
  # if (is.na(pages) && !is.na(results)) pages <- 1

  current_page <- x %>%
    rvest::html_node(".currentPage") %>%
    rvest::html_text() %>%
    as.numeric()
  # Fallback to the first page is currently disabled:
  # if (is.na(current_page) && !is.na(results)) current_page <- 1

  paging_size <- x %>%
    rvest::html_nodes("ul.faceted-sort-buttons.sort-size-menu") %>%
    rvest::html_nodes("span") %>%
    rvest::html_text() %>%
    stringi::stri_trim_both() %>%
    as.numeric()
  if (length(paging_size) == 0) paging_size <- NA_real_

  # min(..., na.rm = TRUE) returns Inf (with a warning) when every candidate
  # is NA, so report NA instead when nothing is known about the page.
  min_or_na <- function(...) {
    candidates <- c(...)
    if (all(is.na(candidates))) {
      return(NA_real_)
    }
    min(candidates, na.rm = TRUE)
  }

  # Ranks are capped by the total number of results so that a partially
  # filled last page does not report ranks beyond the result count.
  rank_first_result <- min_or_na(paging_size * (current_page - 1) + 1, results)
  rank_last_result <- min_or_na(paging_size * current_page, results)

  list(
    title = title,
    description = description,
    prev_path = prev_path,
    next_path = next_path,
    canonical_path = canonical_path,
    results = results,
    pages = pages,
    paging_size = paging_size,
    current_page = current_page,
    rank_first_result = rank_first_result,
    rank_last_result = rank_last_result
  )
}
bhakyuz/sahibinden documentation built on June 12, 2019, 2:28 p.m.