#' @title Parse metadata from html of classifieds
#' @description Get useful metadata from html files: head tags (title,
#'   description, prev/next/canonical links), the number of results, paging
#'   information and the ranks of the first and last result on the page.
#' @param x html of classifieds, i.e. a parsed document that can be queried
#'   with \code{rvest::html_node()} and \code{rvest::html_nodes()}.
#' @return A named list of metadata with the elements:
#'   \describe{
#'     \item{title}{Text of \code{head title}.}
#'     \item{description}{\code{content} of
#'       \code{head meta[name=description]}; zero-length if the tag is
#'       absent.}
#'     \item{prev_path}{\code{href} of \code{head link[rel=prev]}, or
#'       \code{NA} if absent.}
#'     \item{next_path}{\code{href} of \code{head link[rel=next]}, or
#'       \code{NA} if absent.}
#'     \item{canonical_path}{\code{content} of
#'       \code{head meta[name=x-canonical-url]}; zero-length if the tag is
#'       absent.}
#'     \item{results}{Digits found in the untitled \code{span}s of
#'       \code{div.result-text}, as numeric. \code{0} when no count is found
#'       but the "no result" message is present, \code{NA} when neither is
#'       found.}
#'     \item{pages}{First run of digits in \code{p.mbdef}, as numeric.}
#'     \item{paging_size}{Numeric value of the \code{span}s inside
#'       \code{ul.faceted-sort-buttons.sort-size-menu}, or \code{NA}.}
#'     \item{current_page}{Numeric value of the \code{.currentPage} node.}
#'     \item{rank_first_result}{Smallest non-missing value among
#'       \code{paging_size * (current_page - 1) + 1} and \code{results};
#'       \code{NA} if all of them are missing.}
#'     \item{rank_last_result}{Smallest non-missing value among
#'       \code{paging_size * current_page} and \code{results}; \code{NA} if
#'       all of them are missing.}
#'   }
#' @importFrom magrittr "%>%"
#' @export
parse_meta <- function(x) {
  # Smallest non-missing value, or NA when every candidate is missing.
  # A plain min(..., na.rm = TRUE) would warn and return Inf in that case.
  min_or_na <- function(...) {
    values <- c(...)
    if (all(is.na(values))) {
      return(NA_real_)
    }
    min(values, na.rm = TRUE)
  }

  title <- x %>%
    rvest::html_node("head title") %>%
    rvest::html_text()

  description <- x %>%
    rvest::html_nodes("head meta[name=description]") %>%
    rvest::html_attr("content")

  prev_path <- x %>%
    rvest::html_nodes("head link[rel=prev]") %>%
    rvest::html_attr("href")
  if (length(prev_path) == 0) {
    prev_path <- NA_character_
  }

  next_path <- x %>%
    rvest::html_nodes("head link[rel=next]") %>%
    rvest::html_attr("href")
  if (length(next_path) == 0) {
    next_path <- NA_character_
  }

  canonical_path <- x %>%
    rvest::html_nodes("head meta[name=x-canonical-url]") %>%
    rvest::html_attr("content")

  result_nodes <- x %>%
    rvest::html_nodes("div.result-text")

  # Keep only the digits of the untitled spans inside the result block.
  results <- result_nodes %>%
    xml2::xml_find_all(".//span[not(@title)]") %>%
    rvest::html_text() %>%
    stringi::stri_replace_all_regex(pattern = "\\D", replacement = "") %>%
    as.numeric()

  # any() collapses the per-node matches into one TRUE/FALSE. Without it a
  # page with no (or several) div.result-text nodes yields a logical vector
  # whose length is not one, which breaks the scalar `if` conditions below.
  has_no_result <- any(
    grepl("ilan bulunamad", rvest::html_text(result_nodes), fixed = TRUE)
  )
  if (length(results) == 0) {
    results <- if (has_no_result) 0 else NA_real_
  }

  pages <- x %>%
    rvest::html_node("p.mbdef") %>%
    rvest::html_text() %>%
    stringi::stri_extract_first_regex("\\d+") %>%
    as.numeric()
  # if (is.na(pages) && !is.na(results)) pages <- 1

  current_page <- x %>%
    rvest::html_node(".currentPage") %>%
    rvest::html_text() %>%
    as.numeric()
  # if (is.na(current_page) && !is.na(results)) current_page <- 1

  paging_size <- x %>%
    rvest::html_nodes("ul.faceted-sort-buttons.sort-size-menu") %>%
    rvest::html_nodes("span") %>%
    rvest::html_text() %>%
    stringi::stri_trim_both() %>%
    as.numeric()
  if (length(paging_size) == 0) {
    paging_size <- NA_real_
  }

  # Both ranks are capped by the total number of results; missing inputs are
  # ignored, and the rank is NA when nothing is known at all.
  rank_first_result <- min_or_na(paging_size * (current_page - 1) + 1, results)
  rank_last_result <- min_or_na(paging_size * current_page, results)

  list(
    title = title,
    description = description,
    prev_path = prev_path,
    next_path = next_path,
    canonical_path = canonical_path,
    results = results,
    pages = pages,
    paging_size = paging_size,
    current_page = current_page,
    rank_first_result = rank_first_result,
    rank_last_result = rank_last_result
  )
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.