# R/parser_fn.R

#' Extract the text of the nodes matching a CSS selector
#'
#' Selects every node of \code{main_page} that matches \code{css}, extracts
#' its text, trims surrounding whitespace and passes the result through
#' \code{clean_string}.
#'
#' @param main_page html object from rvest
#' @param css CSS selector identifying the nodes to read
#'
#' @return A character vector with one cleaned string per matching node, or
#'   \code{NA} when no node matches or the lookup raises an error.
#' @importFrom magrittr "%>%"
#' @export
#'
#' @examples
#' page <- xml2::read_html("inst/rawdata/webpages/gumtree-cat-1-1-2015-10-08.html")
#' get_attribute(page, "#ad-title")
get_attribute <- function(main_page, css) {
  # Any failure while querying the page is mapped to NULL so that the
  # "error" and "nothing matched" cases share the single guard below.
  text <- tryCatch(
    {
      nodes <- rvest::html_nodes(main_page, css)
      stringr::str_trim(rvest::html_text(nodes))
    },
    error = function(e) NULL
  )
  if (length(text) == 0) {
    return(NA)
  }
  clean_string(text)
}
#' Get latitude and longitude from a gumtree page
#'
#' Reads the \code{data-lat} and \code{data-lng} attributes of the first node
#' matching the \code{.c-pointer} selector.
#'
#' @param main_page html object
#'
#' @return A vector of length two, \code{c(lat, long)}. The values are
#'   returned as read from the attributes (they are not converted to
#'   numeric); an element is \code{NA} when the node or attribute is missing
#'   or the lookup raises an error.
#' @export
#' @importFrom magrittr "%>%"
#'
#' @examples
#' page <- xml2::read_html("inst/rawdata/webpages/gumtree-cat-1-1-2015-10-08.html")
#' get_lat_long(page)
get_lat_long <- function(main_page) {
  # Look the pointer nodes up once; NULL marks a failed lookup.
  pointers <- tryCatch(
    rvest::html_nodes(main_page, ".c-pointer"),
    error = function(e) NULL
  )

  # Value of `attr_name` on the first pointer node, or NA when unavailable.
  first_attr <- function(attr_name) {
    if (is.null(pointers)) {
      return(NA)
    }
    value <- tryCatch(
      rvest::html_attr(pointers, attr_name)[1],
      error = function(e) NA
    )
    if (length(value) == 0) NA else value
  }

  c(first_attr("data-lat"), first_attr("data-lng"))
}
#' Get date listed from gumtree
#'
#' Returns the trimmed text of the first \code{dd} element found inside the
#' nodes matching \code{.ad-attribute}. On gumtree ad pages this entry is
#' expected to hold the listing date (assumption based on the function's
#' purpose; the text is returned as-is and is not parsed into a Date).
#'
#' @inheritParams get_lat_long
#'
#' @return A single string, or \code{NA} when no such element exists or the
#'   lookup raises an error.
#' @export
#'
#' @examples
#' page <- xml2::read_html("inst/rawdata/webpages/gumtree-cat-1-1-2015-10-08.html")
#' get_date_listed(page)
get_date_listed <- function(main_page) {
  first_value <- tryCatch(
    {
      attribute_nodes <- rvest::html_nodes(main_page, ".ad-attribute")
      dd_nodes <- rvest::html_nodes(attribute_nodes, "dd")
      # Indexing with [1] yields NA when there is no matching element.
      stringr::str_trim(rvest::html_text(dd_nodes))[1]
    },
    error = function(e) NA
  )
  if (length(first_value) == 0) {
    return(NA)
  }
  first_value
}
#' Clean a string
#'
#' Removes carriage returns and line feeds, collapses every run of two or
#' more spaces into a single space, and strips all trailing spaces. A single
#' leading space is kept. The function is vectorised over \code{str}.
#'
#' @param str a character string (or character vector) to be cleaned
#'
#' @return str without \\r and \\n or excess space
#' @export
#'
#' @examples
#' clean_string("bob\r\n")
#' clean_string("too    many   spaces   ")
clean_string <- function(str) {
  # Line breaks are dropped outright (not replaced by a space).
  str <- gsub("[\r\n]", "", str)
  # Match the whole run of spaces: replacing fixed pairs in a single pass
  # would leave double spaces behind in runs of three or more.
  str <- gsub(" {2,}", " ", str)
  # Strip every trailing space, not just one.
  sub(" +$", "", str)
}
# jonotuke/adParser documentation built on May 19, 2019, 8:34 p.m.