# R/parse_pages.R

#' Parse a saved Trading Post advert page
#'
#' Reads the HTML file and collects the page title, the first element
#' returned for `.collapse-details`, and the `.spec-list` definition list.
#' The `dt` texts of that list are used as column names for the `dd` texts.
#'
#' @param file Trading Post advert HTML file, passed to `xml2::read_html()`.
#'
#' @return A data frame with `title`, one column per `.spec-list` entry,
#'   `description`, and `file`.
#' @importFrom magrittr "%>%"
#' @export
#'
#' @examples
#' parse_trading("inst/rawdata/webpages/tradingpost-cat-1-1-2015-10-08.html")
parse_trading <- function(file) {
  page <- xml2::read_html(file)

  # Text of every node on this page that matches a CSS selector.
  node_text <- function(css) {
    rvest::html_text(rvest::html_nodes(page, css))
  }

  # These locals keep their names: data.frame() below uses them as the
  # column names.
  title <- get_attribute(page, "title")
  description <- get_attribute(page, ".collapse-details")[1]

  spec_names <- node_text(".spec-list dt")
  # Transpose so the `dd` texts form a single row, one column per entry.
  spec_row <- data.frame(t(node_text(".spec-list dd")))
  colnames(spec_row) <- spec_names

  data.frame(title, spec_row, description, file)
}
#' Parse a saved Gumtree advert page
#'
#' Reads the HTML file, looks up each advert field by its CSS id with
#' `get_attribute()`, and adds the coordinates from `get_lat_long()` and the
#' listing date from `get_date_listed()`.
#'
#' @param file Gumtree advert HTML file, passed to `xml2::read_html()`.
#'
#' @return A tibble with columns `title`, `dob`, `offered`, `price`,
#'   `location`, `phone`, `description`, `lat`, `long`, `date_listed`, and
#'   `file`.
#' @export
#'
#' @examples
#' parse_gumtree("inst/rawdata/webpages/gumtree-dog-1-1-2015-10-08.html")
parse_gumtree <- function(file) {
  main_page <- xml2::read_html(file)

  title <- get_attribute(main_page, "#ad-title")
  # The dot inside these ids is escaped so the selector does not read it as
  # the start of a class name.
  dob <- get_attribute(main_page, "#c-dogs_puppies\\.birthdate_tdt")
  offered <- get_attribute(main_page, "#c-dogs_puppies\\.petofferedby_s")
  price <- get_attribute(main_page, "#ad-price")
  location <- get_attribute(main_page, "#ad-map")
  phone <- get_attribute(main_page, "#ad-phone")
  description <- get_attribute(main_page, "#ad-description")

  # NOTE(review): assumes get_lat_long() returns latitude first, then
  # longitude -- confirm against its definition.
  lat_long <- get_lat_long(main_page)
  date_listed <- get_date_listed(main_page)

  # dplyr::data_frame() is deprecated; dplyr::tibble() is its replacement and
  # builds the same object, with unnamed arguments named after the variables.
  dplyr::tibble(
    title, dob, offered, price,
    location, phone, description,
    lat = lat_long[1],
    long = lat_long[2],
    date_listed, file
  )
}
#' Parse a saved PetRescue advert page
#'
#' Reads the HTML file and collects the `.species`, `.located_in`, and
#' `.personality` fields along with the `.pets-details` definition list.
#' The `dt` texts of that list are used as column names for the `dd` texts.
#'
#' @param file PetRescue advert HTML file, passed to `xml2::read_html()`.
#'
#' @return A data frame with `title`, `location`, `description`, and one
#'   column per `.pets-details` entry.
#' @export
#' @importFrom magrittr "%>%"
#'
#' @examples
#' parse_petrescue("inst/rawdata/webpages/petrescue-cat-1-1-2015-10-08.html")
#' parse_petrescue("inst/rawdata/webpages/petrescue-dog-1-1-2015-10-08.html")
parse_petrescue <- function(file) {
  page <- xml2::read_html(file)

  # Text of every node on this page that matches a CSS selector.
  node_text <- function(css) {
    rvest::html_text(rvest::html_nodes(page, css))
  }

  # These locals keep their names: data.frame() below uses them as the
  # column names.
  title <- get_attribute(page, ".species")
  location <- get_attribute(page, ".located_in")
  description <- get_attribute(page, ".personality")

  detail_names <- node_text(".pets-details dt")
  # Transpose so the `dd` texts form a single row, one column per entry.
  detail_row <- data.frame(t(node_text(".pets-details dd")))
  colnames(detail_row) <- detail_names

  data.frame(title, location, description, detail_row)
}
# jonotuke/adParser documentation built on May 19, 2019, 8:34 p.m.