# File: R/download_adverts.R

#' Extract the advert links from a site's listing ("main") page
#'
#' Downloads the listing page at \code{url} and pulls out the \code{href}
#' attribute of the advert links, using CSS selectors specific to each
#' supported site.
#'
#' @param url url of the listing page to scan
#' @param page website the url belongs to - one of "gumtree", "tradingpost",
#'   or "petrescue"
#'
#' @return character vector of the \code{href} values found on the page, as
#'   returned by \code{rvest::html_attr}. The values are returned exactly as
#'   they appear in the page; nothing is prepended to them here.
#' @export
#' @importFrom magrittr "%>%"
#'
#' @examples
#'get_ad_urls("http://www.gumtree.com.au/s-dogs-puppies/page-01/c18434", "gumtree")
get_ad_urls <- function(url, page) {
  # Reject unsupported sites before touching the network, so a bad `page`
  # fails immediately instead of after (or instead of) a download.
  # isTRUE() also rejects NA and anything that is not a single value.
  known_sites <- c("gumtree", "tradingpost", "petrescue")
  if (!isTRUE(page %in% known_sites)) {
    stop("Do not know that site")
  }

  main_page <- xml2::read_html(url)

  if (page == "petrescue") {
    # Dog and cat listings use different CSS classes; which one applies is
    # decided by whether the url contains "dog".
    if (stringr::str_detect(url, "dog")) {
      listing_class <- ".dog-listing"
    } else {
      listing_class <- ".cat-listing"
    }
    # html_node() (singular) is used here, unlike the other sites: it looks
    # up a single "a" per listing rather than every "a" inside it.
    urls <- main_page %>%
      rvest::html_nodes(listing_class) %>%
      rvest::html_node("a") %>%
      rvest::html_attr("href")
  } else {
    # gumtree and tradingpost: take every link inside each advert container.
    # (An earlier tradingpost version selected ".search-item-link" and
    # de-duplicated the result with unique().)
    if (page == "gumtree") {
      container_class <- ".rs-ad-info"
    } else {
      container_class <- ".thumbnail-item"
    }
    urls <- main_page %>%
      rvest::html_nodes(container_class) %>%
      rvest::html_nodes("a") %>%
      rvest::html_attr("href")
  }

  urls
}

#' Gets all the ads for a given webpage and saves them to a folder
#'
#' Looks up the url templates for the \code{page}/\code{animal} combination,
#' walks listing pages 1 to \code{n}, downloads every advert linked from each
#' listing page and (optionally) writes the raw html to \code{dir}.
#'
#' @param n number of listing pages to get; 0 fetches nothing
#' @param page gumtree, tradingpost or petrescue
#' @param animal cat or dog
#' @param dir folder to save pages in
#' @param save_html boolean if true save each advert to folder \code{dir} as
#'   \code{<page>-<animal>-<i>-<j>-<date>.html}, where i is the listing page
#'   number and j the position of the advert on it. If false nothing is
#'   written; the adverts are only shown on screen when \code{trace} is true
#' @param trace if true prints each link, the full advert url and the
#'   downloaded html as it goes
#'
#' @return number of adverts downloaded. An advert is counted whether or not
#'   it was saved.
#' @export
#' @importFrom magrittr "%>%"
#'
#' @examples
#' get_ads(1, "gumtree", "dog", "~/Desktop/")
get_ads <- function(n, page, animal, dir, save_html = TRUE, trace = TRUE) {
  # Get site information. `lookup` is not defined in this file; the code
  # below reads its columns webpage, type, stem and ad_stem.
  site_info <- lookup %>%
    dplyr::filter(webpage == page, type == animal)

  # Exactly one row must describe this site/animal combination.
  if (nrow(site_info) == 0) {
    stop("No info for that page animal combo")
  }
  if (nrow(site_info) > 1) {
    print(site_info)
    stop("Too many rows")
  }

  counter <- 0
  stem <- site_info$stem

  # seq_len()/seq_along() rather than 1:n / 1:length(urls): the latter count
  # down through c(1, 0) on empty input, which would request a bogus
  # "<ad_stem>NA" advert and count it.
  for (i in seq_len(n)) {
    # The listing-page url is the stem with "XX" replaced by the page
    # number; the replacement is passed as a string, as stringr expects.
    url <- stringr::str_replace(stem, "XX", as.character(i))

    # Listing pages that do not exist are skipped.
    if (!RCurl::url.exists(url)) {
      next
    }

    # get each url for advert on main page
    urls <- get_ad_urls(url, page = page)
    for (j in seq_along(urls)) {
      if (trace) {
        print(urls[j])
      }
      # The scraped link is appended to the site's advert stem.
      advert <- paste0(site_info$ad_stem, urls[j])
      if (trace) {
        print(advert)
      }

      # Get ad and save
      page_html <- RCurl::getURL(advert)
      if (trace) {
        print(page_html)
      }
      filepath <- file.path(
        dir,
        paste0(page, "-", animal, "-", i, "-", j, "-",
               lubridate::today(), ".html")
      )
      # Count every downloaded advert, saved or not.
      counter <- counter + 1
      if (save_html) {
        writeLines(page_html, filepath)
      }
    }
  }

  return(counter)
}
# Source: jonotuke/adParser documentation built on May 19, 2019, 8:34 p.m.