#' Takes a main page for given site and returns the advert links on it
#'
#' The site name is validated before the page is downloaded, so an unsupported
#' \code{page} fails immediately without making a network request.
#'
#' @param url main url page (a listing/search-results page)
#' @param page website - gumtree, tradingpost, or petrescue
#'
#' @return character vector holding the \code{href} attribute of each advert
#'   link found on the page
#' @export
#'
#' @examples
#'get_ad_urls("http://www.gumtree.com.au/s-dogs-puppies/page-01/c18434", "gumtree")
get_ad_urls <- function(url, page) {
  # Fail fast: reject unknown sites before fetching anything.
  known_sites <- c("gumtree", "tradingpost", "petrescue")
  if (length(page) != 1L || is.na(page) || !(page %in% known_sites)) {
    stop("Do not know that site")
  }

  main_page <- xml2::read_html(url)

  if (page == "gumtree") {
    # Every anchor inside each advert info container.
    urls <- main_page %>%
      rvest::html_nodes(".rs-ad-info") %>%
      rvest::html_nodes("a") %>%
      rvest::html_attr("href")
  } else if (page == "tradingpost") {
    # An earlier version selected anchors under ".search-item-link" and
    # de-duplicated them; the thumbnail containers are used instead.
    urls <- main_page %>%
      rvest::html_nodes(".thumbnail-item") %>%
      rvest::html_nodes("a") %>%
      rvest::html_attr("href")
  } else {
    # petrescue: the listing class depends on the animal, which is inferred
    # from the url. html_node() (singular) takes only the first anchor of
    # each listing.
    if (stringr::str_detect(url, "dog")) {
      listing_class <- ".dog-listing"
    } else {
      listing_class <- ".cat-listing"
    }
    urls <- main_page %>%
      rvest::html_nodes(listing_class) %>%
      rvest::html_node("a") %>%
      rvest::html_attr("href")
  }

  urls
}
#' Gets all the ads for a given webpage and saves them to a folder
#'
#' Looks up the url templates for the \code{page}/\code{animal} combination in
#' the \code{lookup} table (columns \code{webpage}, \code{type}, \code{stem}
#' and \code{ad_stem}), walks listing pages 1 to \code{n} by substituting the
#' page number for \code{"XX"} in \code{stem}, and downloads every advert link
#' returned by \code{get_ad_urls}. Listing pages whose url does not exist are
#' skipped, as are adverts with a missing link.
#'
#' @param n number of pages to get; \code{0} fetches nothing
#' @param page gumtree, tradingpost or petrescue
#' @param animal cat or dog
#' @param dir folder to save pages in
#' @param save_html boolean if true save each advert to folder \code{dir} as
#'   \code{<page>-<animal>-<page number>-<advert number>-<date>.html}. If false
#'   then nothing is written to disk
#' @param trace if true prints each advert link, the full advert url and the
#'   downloaded html as it gets them
#'
#' @return number of adverts downloaded (counted whether or not they were
#'   saved)
#' @export
#' @importFrom magrittr "%>%"
#'
#' @examples
#' get_ads(1, "gumtree", "dog", "~/Desktop/")
get_ads <- function(n, page, animal, dir, save_html = TRUE, trace = TRUE) {
  # Get site information. Plain subsetting guarantees that `page` and `animal`
  # always mean the function arguments, never columns of the lookup table;
  # which() drops rows where the comparison is NA.
  sites_info <- lookup
  matching_rows <- which(sites_info$webpage == page & sites_info$type == animal)
  site_info <- sites_info[matching_rows, , drop = FALSE]

  # Exactly one lookup row must describe this page/animal combination.
  if (nrow(site_info) == 0) {
    stop("No info for that page animal combo")
  }
  if (nrow(site_info) > 1) {
    print(site_info)
    stop("Too many rows")
  }

  stem <- site_info$stem
  ad_stem <- site_info$ad_stem

  # Set counter
  counter <- 0

  # Get each main page. seq_len() yields no iterations for n = 0, whereas 1:0
  # would visit pages 1 and 0.
  for (i in seq_len(n)) {
    # The replacement is passed as a string; stringr expects a character value.
    url <- stringr::str_replace(stem, "XX", as.character(i))
    if (!RCurl::url.exists(url)) {
      next
    }

    # get each url for advert on main page
    urls <- get_ad_urls(url, page = page)

    # seq_along() yields no iterations when the page has no adverts.
    for (j in seq_along(urls)) {
      # A listing without a link gives NA; "<ad_stem>NA" is not a real advert.
      if (is.na(urls[j])) {
        next
      }
      if (trace) {
        print(urls[j])
      }
      advert <- paste0(ad_stem, urls[j])
      if (trace) {
        print(advert)
      }

      # Get ad and save
      page_html <- RCurl::getURL(advert)
      if (trace) {
        print(page_html)
      }
      filepath <- file.path(
        dir,
        paste0(page, "-", animal, "-", i, "-", j, "-", lubridate::today(), ".html")
      )
      counter <- counter + 1
      if (save_html) {
        writeLines(page_html, filepath)
      }
    }
  }

  counter
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.