# R/parse_folder.R

#' Folder of ads to data frame
#'
#' Lists the files in \code{dir}, keeps those whose names match \code{page},
#' \code{animal} and \code{date}, parses every remaining file with the parser
#' that belongs to \code{page}, and row-binds the results into one data frame.
#'
#' The three filters are handed to \code{stringr::str_detect()} as patterns,
#' so they are matched against the file names as regular expressions.
#'
#' @param dir folder with advert htmls
#' @param page site to filter on; also selects the parser. One of
#'   \code{"gumtree"}, \code{"tradingpost"} or \code{"petrescue"}; any other
#'   value raises the error "Webpage not known".
#' @param animal animal to filter on
#' @param date date to filter on
#'
#' @return data frame of the parsed adverts, with the columns \code{page},
#'   \code{animal} and \code{date} set to the corresponding argument values.
#'   When no file name matches, the result of binding an empty list is
#'   returned.
#' @export
#'
#' @examples
#' parse_folder(dir = "inst/rawdata/webpages/", page = "gumtree",animal = "dog", date = "2015")
parse_folder <- function(dir, page, animal, date) {
  # Resolve the parser once, before touching the file system. Doing this
  # inside the per-file loop meant an unsupported `page` was only reported
  # when at least one file happened to match; now it always fails.
  # `switch()` evaluates only the selected branch, so the parsers that are
  # not chosen are never looked up.
  parser <- switch(page,
    gumtree = parse_gumtree,
    tradingpost = parse_trading,
    petrescue = parse_petrescue,
    stop("Webpage not known")
  )

  # Narrow the directory listing one filter at a time.
  files <- list.files(dir)
  files <- files[stringr::str_detect(files, page)]
  files <- files[stringr::str_detect(files, animal)]
  files <- files[stringr::str_detect(files, date)]

  # Parse each file and tag its rows with the filter values used.
  parsed <- lapply(files, function(file_name) {
    ad <- parser(file.path(dir, file_name))
    ad$page <- page
    ad$animal <- animal
    ad$date <- date
    ad
  })

  dplyr::bind_rows(parsed)
}
# jonotuke/adParser documentation built on May 19, 2019, 8:34 p.m.