R/strip.R

Defines functions run_strip

#' Strip item-level data out of downloaded order HTML files
#'
#' Lists every file one directory level below `html_files_path`, drops the
#' ones whose file name (minus the `.html` extension) already appears in the
#' `Order.No` column of the feather files found in `stripped_order_path`,
#' and runs `scrape()` on the remaining files in reverse listing order.
#' Scraped rows are written out in batches as feather files.
#'
#' A file is skipped (with a message) when `void_test()` returns 0, when
#' `void_test2()` returns a value greater than 0, or when
#' `staff_meal_test()` returns a zero-length result.
#'
#' The internal order counter starts at 1 and is only advanced for files
#' that are not skipped; a batch is flushed whenever the counter is a
#' multiple of `report_after`, so the first flush happens after
#' `report_after - 1` scraped files.
#'
#' @param report_after Positive number; flush a batch to disk and print
#'   progress each time the order counter reaches a multiple of this value.
#' @param html_files_path Directory whose sub-directories contain the
#'   downloaded order `.html` files. A trailing slash is optional.
#' @param stripped_order_path Directory holding the feather batches from
#'   earlier runs; intermediate batches of this run are also written here.
#' @param items_out_path Directory that receives the final (partial) batch.
#' @param error_report_path CSV file whose row count is printed as the
#'   error count after each flush, if the file exists.
#'
#' @return Invisibly `NULL` when there is nothing left to save at the end;
#'   otherwise the value returned by the final `write_feather()` call.
#'
#' @export
run_strip <- function(report_after = 1000,
                      html_files_path = "/Volumes/Brett/TrendLock Data/",
                      stripped_order_path = "/Users/Ross/Dropbox (Personal)/TrendLock/data/revel/item level/raw/",
                      items_out_path = "/Users/Ross/Dropbox (Personal)/TrendLock/data/items/",
                      error_report_path = "/Users/Ross/Dropbox (Personal)/TrendLock/data/error reports/scrape_errors.csv") {

  stopifnot(
    is.numeric(report_after),
    length(report_after) == 1,
    !is.na(report_after),
    report_after > 0
  )

  # Drop trailing slashes so file.path() rebuilds the same paths whether or
  # not the caller supplied one.
  drop_trailing_slash <- function(path) sub("/+$", "", path)
  html_dir <- drop_trailing_slash(html_files_path)
  cache_dir <- drop_trailing_slash(stripped_order_path)
  final_dir <- drop_trailing_slash(items_out_path)

  # Gets list of all files, one level below html_dir. file.path() (unlike
  # paste()) returns a zero-length result for an empty sub-directory, which
  # keeps the two name vectors the same length.
  subdirs <- list.files(html_dir)
  files_by_dir <- lapply(
    subdirs,
    function(d) list.files(file.path(html_dir, d))
  )
  raw_file_nms_single <- as.character(unlist(files_by_dir, use.names = FALSE))
  raw_file_nms <- as.character(
    unlist(Map(file.path, subdirs, files_by_dir), use.names = FALSE)
  )
  # Only strip a literal ".html" extension at the end of the name.
  raw_file_numbs_single <- sub("\\.html$", "", raw_file_nms_single)

  # Gets order numbers already stripped in earlier runs.
  stripped_files <- list.files(cache_dir, full.names = TRUE)
  stripped_numbs <- NULL
  if (length(stripped_files) > 0) {
    full_stripped <- bind_rows(lapply(stripped_files, read_feather))
    stripped_numbs <- full_stripped$Order.No
  }

  # Filters out already stripped orders; remaining files are processed in
  # reverse listing order.
  pending <- rev(raw_file_nms[!raw_file_numbs_single %in% stripped_numbs])

  batch <- list()   # scraped results not yet written to disk
  counter <- 1
  max_cols <- 0     # never updated here; only echoed in the final message

  for (n in pending) {
    message(n)

    html_response <- read_html(file.path(html_dir, n))
    if (void_test(html_response) == 0) {
      message("Voided Order")
      next
    }
    if (void_test2(html_response) > 0) {
      message("Voided item in order?")
      next
    }
    if (length(staff_meal_test(html_response)) == 0) {
      message("Staff meal?")
      next
    }

    items_contents <- scrape(html_response)
    # Collect in a list and bind once per batch instead of re-copying the
    # whole data frame on every iteration.
    if (!is.null(items_contents)) {
      batch[[length(batch) + 1L]] <- items_contents
    }

    counter <- counter + 1

    if (counter %% report_after == 0) {
      if (length(batch) > 0) {
        item_level_df <- bind_rows(batch)
        print(paste0(
          "Item count is: ", length(item_level_df$Order.No),
          " and the Orders count is ",
          length(unique(item_level_df$Order.No)), " rows long"
        ))
        print("Caching items df...")
        write_feather(
          item_level_df,
          file.path(cache_dir, paste0("item_level_data", counter, ".feather"))
        )
        batch <- list()
      }
      # The error report is a file, so it must be tested with file.exists();
      # dir.exists() is always FALSE for it.
      if (file.exists(error_report_path)) {
        error_counter <- suppressMessages(read_csv(error_report_path))
        print(paste0("Error count is: ", length(error_counter$Order.Num)))
      }
    }
  }

  print(paste0("Finished: saving items df...", "max cols were: ", max_cols))

  # Nothing scraped since the last flush (or no pending files at all):
  # there is no data frame to write.
  if (length(batch) == 0) {
    message("No unsaved items remaining; skipping final write.")
    return(invisible(NULL))
  }

  # NOTE(review): the final batch goes to items_out_path, which is not the
  # directory scanned for already-stripped orders above, so these orders are
  # scraped again on the next run -- confirm this is intended.
  write_feather(
    bind_rows(batch),
    file.path(final_dir, paste0("item_level_data", counter, ".feather"))
  )
}
rosseji/scraper documentation built on June 7, 2017, 12:59 p.m.