# scripts/tests2.R

# Sample order pages saved locally for testing. Every read below assigns to
# the same input_html variable, so whichever line ran last is the page in use.
# NOTE(review): "none" / "one" / "many" presumably refer to how many modifier
# tables the page contains -- confirm.

# none
input_html <- read_html("/Users/Ross/Dropbox (Personal)/TrendLock/data/scrape tests/test html/341937.html")
# one
input_html <- read_html("/Users/Ross/Dropbox (Personal)/TrendLock/data/scrape tests/test html/1072656.htm")
# many
input_html <- read_html("/Users/Ross/Dropbox (Personal)/TrendLock/data/scrape tests/test html/1072537.htm")

# old page
input_html <- read_html("/Users/Ross/Dropbox (Personal)/TrendLock/data/scrape tests/test html/1074386.htm")

# collapsing tools ----

# Pull every item / modifier table out of one parsed order page.
#
# input_html: a parsed HTML document (e.g. the result of read_html()).
# Returns a list with one tibble per matched table. In each tibble the first
# column is renamed to "Prices" and an Order.No column is added holding the
# value returned by order_id_single() for the page.
items_all <- function(input_html) {
  num <- order_id_single(input_html)

  tables <- html_table(
    html_nodes(input_html, "div .order_item_modifiers, .order_item_numbers")
  )

  map(tables, function(tbl) {
    # Only the first header is replaced; the remaining names are kept as-is.
    renamed <- set_colnames(tbl, c("Prices", colnames(tbl)[-1]))
    mutate(as_tibble(renamed), Order.No = num)
  })
}

# Count the tables matched by the "div .order_item_modifiers" selector.
#
# input_html: a parsed HTML document.
# Returns the number of tables found (0 when the selector matches nothing).
mod_test <- function(input_html) {
  modifier_nodes <- html_nodes(input_html, "div .order_item_modifiers")
  length(html_table(modifier_nodes))
}




# Collapse every saved order page to one row per order (all of the page's
# tables nested into a single cell) and stack the rows into mas_df.
# Intended as a first scraping step that yields hard-to-break data.
mas_df <- NULL
for (n in list.files(
  "/Users/Ross/Dropbox (Personal)/TrendLock/data/scrape tests/test html/",
  full.names = TRUE
)) {
  # Progress trace: the file currently being processed.
  print(n)
  # could add order numbers here?

  # One tibble row per table (list-column `item`), then nested down to a
  # single row for the whole page.
  df <- n %>%
    read_html() %>%
    items_all() %>%
    list(item = .) %>%
    as_tibble() %>%
    nest(item)
  mas_df <- bind_rows(mas_df, df)
}

# ideal pulled file
mas_df




# And back the other direction =====
# Unpack mas_df again: each of the three flatten() calls strips one level of
# list nesting, then every remaining element is flattened once more and
# wrapped in a tibble. The result is a list of tibbles stored in check2.
# NOTE(review): presumably the three levels are the nested row, its data
# tibble and the `item` list-column -- confirm against mas_df's structure.
check2 <- mas_df %>%
  flatten() %>%
  flatten() %>%
  flatten() %>%
  map(~flatten(.)) %>%
  map(~tibble(.))

# Merge each modifier table into the table that precedes it, then stack
# everything into one data frame whose columns are all character.
#
# master_df: a list of data frames such as the one returned by items_all();
#   every table is expected to carry an Order.No column (used as join key).
# Returns a single data frame built with bind_rows().
#
# A table counts as a modifier table when its second column is named
# "Modifier". The decision is made from master_df itself, so the function no
# longer depends on a global `input_html` being loaded and in sync.
#
# Known limitation (unchanged): removing a merged table shifts the list, so
# the table directly after it is not examined. A second consecutive modifier
# table is therefore left unmerged.
expand_items <- function(master_df) {
  # Start at 2: a modifier needs a preceding table to merge into.
  # seq_len(...)[-1] is empty for lists shorter than 2, whereas 2:length()
  # would count downwards and index position 0.
  for (n in seq_len(length(master_df))[-1]) {
    print(n)
    # The list shrinks as modifiers are merged; skip indices past the end.
    if (length(master_df) < n) {
      next
    }

    df <- master_df[[n]]
    # identical() is FALSE (not an error) when there is no second column.
    if (identical(colnames(df)[2], "Modifier")) {
      m <- n - 1
      df_lag <- master_df[[m]]
      # NOTE(review): this relies on the modifier table having a column named
      # exactly "Price"; it looks like a row count is intended -- confirm.
      df_len <- length(df$Price)

      if (df_len > 1) {
        # Several modifiers: rename so the columns do not clash with the
        # item table, then repeat the item row once per modifier.
        colnames(df) <- c(
          "Add.Sub", "Mod", "Mod.Cost", "Mod.Price", "Order.No"
        )
        df_out <- right_join(df_lag, df, by = "Order.No")
      } else {
        df_out <- bind_cols(df_lag, df)
      }
      master_df[[m]] <- df_out
      master_df[[n]] <- NULL
    }
  }

  # Coerce every column to character so tables with differing column types
  # can be row-bound.
  master_df %>%
    map(~lapply(., as.character)) %>%
    bind_rows()
}


# Single-page version of the per-order collapse: every table from input_html
# goes into the list-column `item`, which is then nested into one row.
look <- nest(
  as_tibble(list(item = items_all(input_html))),
  item
)

# flatten out items


# flatten out mods from items


# All tables from input_html, passed through expand_items() to give one
# data frame.
df <- expand_items(items_all(input_html))
# rosseji/scraper documentation built on June 7, 2017, 12:59 p.m.