## Load package on setup
devtools::load_all()
## Load used packages
library(magrittr)
library(knitr)
library(plotly)

## Path to locally stored csv. files
dataset_filepath <- "~/CloudStation/Projekte/r_projects/GoIT/retailrocket-recommender-system-dataset/"

## Convinience functions
fn <- function(x) prettyNum(x, big.mark = ",", decimal.mark = ".")

## data.fram convinience functions
preview_no <- 8
caption_no <- 0
caption_text <- function(str) {
  caption_no <<- caption_no + 1
  paste0("Table ", caption_no, ". ", str)
}

The data

Preperation

ds <- load_retailrocket_data(dataset_filepath)
properties <- ds$properties
category <- ds$category
events <- ds$events

Dataset loaded from Kaggle/retailrocket challange. Downloaded locally and loaded via load_retailrocket_data(filepath)command. The returned list object holds three data.framesnamely r names(ds). The following sections describe those datatables in greater detail.

rm(ds)
events[, timestamp := as.POSIXct(timestamp/1000, origin = '1970-01-01')]
properties[, timestamp := as.POSIXct(timestamp/1000, origin = '1970-01-01')]

Properties

head(properties, n = preview_no) %>% 
  kable(caption = caption_text(paste("Showing", nrow(.), "rows out of", nrow(properties) %>% fn, "from the properties datatable")))

The properties datatable assignes r properties[, length(unique(property))] %>% fn different properties to the r properties[, length(unique(itemid))] %>% fn unique items over time. All properties are hashed except of the following:

The rest of the categories are useless, so lets drop them.

properties <- properties[property %chin% c("categoryid", "available")]
properties[, value := as.numeric(value)]

Category

head(category, n = preview_no) %>% 
  kable(caption = caption_text(paste("Showing", nrow(.), "rows out of", nrow(category) %>% fn, "from the category datatable")), align = c("c", "c"))

The category datatable holds hierachial category information in a flat table format mapping item-categories to parent categories over multiple levels.
Following checks are performed:

## Check if each categiory has a unique parent relation
category[, .N, by = .(categoryid)][, max(N)] == 1

## Count the number of top-level categories
category[is.na(parentid), .N]

## Count the number of lowest-level categories
category[!category, .N, on = c("categoryid" = "parentid")]

To investigate in the category hierachy we transform the category datatable to a tree structure.

cat_tree <- category[, .(categoryid, parentid = ifelse(is.na(parentid), -1, parentid))] %>% 
  data.tree::FromDataFrameNetwork() %>% 
  data.tree::ToDataFrameTable("pathString", "name") %>% 
  as.data.table %>% 
  .[, categoryPath := stringr::str_replace(pathString, "^-1/", "")] %>% 
  .[, ultimate_category := as.integer(stringr::str_extract(categoryPath, "^\\d+"))] %>% 
  .[, depth := stringr::str_count(pathString, "/")] %>% 
  .[, categoryid := as.integer(name)] %>% 
  .[, .(categoryid, ultimate_category, depth, categoryPath)]

## Maximum depth of catergory tree
(max_depth <- cat_tree[, max(depth)])

## Size of category groups
print(cat_tree[, .N, by = ultimate_category][order(-N)])

split_fill <- function(col) transpose(lapply(strsplit(col, "/"), function(v) c(v, rep(v[length(v)], max_depth - length(v)))))
cat_tree[, paste0("Level_", 1:6) := split_fill(categoryPath)]

print(cat_tree)

Events

head(events)

events[, range(timestamp)]

In the event data set we find r nrow(events) %>% fn events from r events[, length(unique(visitorid))] %>% fn unique customers.

p1 <- plot_ly(events[, .N, by = event][order(-N)], x = ~factor(event, levels = event), y = ~N) %>% 
  add_bars() %>% 
  layout(
    xaxis = list(title = "Count of event"), 
    yaxis = list(title = "")
  )

p2 <- events[event == "transaction", .(tn = .N), by = .(visitorid)][, .N, by = tn][order(tn)] %>% 
  plot_ly(x = ~tn, y = ~N) %>% 
  add_bars() %>% 
  layout(
    xaxis = list(type = "log", title = "Count of customers per number of transactions"),
    yaxis = list(type = "log")
  )

subplot(p1, p2) %>% hide_legend()

As the plot shows a vast majority of events are views. In total we have only r events[event == "transaction", .N] %>% fn transactions.

Research Question

The ultimate goal of the usecase idea, was to propose/incentivice items to a visitor based on training data and machine learning. The original question is hard to answer without any additinal research on the impact of proposal/incentive.

So we need to reformulate our goal into a question we may be able to answer using the available dataset.

Ideas

Model

Data Preperation

dates <- events[, unique(as.Date(timestamp))]
items <- events[, unique(itemid)]
visit <- events[, unique(visitorid)]

dates_items <- CJ(t = dates, itemid = items, property = c("available", "categoryid"))

property_t <- properties[order(timestamp), .SD[1], by = .(itemid, property, t = as.Date(timestamp))][dates_items, on = c("t", "itemid", "property")][order(t)][, .(t, value, value_locf = zoo::na.locf(value, na.rm = FALSE)), by = .(itemid, property)]
property_t <- dcast(property_t, t + itemid ~ property, value.var = "value_locf", fun.aggregate = function(x) x[1])

events_t <- dcast(events[, .N, by = .(t = as.Date(timestamp), visitorid, itemid, event)], t + visitorid + itemid ~ event, value.var = "N", fun.aggregate = sum)
will_buy <- events_t[order(t, decreasing = TRUE)][, ft := cumsum(transaction) > 0, by = .(visitorid)][, .(Y = as.numeric(max(ft))), by = c("t", "visitorid")]

## categoryid is not sufficient
data <- cat_tree[property_t[events_t, on = c("itemid", "t")], .(t, itemid, available, categoryid, ultimate_category, visitorid, addtocart, transaction, view), on = "categoryid"]
data_response <- will_buy[data, on = c("t", "visitorid")]

data_model <- data_response[available > 0]

data_wide <- dcast(data_model, t + visitorid + Y ~ ultimate_category, value.var = c("addtocart", "transaction", "view"), fun.aggregate = sum)
dim(data_wide)

sum(is.na(data_wide))

data_sample <- data_wide[1:10000]

Train baseline model

library(caret)

trainControl <- trainControl(
  method = "cv",
  number = 5,

)

mod_logit <- train(
  data_sample[, -(1:3)], as.factor(data_sample[[3]]),
  #data = data_wide,
  method="glm", family= binomial(), 
  metric = "Accuracy",
  preProcess = c("zv")
  ##trainControl = trainControl,
  ##verbose = TRUE
  )

mod_tree <- train(
  Y ~ ., 
  data = data_wide,
  method = "ranger",
  metric = "Accuracy",
  preProcess = c("zv"),
  trainControl = trainControl,
  verbose = TRUE
  )


mifek/bancommender documentation built on Nov. 25, 2019, 11:40 a.m.