In mifek/bancommender: Recommendation system

## Load package on setup
devtools::load_all()
## Load used packages
library(magrittr)
library(knitr)
library(plotly)

## Path to locally stored csv. files
dataset_filepath <- "~/CloudStation/Projekte/r_projects/GoIT/retailrocket-recommender-system-dataset/"

## Convinience functions
fn <- function(x) prettyNum(x, big.mark = ",", decimal.mark = ".")

## data.fram convinience functions
preview_no <- 8
caption_no <- 0
caption_text <- function(str) {
  caption_no <<- caption_no + 1
  paste0("Table ", caption_no, ". ", str)
}

The data

Preperation

ds <- load_retailrocket_data(dataset_filepath)
properties <- ds$properties
category <- ds$category
events <- ds$events

Dataset loaded from Kaggle/retailrocket challange. Downloaded locally and loaded via load_retailrocket_data(filepath)command. The returned list object holds three data.framesnamely r names(ds). The following sections describe those datatables in greater detail.

rm(ds)
events[, timestamp := as.POSIXct(timestamp/1000, origin = '1970-01-01')]
properties[, timestamp := as.POSIXct(timestamp/1000, origin = '1970-01-01')]

Properties

head(properties, n = preview_no) %>% 
  kable(caption = caption_text(paste("Showing", nrow(.), "rows out of", nrow(properties) %>% fn, "from the properties datatable")))

The properties datatable assignes r properties[, length(unique(property))] %>% fn different properties to the r properties[, length(unique(itemid))] %>% fn unique items over time. All properties are hashed except of the following:

categoryid: Category of item. Categories are arranged in a hierachial structure. See Category section.
available: Number of available item in store.

The rest of the categories are useless, so lets drop them.

properties <- properties[property %chin% c("categoryid", "available")]
properties[, value := as.numeric(value)]

Category

head(category, n = preview_no) %>% 
  kable(caption = caption_text(paste("Showing", nrow(.), "rows out of", nrow(category) %>% fn, "from the category datatable")), align = c("c", "c"))

The category datatable holds hierachial category information in a flat table format mapping item-categories to parent categories over multiple levels.
Following checks are performed:

## Check if each categiory has a unique parent relation
category[, .N, by = .(categoryid)][, max(N)] == 1

## Count the number of top-level categories
category[is.na(parentid), .N]

## Count the number of lowest-level categories
category[!category, .N, on = c("categoryid" = "parentid")]

To investigate in the category hierachy we transform the category datatable to a tree structure.

cat_tree <- category[, .(categoryid, parentid = ifelse(is.na(parentid), -1, parentid))] %>% 
  data.tree::FromDataFrameNetwork() %>% 
  data.tree::ToDataFrameTable("pathString", "name") %>% 
  as.data.table %>% 
  .[, categoryPath := stringr::str_replace(pathString, "^-1/", "")] %>% 
  .[, ultimate_category := as.integer(stringr::str_extract(categoryPath, "^\\d+"))] %>% 
  .[, depth := stringr::str_count(pathString, "/")] %>% 
  .[, categoryid := as.integer(name)] %>% 
  .[, .(categoryid, ultimate_category, depth, categoryPath)]

## Maximum depth of catergory tree
(max_depth <- cat_tree[, max(depth)])

## Size of category groups
print(cat_tree[, .N, by = ultimate_category][order(-N)])

split_fill <- function(col) transpose(lapply(strsplit(col, "/"), function(v) c(v, rep(v[length(v)], max_depth - length(v)))))
cat_tree[, paste0("Level_", 1:6) := split_fill(categoryPath)]

print(cat_tree)

Events

head(events)

events[, range(timestamp)]

In the event data set we find r nrow(events) %>% fn events from r events[, length(unique(visitorid))] %>% fn unique customers.

p1 <- plot_ly(events[, .N, by = event][order(-N)], x = ~factor(event, levels = event), y = ~N) %>% 
  add_bars() %>% 
  layout(
    xaxis = list(title = "Count of event"), 
    yaxis = list(title = "")
  )

p2 <- events[event == "transaction", .(tn = .N), by = .(visitorid)][, .N, by = tn][order(tn)] %>% 
  plot_ly(x = ~tn, y = ~N) %>% 
  add_bars() %>% 
  layout(
    xaxis = list(type = "log", title = "Count of customers per number of transactions"),
    yaxis = list(type = "log")
  )

subplot(p1, p2) %>% hide_legend()

As the plot shows a vast majority of events are views. In total we have only r events[event == "transaction", .N] %>% fn transactions.

Research Question

The ultimate goal of the usecase idea, was to propose/incentivice items to a visitor based on training data and machine learning. The original question is hard to answer without any additinal research on the impact of proposal/incentive.

So we need to reformulate our goal into a question we may be able to answer using the available dataset.

Ideas

Predict based on the visitor history whether the visitor will make a transaction or not
.. this could be further done for specific categories
Based on visitors history we could cluster the visitors and propose most viewed item per cluster.

Model

Data Preperation

dates <- events[, unique(as.Date(timestamp))]
items <- events[, unique(itemid)]
visit <- events[, unique(visitorid)]

dates_items <- CJ(t = dates, itemid = items, property = c("available", "categoryid"))

property_t <- properties[order(timestamp), .SD[1], by = .(itemid, property, t = as.Date(timestamp))][dates_items, on = c("t", "itemid", "property")][order(t)][, .(t, value, value_locf = zoo::na.locf(value, na.rm = FALSE)), by = .(itemid, property)]
property_t <- dcast(property_t, t + itemid ~ property, value.var = "value_locf", fun.aggregate = function(x) x[1])

events_t <- dcast(events[, .N, by = .(t = as.Date(timestamp), visitorid, itemid, event)], t + visitorid + itemid ~ event, value.var = "N", fun.aggregate = sum)
will_buy <- events_t[order(t, decreasing = TRUE)][, ft := cumsum(transaction) > 0, by = .(visitorid)][, .(Y = as.numeric(max(ft))), by = c("t", "visitorid")]

## categoryid is not sufficient
data <- cat_tree[property_t[events_t, on = c("itemid", "t")], .(t, itemid, available, categoryid, ultimate_category, visitorid, addtocart, transaction, view), on = "categoryid"]
data_response <- will_buy[data, on = c("t", "visitorid")]

data_model <- data_response[available > 0]

data_wide <- dcast(data_model, t + visitorid + Y ~ ultimate_category, value.var = c("addtocart", "transaction", "view"), fun.aggregate = sum)
dim(data_wide)

sum(is.na(data_wide))

data_sample <- data_wide[1:10000]

Train baseline model

library(caret)

trainControl <- trainControl(
  method = "cv",
  number = 5,

)

mod_logit <- train(
  data_sample[, -(1:3)], as.factor(data_sample[[3]]),
  #data = data_wide,
  method="glm", family= binomial(), 
  metric = "Accuracy",
  preProcess = c("zv")
  ##trainControl = trainControl,
  ##verbose = TRUE
  )

mod_tree <- train(
  Y ~ ., 
  data = data_wide,
  method = "ranger",
  metric = "Accuracy",
  preProcess = c("zv"),
  trainControl = trainControl,
  verbose = TRUE
  )