## Load package on setup devtools::load_all() ## Load used packages library(magrittr) library(knitr) library(plotly) ## Path to locally stored csv. files dataset_filepath <- "~/CloudStation/Projekte/r_projects/GoIT/retailrocket-recommender-system-dataset/" ## Convinience functions fn <- function(x) prettyNum(x, big.mark = ",", decimal.mark = ".") ## data.fram convinience functions preview_no <- 8 caption_no <- 0 caption_text <- function(str) { caption_no <<- caption_no + 1 paste0("Table ", caption_no, ". ", str) }
ds <- load_retailrocket_data(dataset_filepath) properties <- ds$properties category <- ds$category events <- ds$events
Dataset loaded from Kaggle/retailrocket challange. Downloaded locally and loaded via load_retailrocket_data(filepath)
command. The returned list object holds three data.frames
namely r names(ds)
. The following sections describe those datatables in greater detail.
rm(ds) events[, timestamp := as.POSIXct(timestamp/1000, origin = '1970-01-01')] properties[, timestamp := as.POSIXct(timestamp/1000, origin = '1970-01-01')]
head(properties, n = preview_no) %>% kable(caption = caption_text(paste("Showing", nrow(.), "rows out of", nrow(properties) %>% fn, "from the properties datatable")))
The properties
datatable assignes r properties[, length(unique(property))] %>% fn
different properties to the r properties[, length(unique(itemid))] %>% fn
unique items over time. All properties are hashed except of the following:
The rest of the categories are useless, so lets drop them.
properties <- properties[property %chin% c("categoryid", "available")] properties[, value := as.numeric(value)]
head(category, n = preview_no) %>% kable(caption = caption_text(paste("Showing", nrow(.), "rows out of", nrow(category) %>% fn, "from the category datatable")), align = c("c", "c"))
The category
datatable holds hierachial category information in a flat table format mapping item-categories to parent categories over multiple levels.
Following checks are performed:
## Check if each categiory has a unique parent relation category[, .N, by = .(categoryid)][, max(N)] == 1 ## Count the number of top-level categories category[is.na(parentid), .N] ## Count the number of lowest-level categories category[!category, .N, on = c("categoryid" = "parentid")]
To investigate in the category hierachy we transform the category
datatable to a tree structure.
cat_tree <- category[, .(categoryid, parentid = ifelse(is.na(parentid), -1, parentid))] %>% data.tree::FromDataFrameNetwork() %>% data.tree::ToDataFrameTable("pathString", "name") %>% as.data.table %>% .[, categoryPath := stringr::str_replace(pathString, "^-1/", "")] %>% .[, ultimate_category := as.integer(stringr::str_extract(categoryPath, "^\\d+"))] %>% .[, depth := stringr::str_count(pathString, "/")] %>% .[, categoryid := as.integer(name)] %>% .[, .(categoryid, ultimate_category, depth, categoryPath)] ## Maximum depth of catergory tree (max_depth <- cat_tree[, max(depth)]) ## Size of category groups print(cat_tree[, .N, by = ultimate_category][order(-N)]) split_fill <- function(col) transpose(lapply(strsplit(col, "/"), function(v) c(v, rep(v[length(v)], max_depth - length(v))))) cat_tree[, paste0("Level_", 1:6) := split_fill(categoryPath)] print(cat_tree)
head(events) events[, range(timestamp)]
In the event
data set we find r nrow(events) %>% fn
events from r events[, length(unique(visitorid))] %>% fn
unique customers.
p1 <- plot_ly(events[, .N, by = event][order(-N)], x = ~factor(event, levels = event), y = ~N) %>% add_bars() %>% layout( xaxis = list(title = "Count of event"), yaxis = list(title = "") ) p2 <- events[event == "transaction", .(tn = .N), by = .(visitorid)][, .N, by = tn][order(tn)] %>% plot_ly(x = ~tn, y = ~N) %>% add_bars() %>% layout( xaxis = list(type = "log", title = "Count of customers per number of transactions"), yaxis = list(type = "log") ) subplot(p1, p2) %>% hide_legend()
As the plot shows a vast majority of events are views. In total we have only r events[event == "transaction", .N] %>% fn
transactions.
The ultimate goal of the usecase idea, was to propose/incentivice items to a visitor based on training data and machine learning. The original question is hard to answer without any additinal research on the impact of proposal/incentive.
So we need to reformulate our goal into a question we may be able to answer using the available dataset.
dates <- events[, unique(as.Date(timestamp))] items <- events[, unique(itemid)] visit <- events[, unique(visitorid)] dates_items <- CJ(t = dates, itemid = items, property = c("available", "categoryid")) property_t <- properties[order(timestamp), .SD[1], by = .(itemid, property, t = as.Date(timestamp))][dates_items, on = c("t", "itemid", "property")][order(t)][, .(t, value, value_locf = zoo::na.locf(value, na.rm = FALSE)), by = .(itemid, property)] property_t <- dcast(property_t, t + itemid ~ property, value.var = "value_locf", fun.aggregate = function(x) x[1]) events_t <- dcast(events[, .N, by = .(t = as.Date(timestamp), visitorid, itemid, event)], t + visitorid + itemid ~ event, value.var = "N", fun.aggregate = sum) will_buy <- events_t[order(t, decreasing = TRUE)][, ft := cumsum(transaction) > 0, by = .(visitorid)][, .(Y = as.numeric(max(ft))), by = c("t", "visitorid")] ## categoryid is not sufficient data <- cat_tree[property_t[events_t, on = c("itemid", "t")], .(t, itemid, available, categoryid, ultimate_category, visitorid, addtocart, transaction, view), on = "categoryid"] data_response <- will_buy[data, on = c("t", "visitorid")] data_model <- data_response[available > 0] data_wide <- dcast(data_model, t + visitorid + Y ~ ultimate_category, value.var = c("addtocart", "transaction", "view"), fun.aggregate = sum) dim(data_wide) sum(is.na(data_wide)) data_sample <- data_wide[1:10000]
library(caret) trainControl <- trainControl( method = "cv", number = 5, ) mod_logit <- train( data_sample[, -(1:3)], as.factor(data_sample[[3]]), #data = data_wide, method="glm", family= binomial(), metric = "Accuracy", preProcess = c("zv") ##trainControl = trainControl, ##verbose = TRUE ) mod_tree <- train( Y ~ ., data = data_wide, method = "ranger", metric = "Accuracy", preProcess = c("zv"), trainControl = trainControl, verbose = TRUE )
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.