# Sat Mar 13 14:09:57 2021 ------------------------------
#' Produce cohort, based on input conditions
#' @description Based on set of conditions defining the cohort, return user_id's
#' @import data.table
#' @importFrom lubridate %within%
#' @param interval an interval, based on lubridate definition
#' @param aws_buffer an R6 data buffer
#' @export
get_cohort_user_id <- function(interval_selected, aws_buffer) {
#-- 1. per session - start and duration
#-- 1.1 the first timestamp within a session
dt <- aws_buffer$time_dt
setkey(dt, timestamp)
sessions_dt <- dt[, .(start_t = timestamp[1]), .(user_id, session_id)]
#-- 1.2 is 1st user's session within a cohort interval?
setkey(sessions_dt, start_t)
cohort_users_dt <- sessions_dt[, .(user_start_t = start_t[1]), .(user_id)][user_start_t %within% interval_selected]
#-- 2. strore results into a buffer
aws_buffer$cohort_users_dt <- cohort_users_dt
#-- 3. return
return(cohort_users_dt)
}
#' Get all the sessions related with users from the cohort
#' @description based on subset of user_id, from function [get_cohort_user_id()], return all the sessions
#' - returns table with all relevant session_id
#' - performs calculations, like cumulative time in service per user, and per cohort
#' @import data.table
#' @importFrom lubridate duration
#' @param aws_buffer an R6 data buffer
#' @export
get_cohort_sessions <- function(aws_buffer){
#-- 1. get sessions of cohort's users
#-- 1.1 start with events, as we will need them to calculate metrics per session
time_dt <- aws_buffer$time_dt
cohort_users_dt <- aws_buffer$cohort_users_dt
setkey(time_dt, user_id)
setkey(cohort_users_dt, user_id)
events_cohort_dt <- time_dt[cohort_users_dt]
#-- 2. get sessions metrics (start, duration) from events
setkey(events_cohort_dt, timestamp)
cohort_sessions_dt <- events_cohort_dt[, .(events = .N, start_t = timestamp[1], session_duration = timestamp[.N] - timestamp[1]), .(session_id, user_id)]
#-- 2.1 assign back user's first interaction to sessions
setkey(cohort_sessions_dt, user_id)
cohort_sessions_dt <- cohort_sessions_dt[cohort_users_dt]
#-- 3.1 add estimated interaction time for sessions, especially critical for one-event sessions
min_interaction_time <- 7
add_interaction_time <- 5
cohort_sessions_dt[, `:=` (session_duration_original = session_duration)]
cohort_sessions_dt[session_duration == 0, `:=` (session_duration = session_duration + min_interaction_time)]
cohort_sessions_dt[, `:=` (session_duration = session_duration + add_interaction_time)]
#cohort_sessions_dt[, `:=` (session_duration_minutes = as.double(session_duration) %>% round() %>% lubridate::duration())]
cohort_sessions_dt[, `:=` (session_duration_minutes = (as.double(session_duration) / 60) %>% round() )]
#-- 3.2 time from user's 1st interaction
cohort_sessions_dt[, `:=` (from_user_start_t = start_t - user_start_t)]
#-- 3.3 cumulative time in service, per user
user_sessions_durations <- cohort_sessions_dt[, .(user_sessions_duration = sum(session_duration)), .(user_id)]
setkey(user_sessions_durations, user_id)
#-- 3.3.1 enrich user data in buffer
aws_buffer$cohort_users_dt <- aws_buffer$cohort_users_dt[user_sessions_durations]
#-- 3.4 cumulative time in service, per cohort
setkey(cohort_sessions_dt, from_user_start_t)
cohort_sessions_dt[, `:=` (cohort_duration_cumsum = cumsum(as.double(session_duration)))]
cohort_sessions_dt[, `:=` (from_user_start_days = as.double(from_user_start_t) / (60*60*24))]
cohort_sessions_dt[, `:=` (cohort_duration_cumsum_minutes = cohort_duration_cumsum / 60)]
#-- 4. store results into a buffer
aws_buffer$cohort_sessions_dt <- cohort_sessions_dt
return(cohort_sessions_dt)
}
#' A complete sequence to get cumulative time in service per cohort
#' @description sequence
#' - define cohort
#' - get user_id's
#' - get sessions and calculate metrics (cumulative time for now)
#' @import data.table
#' @import R6
#' @importFrom lubridate ymd ceiling_date days floor_date %--%
#' @param interval dates interval (lubridate style)
#' @param aws_buffer R6 data buffer
#' @export
get_cohort_time_cumsum <- function(interval_selected, aws_buffer) {
#-- 1. cohort definition
cohort_interval <- interval_selected
#-- 3. user_id's of the cohort
cohort_users_dt <- get_cohort_user_id(interval_selected = cohort_interval, aws_buffer = aws_buffer)
#-- 4. get cohort sessions
cohort_sessions_dt <- get_cohort_sessions(aws_buffer = aws_buffer)
#-- 6. return result
return(cohort_sessions_dt)
}
#' Plot cumulative chart
#' @description Plot line, representing cumulative time in service
# @importFrom ggplot2 ggplot
#' @importFrom glue glue
#' @param dt data.table to plot
#' @export
plot_cumulative_line <- function(dt) {
plot <- dt %>%
ggplot(aes(from_user_start_days, cohort_duration_cumsum_minutes)) +
geom_line(color = config$plot.color, size = config$line_thickness) +
theme_bw() +
scale_color_brewer(palette = "Set1", direction = 1) +
# scale_fill_brewer(palette = "Set1", direction = 1) +
theme(legend.position = "bottom") +
#facet_wrap("Country", scales = "free_y") +
labs(title = glue::glue("Cumulative time in service"), subtitle = glue::glue("Measuring from 1st user interaction, for all users in cohort"), y = "minutes", x = "days")
return(plot)
}
#' Enrich sessions with company data
#' @description Function takes custom dimensions, parse, and store as variables
#' - cd3 (company): cmpany name, category and company_id
#' - cd4 (address): town/city, zip code
#' @import data.table
#' @import R6
#' @importFrom stringr str_split
#' @param aws_buffer R6 data buffer
#' @export
enrich_sessions <- function(aws_buffer){
#-- 1. start with raw data, all events
dt <- aws_buffer$migrated_dt
#-- 2.1 let's have it ordered by time
setkey(dt, timestamp)
#-- 2.2 create index
dt[, `:=` (n = .I)]
#-- 3. parse fields, to get company details
company_data_dt <- dt[, {n;
company_tmp = str_split(company, pattern = " : ", simplify = T); company_name = company_tmp[,1]; company_id = company_tmp[,2]; company_category = company_tmp[,3];
address_tmp = str_split(address, pattern = " : ", simplify = T); company_country = address_tmp[,1]; company_town = address_tmp[,2]; company_zip = address_tmp[,3];
list(n, company_name, company_id, company_category, company_country, company_town, company_zip)}]
#-- 3.1 mark missing
company_data_dt[company_name == "", `:=` (company_name = "Missing_company_name") ]
company_data_dt[company_id == "", `:=` (company_id = "Missing_company_id")]
company_data_dt[company_category == "", `:=` (company_category = "Missing_company_category")]
company_data_dt[company_country == "", `:=` (company_country = "Missing_company_country")]
company_data_dt[company_town == "", `:=` (company_town = "Missing_company_town")]
company_data_dt[company_zip == "", `:=` (company_zip = "Missing_company_zip")]
#-- 4. join
setkey(dt, n)
setkey(company_data_dt, n)
enriched_dt <- dt[company_data_dt]
#-- 5. get sessions from events (the company parameters are obviously the same during a session)
sessions_enriched_dt <- enriched_dt[, .N, .(user_id, session_id, company_id, company_country, company_town, company_zip, company_category, company_name)]
#-- 5.1 make factors
factor_cols <- c("company_id", "company_country")
sessions_enriched_dt[, (factor_cols) := lapply(.SD, as.factor), .SDcols = factor_cols]
#-- 6. store in a buffer
aws_buffer$sessions_enriched_dt <- sessions_enriched_dt
#-- 7. return
return(sessions_enriched_dt)
}
#' Produce sunburst sequences
#' @description produce dataset for charting sunburst, take only a subset of sessions from the cohort
#' - business category, company name
#' - town/city, zip code, company name
#' @import data.table
#' @importFrom stringr str_replace_all
#' @param aws_buffer R6 data buffer
#' @export
produce_sunburst_sequences <- function(aws_buffer) {
#-- 1. get two datasets
sessions_cohort_dt <- aws_buffer$cohort_sessions_dt
sessions_enriched_dt <- aws_buffer$sessions_enriched_dt
#-- 2. limit sessions to cohort only
setkey(sessions_cohort_dt, session_id)
setkey(sessions_enriched_dt, session_id)
dt <- sessions_enriched_dt[sessions_cohort_dt, nomatch = 0]
#-- 3.1 sunburst sequence for categories
sunburst_category_cols <- c("company_category", "company_name")
sunburst_category_orig_cols <- paste(sunburst_category_cols, "orig", sep = "_")
dt[, (sunburst_category_orig_cols) := lapply(.SD, `[`), .SDcols = sunburst_category_cols]
dt[, (sunburst_category_cols) := lapply(.SD, str_replace_all, pattern = "-", replacement = "_"), .SDcols = sunburst_category_cols]
dt[, `:=` (category_seq = paste(company_category, company_name, sep = "-"))]
#-- 3.2 sunburst sequence for geo
sunburst_geo_cols <- c("company_town")
sunburst_geo_orig_cols <- paste(sunburst_geo_cols, "orig", sep = "_")
dt[, (sunburst_geo_orig_cols) := lapply(.SD, `[`), .SDcols = sunburst_geo_cols]
dt[, (sunburst_geo_cols) := lapply(.SD, str_replace_all, pattern = "-", replacement = "_"), .SDcols = sunburst_geo_cols]
dt[, `:=` (geo_seq = paste(company_town, company_zip, company_name, sep = "-"))]
aws_buffer$sunburst_sessions_dt <- dt
data.table::fwrite(x = dt, file = "produce_sunburst_sequences.csv")
return(dt)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.