R/fct_cohort.R

Defines functions get_cohort_user_id

Documented in get_cohort_user_id

# Sat Mar 13 14:09:57 2021 ------------------------------
#' Produce cohort, based on input conditions
#' @description Based on set of conditions defining the cohort, return user_id's
#' @import data.table
#' @importFrom lubridate %within%
#' @param interval an interval, based on lubridate definition
#' @param aws_buffer an R6 data buffer
#' @export
get_cohort_user_id <- function(interval_selected, aws_buffer) {

    #-- 1. per session - start and duration
    #-- 1.1 the first timestamp within a session
    dt <- aws_buffer$time_dt
    setkey(dt, timestamp)
    sessions_dt <- dt[, .(start_t = timestamp[1]), .(user_id, session_id)]

    #-- 1.2 is 1st user's session within a cohort interval?
    setkey(sessions_dt, start_t)
    cohort_users_dt <- sessions_dt[, .(user_start_t = start_t[1]), .(user_id)][user_start_t %within% interval_selected]

    #-- 2. strore results into a buffer
    aws_buffer$cohort_users_dt <- cohort_users_dt

    #-- 3. return
    return(cohort_users_dt)
}

#' Get all the sessions related with users from the cohort
#' @description based on subset of user_id, from function [get_cohort_user_id()], return all the sessions
#' - returns table with all relevant session_id
#' - performs calculations, like cumulative time in service per user, and per cohort
#' @import data.table
#' @importFrom lubridate duration
#' @param aws_buffer an R6 data buffer
#' @export
get_cohort_sessions <- function(aws_buffer){

    #-- 1. get sessions of cohort's users
    #-- 1.1 start with events, as we will need them to calculate metrics per session
    time_dt <- aws_buffer$time_dt
    cohort_users_dt <- aws_buffer$cohort_users_dt
    setkey(time_dt, user_id)
    setkey(cohort_users_dt, user_id)
    events_cohort_dt <- time_dt[cohort_users_dt]

    #-- 2. get sessions metrics (start, duration) from events
    setkey(events_cohort_dt, timestamp)
    cohort_sessions_dt <- events_cohort_dt[, .(events = .N, start_t = timestamp[1], session_duration = timestamp[.N] - timestamp[1]), .(session_id, user_id)]

    #-- 2.1 assign back user's first interaction to sessions
    setkey(cohort_sessions_dt, user_id)
    cohort_sessions_dt <- cohort_sessions_dt[cohort_users_dt]

    #-- 3.1 add estimated interaction time for sessions, especially critical for one-event sessions
    min_interaction_time <- 7
    add_interaction_time <- 5
    cohort_sessions_dt[, `:=` (session_duration_original = session_duration)]
    cohort_sessions_dt[session_duration == 0, `:=` (session_duration = session_duration + min_interaction_time)]
    cohort_sessions_dt[, `:=` (session_duration = session_duration + add_interaction_time)]
    #cohort_sessions_dt[, `:=` (session_duration_minutes = as.double(session_duration) %>% round() %>% lubridate::duration())]
    cohort_sessions_dt[, `:=` (session_duration_minutes = (as.double(session_duration) / 60) %>% round() )]

    #-- 3.2 time from user's 1st interaction
    cohort_sessions_dt[, `:=` (from_user_start_t = start_t - user_start_t)]

    #-- 3.3 cumulative time in service, per user
    user_sessions_durations <- cohort_sessions_dt[, .(user_sessions_duration = sum(session_duration)), .(user_id)]
    setkey(user_sessions_durations, user_id)

    #-- 3.3.1 enrich user data in buffer
    aws_buffer$cohort_users_dt <- aws_buffer$cohort_users_dt[user_sessions_durations]

    #-- 3.4 cumulative time in service, per cohort
    setkey(cohort_sessions_dt, from_user_start_t)
    cohort_sessions_dt[, `:=` (cohort_duration_cumsum = cumsum(as.double(session_duration)))]
    cohort_sessions_dt[, `:=` (from_user_start_days = as.double(from_user_start_t) / (60*60*24))]
    cohort_sessions_dt[, `:=` (cohort_duration_cumsum_minutes = cohort_duration_cumsum / 60)]

    #-- 4. store results into a buffer
    aws_buffer$cohort_sessions_dt <- cohort_sessions_dt

    return(cohort_sessions_dt)
}

#' A complete sequence to get cumulative time in service per cohort
#' @description sequence
#' - define cohort
#' - get user_id's
#' - get sessions and calculate metrics (cumulative time for now)
#' @import data.table
#' @import R6
#' @importFrom lubridate ymd ceiling_date days floor_date %--%
#' @param interval dates interval (lubridate style)
#' @param aws_buffer R6 data buffer
#' @export
get_cohort_time_cumsum <- function(interval_selected, aws_buffer) {

    #-- 1. cohort definition
    cohort_interval <- interval_selected

    #-- 3. user_id's of the cohort
    cohort_users_dt <- get_cohort_user_id(interval_selected = cohort_interval, aws_buffer = aws_buffer)

    #-- 4. get cohort sessions
    cohort_sessions_dt <- get_cohort_sessions(aws_buffer = aws_buffer)

    #-- 6. return result
    return(cohort_sessions_dt)
}

#' Plot cumulative chart
#' @description Plot line, representing cumulative time in service
# @importFrom  ggplot2 ggplot
#' @importFrom glue glue
#' @param dt data.table to plot
#' @export
plot_cumulative_line <- function(dt) {

    plot <- dt %>%
        ggplot(aes(from_user_start_days, cohort_duration_cumsum_minutes)) +
        geom_line(color = config$plot.color, size = config$line_thickness) +
        theme_bw() +
        scale_color_brewer(palette = "Set1", direction = 1) +
        # scale_fill_brewer(palette = "Set1", direction = 1) +
        theme(legend.position = "bottom") +
        #facet_wrap("Country", scales = "free_y") +
        labs(title = glue::glue("Cumulative time in service"), subtitle = glue::glue("Measuring from 1st user interaction, for all users in cohort"), y = "minutes", x = "days")
    return(plot)
}

#' Enrich sessions with company data
#' @description Function takes custom dimensions, parse, and store as variables
#' - cd3 (company): cmpany name, category and company_id
#' - cd4 (address): town/city, zip code
#' @import data.table
#' @import R6
#' @importFrom stringr str_split
#' @param aws_buffer R6 data buffer
#' @export
enrich_sessions <- function(aws_buffer){

    #-- 1. start with raw data, all events
    dt <- aws_buffer$migrated_dt

    #-- 2.1 let's have it ordered by time
    setkey(dt, timestamp)
    #-- 2.2 create index
    dt[, `:=` (n = .I)]

    #-- 3. parse fields, to get company details
    company_data_dt <- dt[,  {n;
        company_tmp = str_split(company, pattern = " : ", simplify = T); company_name    = company_tmp[,1]; company_id   = company_tmp[,2]; company_category = company_tmp[,3];
        address_tmp = str_split(address, pattern = " : ", simplify = T); company_country = address_tmp[,1]; company_town = address_tmp[,2]; company_zip      = address_tmp[,3];
        list(n, company_name, company_id, company_category, company_country, company_town, company_zip)}]

    #-- 3.1 mark missing
    company_data_dt[company_name == "", `:=` (company_name = "Missing_company_name") ]
    company_data_dt[company_id == "", `:=` (company_id = "Missing_company_id")]
    company_data_dt[company_category == "", `:=` (company_category = "Missing_company_category")]

    company_data_dt[company_country == "", `:=` (company_country = "Missing_company_country")]
    company_data_dt[company_town == "", `:=` (company_town = "Missing_company_town")]
    company_data_dt[company_zip == "", `:=` (company_zip = "Missing_company_zip")]

    #-- 4. join
    setkey(dt, n)
    setkey(company_data_dt, n)
    enriched_dt <- dt[company_data_dt]

    #-- 5. get sessions from events (the company parameters are obviously the same during a session)
    sessions_enriched_dt <- enriched_dt[, .N, .(user_id, session_id, company_id, company_country, company_town, company_zip, company_category, company_name)]

    #-- 5.1 make factors
    factor_cols <- c("company_id", "company_country")
    sessions_enriched_dt[, (factor_cols) := lapply(.SD, as.factor), .SDcols = factor_cols]

    #-- 6. store in a buffer
    aws_buffer$sessions_enriched_dt <- sessions_enriched_dt

    #-- 7. return
    return(sessions_enriched_dt)
}

#' Produce sunburst sequences
#' @description produce dataset for charting sunburst, take only a subset of sessions from the cohort
#' - business category, company name
#' - town/city, zip code, company name
#' @import data.table
#' @importFrom stringr str_replace_all
#' @param aws_buffer R6 data buffer
#' @export
produce_sunburst_sequences <- function(aws_buffer) {

    #-- 1. get two datasets
    sessions_cohort_dt   <- aws_buffer$cohort_sessions_dt
    sessions_enriched_dt <- aws_buffer$sessions_enriched_dt

    #-- 2. limit sessions to cohort only
    setkey(sessions_cohort_dt, session_id)
    setkey(sessions_enriched_dt, session_id)
    dt <- sessions_enriched_dt[sessions_cohort_dt, nomatch = 0]

    #-- 3.1 sunburst sequence for categories
    sunburst_category_cols <- c("company_category", "company_name")
    sunburst_category_orig_cols <- paste(sunburst_category_cols, "orig", sep = "_")
    dt[, (sunburst_category_orig_cols) := lapply(.SD, `[`), .SDcols = sunburst_category_cols]
    dt[, (sunburst_category_cols) := lapply(.SD, str_replace_all, pattern = "-", replacement = "_"), .SDcols = sunburst_category_cols]
    dt[, `:=` (category_seq = paste(company_category, company_name, sep = "-"))]

    #-- 3.2 sunburst sequence for geo
    sunburst_geo_cols <- c("company_town")
    sunburst_geo_orig_cols <- paste(sunburst_geo_cols, "orig", sep = "_")
    dt[, (sunburst_geo_orig_cols) := lapply(.SD, `[`), .SDcols = sunburst_geo_cols]
    dt[, (sunburst_geo_cols) := lapply(.SD, str_replace_all, pattern = "-", replacement = "_"), .SDcols = sunburst_geo_cols]
    dt[,  `:=` (geo_seq = paste(company_town, company_zip, company_name, sep = "-"))]

    aws_buffer$sunburst_sessions_dt <- dt
    data.table::fwrite(x = dt, file = "produce_sunburst_sequences.csv")

    return(dt)
}
piotrgruszecki/sw.dashboard documentation built on March 22, 2021, 2:24 a.m.