#-- 0.0 markdown options knitr::opts_chunk$set( echo = FALSE, eval = TRUE, message = FALSE, warning = FALSE, dpi = 300, tinytex.verbose = TRUE, tidy = FALSE, # cashe.extra = packageVersion('tufte'), fig.align = "center" )
#-- 1.0 libraries ---- # library(magrittr) # library(tidyverse) library(data.table) library(rlist) # library(tidyquant) # library(RMySQL) library(lubridate) # library(ISOweek) # library(MMWRweek) #-- 1.2 Libraries own ---- library(sw.dashboard) #-- 1.3 libraries graphical ---- library(ggpubr) library(knitr) library(ggExtra) library(viridis) library(tufte)
#-- 1.3. Settings ---- Sys.setenv(R_CONFIG_ACTIVE = "default") # use "default" for for production, "development" for prototyping config <- config::get(file = "../inst/golem-config.yml", use_parent = TRUE)
r newthought('The goal')
of this document is to prepare functions for the report Sunburst.
dates_interval <- "2020-12-01" %--% "2021-03-12" x <- sw.dashboard::pull_sunburst_data(dates_interval) dt <- x$sellwerk_3_of_5 dt %>% glimpse()
dt[, .N, .(dimension7)] time_cols <- c("dimension15") dt[, (time_cols) := lapply(.SD, ymd_hms), .SDcols = time_cols] dt[, .(range(dimension15)[2] - range(dimension15)[1]), .(dimension7)][, .(delta = as.numeric(V1))][delta< 5000, delta] %>% hist() dt[, .(range(dimension15)[2] - range(dimension15)[1]), .(dimension7, dimension16)][, .(delta = as.numeric(V1))][delta< 5000, delta] %>% hist() dt[, .N, .(dimension4)][, .(N, str_split(dimension4, pattern = " : ", simplify = T))][, .(sum(N)), .(V3)][order(-V1)] dt[, .N, .(dimension3)][, .(N, str_split(dimension3, pattern = " : ", simplify = T))][, .(sum(N)), .(V3)][order(-V1)] dt[, .N, .(dimension7, dimension20)][, .N, .(dimension20)] dt[dimension20 == "true]", .(range(dimension15)), .(dimension7)] #-- 1st interaction dt[, .(min(dimension15)), .(dimension7)] dt[, .(range(dimension15)[2] - range(dimension15)[1]), .(dimension7, dimension16)][, .(fivenum(V1))] dt[, .(range(dimension15)[2] - range(dimension15)[1]), .(dimension7, dimension16)][, .(sum(V1), .(dimension7))] dimension7_3sessions <- dt[, .(range(dimension15)[2] - range(dimension15)[1]), .(dimension7, dimension16)][, .N, .(dimension7)][order(-N)][N == 3][, dimension7] dt[, .(range(dimension15)[2] - range(dimension15)[1]), .(dimension7, dimension16)][dimension7 == "5e8cdf87-6c6e-4699-b0cf-cb3ab7dcc38a"][, .(sum(V1))] dt[, .(range(dimension15)[2] - range(dimension15)[1]), .(dimension7, dimension16)][dimension7 == "5e8cdf87-6c6e-4699-b0cf-cb3ab7dcc38a"][, .(sum(V1)), .(dimension7)] dt[, .(range(dimension15)[2] - range(dimension15)[1]), .(dimension7, dimension16)][dimension7 %in% dimension7_3sessions[1:5]][, .(sum(V1)), .(dimension7)] dt[, .(range(dimension15)[2] - range(dimension15)[1]), .(dimension7, dimension16)][][, .(sum(V1)), .(dimension7)]
# Sat Mar 13 14:04:17 2021 ------------------------------ #-- get time metrics per session, user #-- 1. select subset of columns time_dt <- dt[, .(user_id = dimension7, session_id = dimension16, timestamp = dimension15)] #-- 2. per session - start and duration #-- 2.1 number of events per session, and time range within a session setkey(time_dt, timestamp) sessions_dt <- time_dt[, .(events = .N, start_t = timestamp[1], session_duration = range(timestamp)[2] - range(timestamp)[1]), .(user_id, session_id)] #-- 2.2 define cohort by start time zero_t <- lubridate::ymd("2021-02-01") # this is what we'll get from shiny selector cohort_end_t <- lubridate::ceiling_date(zero_t, unit = "months") - days(1) cohort_start_t <- lubridate::floor_date(zero_t, unit = "month") cohort_t <- cohort_start_t %--% cohort_end_t cohort_t %>% typeof() #-- 2.3 trim dataset to cohort setkey(sessions_dt, start_t) sessions_dt %>% key() cohort_users_dt <- sessions_dt[, .(user_start_t = start_t[1]), .(user_id)][user_start_t %within% cohort_t] setkey(sessions_dt, user_id) setkey(cohort_users_dt, user_id) sessions_cohort_dt <- sessions_dt[cohort_users_dt] # sessions_cohort_dt[, .(range(start_t)[2]- range(start_t)[1]), .(user_id)] # sessions_cohort_dt[, .N, .(user_id)][N > 1] # sessions_cohort_dt[user_id == "02d84af9-23d5-4872-80a1-435344b759c1"] #-- 2.4 add estimated interaction time for sessions, especially critical for one-event sessions min_interaction_time <- 7 add_interaction_time <- 5 sessions_cohort_dt[, `:=` (session_duration_original = session_duration)] sessions_cohort_dt[session_duration == 0, `:=` (session_duration = session_duration + min_interaction_time)] sessions_cohort_dt[, `:=` (session_duration = session_duration + add_interaction_time)] #-- 2.4.1 check session_one_event_dt <- sessions_cohort_dt[events == 1, ] setkey(session_one_event_dt, session_id) setkey(time_dt, session_id) time_dt[session_one_event_dt][, .N, .(session_id)][N > 1] #-- 2.5 time from user's 1st interaction sessions_cohort_dt[, .(session_id, events, start_t, user_start_t, start_t - user_start_t), .(user_id)][V5 > 0] #sessions_cohort_dt[user_id == "02d84af9-23d5-4872-80a1-435344b759c1"] sessions_cohort_dt[, `:=` (from_user_start_t = start_t - user_start_t)] #-- 2.6 cumulative time in service, per user setkey(sessions_cohort_dt, start_t) sessions_cohort_dt[, `:=` (session_duration_cumsum = cumsum(as.double(session_duration))), .(user_id)] sessions_cohort_dt[user_id == "02d84af9-23d5-4872-80a1-435344b759c1"] sessions_cohort_dt[user_id == "96dbaba1-0636-4669-8001-9184da615e66"] user_id_sorted <- sessions_cohort_dt[, .N, .(user_id)][order(-N)] sessions_cohort_dt[user_id == "f02fa972-645e-4fa6-9085-d75c917dc5d3"] sessions_cohort_dt[user_id %in% user_id_sorted[1, user_id]] setkey(sessions_cohort_dt, from_user_start_t) sessions_cohort_dt[, `:=` (cohort_duration_cumsum = cumsum(session_duration_cumsum))] sessions_cohort_dt[, `:=` (from_user_start_days = as.double(from_user_start_t) / (60*60*24))] sessions_cohort_dt[, `:=` (cohort_duration_cumsum_minutes = as.double(cohort_duration_cumsum) / (60))] sessions_cohort_dt %>% ggplot(aes(from_user_start_days, cohort_duration_cumsum_minutes)) + geom_line()
#-- data pull dates_interval <- "2020-12-01" %--% "2021-03-12" x <- sw.dashboard::pull_sunburst_data(dates_interval) dt <- x$sellwerk_3_of_5 dt %>% glimpse() #-- column types time_cols <- c("dimension15") dt[, (time_cols) := lapply(.SD, ymd_hms), .SDcols = time_cols] factor_cols <- c("dimension2", "dimension7", "dimension16", "dimension20", "segment") dt[, (factor_cols) := lapply(.SD, as.factor), .SDcols = factor_cols] setnames(dt, c("dimension2", "dimension3", "dimension4", "dimension7", "dimension15", "dimension16", "dimension20"), c("user_logged", "company", "address", "user_id", "timestamp", "session_id", "migrated")) remove_cols <- c("ga_view", "segment", "query") dt[, (remove_cols) := NULL] data_file <- c("../data/dt.Rds") saveRDS(object = dt, file = "../data/dt.Rds")
data_file <- c("../data/dt.Rds") dt <- readRDS(data_file) library(DataExplorer) DataExplorer::introduce(dt) #-- get cumulative time per cohort output_sessions_dt <- sw.dashboard::get_cohort_time_cumsum(dt) output_sessions_dt %>% glimpse() output_sessions_dt %>% ggplot(aes(from_user_start_days, cohort_duration_cumsum_minutes)) + geom_line(color = config$plot.color) + theme_bw() + scale_color_brewer(palette = "Set1", direction = 1) + # scale_fill_brewer(palette = "Set1", direction = 1) + theme(legend.position = "bottom") + #facet_wrap("Country", scales = "free_y") + labs(title = glue::glue("Cumulative time in service"), subtitle = glue::glue("Measuring from 1st user interaction, for all users in cohort"), y = "minutes", x = "days") plot <- output_sessions_dt[, .(from_user_start_days, cohort_duration_cumsum_minutes)] %>% plot_cumulative_line()
# Sun Mar 14 07:40:58 2021 ------------------------------ # preparing data to plot sunburst chart dt %>% glimpse() #-- let's have it ordered by time setkey(dt, timestamp) #-- create index dt[, `:=` (n = .I)] #--- parse fields, to get company details company_data_dt <- dt[, {n; company_tmp = str_split(company, pattern = " : ", simplify = T); company_name = company_tmp[,1]; company_id = company_tmp[,2]; company_category = company_tmp[,3]; address_tmp = str_split(address, pattern = " : ", simplify = T); company_country = address_tmp[,1]; company_town = address_tmp[,2]; company_zip = address_tmp[,3]; list(n, company_name, company_id, company_category, company_country, company_town, company_zip)}] # multiple comany names per comapny_id company_data_dt[, .N, .(company_name, company_id, company_category, company_country, company_town, company_zip)][, .N, .(company_id)][N > 1] setkey(dt, n) setkey(company_data_dt, n) enriched_dt <- dt[company_data_dt] sessions_enriched_dt <- enriched_dt[, .N, .(user_id, session_id, company_id, company_country, company_town, company_zip, company_category, company_name)] #-- produce sequences factor_cols <- c("company_id", "company_country") sessions_enriched_dt[, (factor_cols) := lapply(.SD, as.factor), .SDcols = factor_cols] #-- sunburst - categories sunburst_category_cols <- c("company_category", "company_name") sunburst_category_orig_cols <- paste(sunburst_category_cols, "orig", sep = "_") sessions_enriched_dt[, (sunburst_category_orig_cols) := lapply(.SD, `[`), .SDcols = sunburst_category_cols] sessions_enriched_dt[, (sunburst_category_cols) := lapply(.SD, str_replace_all, pattern = "-", replacement = "_"), .SDcols = sunburst_category_cols] sessions_enriched_dt[, `:=` (category_seq = paste(company_category, company_name, sep = "-"))] sessions_enriched_dt[, .N, .(category_seq)] %>% sunburstR::sund2b() #-- sunburst - geo's sunburst_geo_cols <- c("company_town") sunburst_geo_orig_cols <- paste(sunburst_geo_cols, "orig", sep = "_") sessions_enriched_dt[, (sunburst_geo_orig_cols) := lapply(.SD, `[`), .SDcols = sunburst_geo_cols] sessions_enriched_dt[, (sunburst_geo_cols) := lapply(.SD, str_replace_all, pattern = "-", replacement = "_"), .SDcols = sunburst_geo_cols] sessions_enriched_dt[, `:=` (geo_seq = paste(company_town, company_zip, company_name, sep = "-"))] sessions_enriched_dt[, .N, .(geo_seq)] %>% sunburstR::sund2b() sessions_enriched_dt[str_detect(category_seq, pattern = "Kleemann_IT")] %>% glimpse() sessions_enriched_dt[str_detect(company_town, pattern = "^ ")] sessions_enriched_dt[, .(company_town, str_squish(company_town))][str_detect(company_town, pattern = "Köln")] %>% glimpse()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.