#-- 0.0 markdown options
knitr::opts_chunk$set(
    echo = FALSE,
    eval = TRUE,
    message = FALSE,
    warning = FALSE,
    dpi = 300,
    tinytex.verbose = TRUE,
    tidy = FALSE, 
#    cashe.extra = packageVersion('tufte'),
    fig.align = "center"
    )
#-- 1.0 libraries ----
# library(magrittr)
# library(tidyverse)
library(data.table)
# library(tidyquant)
# library(RMySQL)
library(lubridate)
# library(ISOweek)
# library(MMWRweek)

#-- 1.2 Libraries own ----
library(fd.dashboard)

#-- 1.3 libraries graphical ----
library(ggpubr)
library(knitr)
library(ggExtra)
library(viridis)
library(tufte)
#-- 1.3. Settings ----
Sys.setenv(R_CONFIG_ACTIVE = "default") # use "default" for for production, "development" for prototyping
config <- config::get(file = "../inst/golem-config.yml", use_parent = TRUE)

Purpose

r newthought('The goal') of this document is to prepare functions for the report 14 Investment Level.

prof_raw_dt <- fd.dashboard::get_clean_profiles()
prof_raw_dt %>% str()

lead_raw_dt <- fd.dashboard::get_clean_leads()
lead_raw_dt %>% str()
prof_dt[, .N, .(min_investment, min_investment_currency)]

prof_dt[status == "Active", .(min_investment, min_investment_currency)][, .N, .(min_investment)][order(-N)]

prof_usd_dt <- prof_dt[status == "Active" & min_investment_currency == "USD", .(min_investment)]


min_investment_max = 1e6
prof_usd_dt <- prof_usd_dt[min_investment <= min_investment_max, ]
prof_usd_dt[, `:=` (min_investment = min_investment / 1e3)]
prof_usd_dt[order(min_investment)]

numbers_of_bins = 9
prof_usd_dt[, `:=` (bins_q = cut(min_investment, breaks = unique(quantile(min_investment, probs = seq.int(0, 1, by = 1/numbers_of_bins))), include.lowest = TRUE))]

prof_usd_dt[, .N, .(bins_q)][order(bins_q)]

prof_usd_dt[, .N, .(bins_q)][order(bins_q)] %>%
  ggplot(aes(bins_q, N)) +
  geom_col()

#-- regular bins
numbers_of_bins = 9
bin_from = 70
bin_to   = 160
bin_by   = 10
prof_usd_dt[, `:=` (bins = cut(min_investment, breaks = c(-Inf, seq.int(from = bin_from, to = bin_to, by = bin_by), Inf), include.lowest = TRUE) )]

prof_usd_dt[, .N, .(bins)][order(bins)] %>%
  ggplot(aes(bins, N)) +
  geom_col() +
  theme_bw()

#-- all currencies
min_investment_max_dt <- data.table(min_investment_max      = c(10,    10,    100,   100,   100,   100,   10), # in millions
                                    min_investment_currency = c("USD", "EUR", "ZAR", "MXN", "MYR", "SGD", "GBP"))
min_investment_max_dt[, `:=` (min_investment_max = min_investment_max * 1e6)]

setkey(prof_dt, min_investment_currency)
setkey(min_investment_max_dt, min_investment_currency)

prof_dt <- prof_dt[min_investment_max_dt]

prof_dt[, `:=` (outlier = fcase(min_investment >  min_investment_max, TRUE, 
                                min_investment <= min_investment_max, FALSE))]

prof_dt[, `:=` (min_investment = min_investment / 1e3)]
prof_dt[, .N, .(outlier, min_investment_currency, status)][, dcast.data.table(.SD, min_investment_currency ~ status + outlier)]
prof_dt[outlier == T & min_investment_currency == "MXN" & status == "Active", ]

prof_dt[, .N, .(status)]
prof_dt[(!outlier), ]

#-- chart
prof_normal_dt <- prof_dt[!(outlier), ]

numbers_of_bins = 9
bin_from = 10
bin_to   = 100
bin_by   = 20

prof_normal_dt[, bins := NULL]
prof_normal_dt[, `:=` (bins = cut(min_investment, breaks = c(-Inf, seq.int(from = bin_from, to = bin_to, by = bin_by), Inf), include.lowest = TRUE)), by = .(min_investment_currency)]
prof_normal_dt[, .N, .(bins, min_investment_currency)][min_investment_currency %in% c("USD"), ] %>% str()

prof_normal_dt[min_investment_currency %in% c("EUR", "USD", "ZAR", "MYR", "GBP"), .N, .(bins, min_investment_currency)][order(bins)] %>%
  ggplot(aes(bins, N)) +
  geom_col(color = config$plot.color, fill = config$plot.color, alpha = 0.4) +
  theme_bw() +
  facet_wrap("min_investment_currency")

prof_dt[(outlier) & status == "Active", .(profile, sales_rep, website_iso2c, min_investment, min_investment_currency, outlier)]
prof_dt[, .N, .(is.na(min_investment), is.na(min_investment_currency))]

prof_normal_dt[min_investment < 40e3 & min_investment_currency == "ZAR", min_investment] %>%
  hist()
#-- red profiles
prof_dt <- fd.dashboard::get_clean_profiles()

#-- identify outliers
min_investment_max_dt <- data.table(min_investment_max      = c(10,    10,    10,    100,   100,   100,   100), # in millions
                                    min_investment_currency = c("USD", "EUR", "GBP", "ZAR", "MXN", "MYR", "SGD"))
#min_investment_max_dt[, `:=` (min_investment_max = min_investment_max * 1e6)]

#-- threshold to define outliers
tab_01 <- min_investment_max_dt[order(min_investment_max, min_investment_currency)]

setkey(prof_dt, min_investment_currency)
setkey(min_investment_max_dt, min_investment_currency)

prof_min_set_dt <- prof_dt[min_investment_max_dt]

prof_min_set_dt[, `:=` (outlier = fcase(min_investment >  min_investment_max * 1e6, TRUE, 
                                        min_investment <= min_investment_max * 1e6, FALSE))]
prof_min_set_dt[, `:=` (min_investment = min_investment / 1e3)]

prof_min_set_dt[status == "Active", .N, .(profile_id, min_investment, min_investment_currency)]
prof_min_set_dt[, .N, .(profile_id, min_investment, min_investment_currency, status)][, .N, .(status)]

setkey(prof_min_set_dt, tech_date_start)
prof_min_set_dt[, .SD[.N], .(profile_id)][, .N, .(profile_id, min_investment, min_investment_currency, status)][status == "Active"]

#-- number of outliers, per currency and status
table_02 <- prof_min_set_dt[, .N, .(outlier, min_investment_currency, status, min_investment_max)][, dcast.data.table(.SD, min_investment_currency + min_investment_max ~ status + outlier)][order(-Active_FALSE)]

#-- undefined min_investment OR min_investment_currency
table_03 <- prof_dt[status == "Active", .N, .(m_inv = is.na(min_investment), m_inv_c = is.na(min_investment_currency), sales_rep)][(m_inv) | (m_inv_c), .(sales_rep, N)][order(-N)]

table_04 <- prof_dt[status == "Active", .N, .(m_inv = is.na(min_investment), m_inv_c = is.na(min_investment_currency), sales_rep)][, `:=` (missing_min_inv = fcase(
  (m_inv == T | m_inv_c == T), TRUE, 
  (m_inv == F & m_inv_c == F), FALSE))][, .(sales_rep, missing_min_inv, N)][, dcast.data.table(.SD, sales_rep ~ missing_min_inv, fill = 0)]
setnames(table_04, c("FALSE", "TRUE"), c("OK", "missing"))

table_04_sum <- table_04[, lapply(.SD, sum), .SDcols = is.numeric][, `:=` (sales_rep = "Total")]

data.table::rbindlist(list(table_04, table_04_sum), use.names = T)


#-- charting
prof_normal_dt <- prof_min_set_dt[!(outlier), ]

bin_from = 10
bin_to   = 110
bin_by   = 25
currencies <- min_investment_max_dt[, min_investment_currency]

prof_normal_dt[, bins := NULL]
prof_normal_dt[, `:=` (bins = cut(min_investment, breaks = c(-Inf, seq.int(from = bin_from, to = bin_to, by = bin_by), Inf), include.lowest = TRUE)), by = .(min_investment_currency)]
prof_normal_dt[, .N, .(bins, min_investment_currency)][min_investment_currency %in% currencies, ] %>% str()

prof_normal_2_dt <- prof_normal_dt[, `:=` (n = 1)][min_investment_currency %in% currencies, ][, `:=` (min_investment_currency = as.factor(min_investment_currency) %>% fct_reorder(.f = min_investment_currency, .x = n, .fun = sum) %>% fct_rev())][]
#prof_normal_2_dt %>% str()
prof_normal_2_dt$min_investment_currency %>% levels()
currencies_sort <- prof_normal_2_dt[, .N, .(min_investment_currency)][order(-N)][, min_investment_currency]

prof_normal_2_dt[min_investment_currency %in% currencies_sort[1:3], .N, .(bins, min_investment_currency)][order(bins)] %>%
  ggplot(aes(bins, N)) +
  geom_col(color = config$plot.color, fill = config$plot.color, alpha = 0.4) +
  theme_bw() +
  facet_wrap("min_investment_currency", scales = "free_y") +
  labs(title = "Number of Profiles, binned, per currency.", subtitle = "Only Active Profiles.", x = "", y = "")
#-- reduce changes to only those, having changed status or min_investment, "flatten" all other changes, when status is not changed
dt <- prof_normal_2_dt
setkey(dt, tech_date_start, profile_id)
dt[, `:=` (status_prev = shift(status, type = "lag", fill = NA), min_investment_prev = shift(min_investment, type = "lag", fill = NA)), .(profile_id)]
dt[profile_id %in% profile_id_mult_inv[1], .(status, status_prev, min_investment, min_investment_prev, tech_date_start)]
prof_dt[profile_id %in% profile_id_mult_inv[1], ]

x3_dt <- dt[, .(profile_id, status, status_prev, tech_date_start, min_investment, min_investment_prev)][status != status_prev | is.na(status_prev) | min_investment != min_investment_prev,]
x3_dt[profile_id %in% profile_id_mult_inv[2], ]
x3_dt[status == "Active"][profile_id %in% profile_id_mult_inv[2], ] 

profile_id_mult_2_inv <- x3_dt[, .N, .(profile_id, min_investment)][, .N, .(profile_id)][N > 1][, profile_id]
x3_dt[profile_id %in% profile_id_mult_2_inv[1], ] 

profile_id_mult_2_inv[1:2]
#-- 
# uwzględniać leady jednokrotnie - spłaszczyć liczbę profili - tylko zmiana statusu, lub min_investment
# 
profile_id_mult_inv <- prof_normal_2_dt[, .N, .(profile_id, min_investment)][, .N, .(profile_id)][N > 1][order(-N)][, profile_id]
prof_normal_2_dt[profile_id %in% profile_id_mult_inv[15], ]

profile_id_mult <- prof_normal_2_dt[status == "Active", .N, .(profile_id)][order(-N)][1, profile_id]
prof_normal_2_dt[profile_id %in% profile_id_mult, ]

prof_normal_2_dt[, .(profile, profile_id, status, tech_date_start, tech_date_end, tech_prev_id, tech_next_id, min_investment, min_investment_currency)]

#leads_dt <- fread("../../franchisedirect/dashboards/report_14/bs_leads_clean_view.csv")
#leads_dt <- fread("../../franchisedirect/dashboards/report_14/query_result.csv")
setkey(leads_dt, profile_id, date)
leads_dt[, .N, .(profile_id)]

prof_dt <- x3_dt[profile_id %in% profile_id_mult_2_inv[2], ] 
setkey(prof_dt, profile_id, tech_date_start)


x4_dt <- prof_dt[leads_dt, roll = T][, .(profile_id, status, tech_date_start, tech_date_end, min_investment)][tech_date_start >= as.Date("2020-12-16")]
x4_dt[, .N, .(profile_id)]

x4_dt[, .N, .(date = tech_date_start, min_investment)]
x4_dt[, .N, .( min_investment)]

rm(x4_dt)
#-- reduce changes to only those, having changed status or min_investment, "flatten" all other changes, when status is not changed
dt <- prof_raw_dt
setkey(dt, tech_date_start, profile_id)
dt[, `:=` (status_prev = shift(status, type = "lag", fill = NA), min_investment_prev = shift(min_investment, type = "lag", fill = NA)), .(profile_id)]
dt[profile_id %in% profile_id_mult_inv[1], .(status, status_prev, min_investment, min_investment_prev, tech_date_start)]
dt[profile_id %in% profile_x_id, .(profile_id, status, status_prev, tech_date_start, min_investment, min_investment_prev)]

x3_dt <- dt[, .(profile_id, status, status_prev, tech_date_start, min_investment, min_investment_prev)][status != status_prev | is.na(status_prev) | min_investment != min_investment_prev | (!is.na(min_investment) & is.na(min_investment_prev)),]
x3_dt[profile_id %in% profile_x_id, ]

profile_id_mult_inv <- prof_dt[!is.na(min_investment), .N, .(profile_id, min_investment)][, .N, .(profile_id)][N > 1][order(-N)][, profile_id]
x3_dt[profile_id %in% profile_id_mult_inv & status == "Active", ]
x3_dt[profile_id %in% profile_id_mult_inv[3], ]





profile_id_mult_2_inv <- x3_dt[, .N, .(profile_id, min_investment)][, .N, .(profile_id)][N > 1][, profile_id]
x3_dt[profile_id %in% profile_id_mult_2_inv[1], ] 

profile_id_mult_2_inv[1:2]
dt <- prof_raw_dt

        #-- reduce changes to only those, having changed status, "flatten" all other changes, when status is not changed
setkey(dt, tech_date_start, profile_id)
dt[, `:=` (status_prev = shift(status, type = "lag", fill = NA), min_investment_prev = shift(min_investment, type = "lag", fill = NA)), .(profile_id)]

x3_dt <- dt[, .(profile_id, status, status_prev, tech_date_start, min_investment, min_investment_prev, min_investment_currency)][status != status_prev | is.na(status_prev) | min_investment != min_investment_prev | (!is.na(min_investment) & is.na(min_investment_prev)),]
#x3_dt[profile_id %in% profile_id_mult_inv[1], ]


        #-- analysis window
        boundary_dates <- data.table(date = c(lubridate::int_start(analysis_period),
                                              lubridate::int_end(analysis_period)))
        boundary_dates[, `:=` (date = as.Date(date))]
        setkey(boundary_dates, date)

        #-- limit profile_id to only those, which had Active status, sometime within the analysis window
        setkey(x3_dt, tech_date_start)
        b1 <- boundary_dates[x3_dt, roll = -Inf]
        b2 <- x3_dt[, .SD[boundary_dates, roll = +Inf], .(profile_id)]
        b3 <- rbindlist(list(b1[, .(profile_id, status, min_investment, date)],
                             b2[, .(profile_id, status, min_investment, date = tech_date_start)]))

        setkey(b3, date)

        b4 <- b3[, .SD[date %within% analysis_period & status == "Active", ], .(profile_id) ]
        b5 <- b4[, .N, .(profile_id)]
        setkey(b5, profile_id)

        #-- select only these profile_id, which have at least one day active within the period (the pupose of b5)
        setkey(x3_dt, profile_id)
        b6 <- x3_dt[b5][, .(profile_id, status, min_investment, min_investment_currency, date = tech_date_start)]
b6[profile_id %in% profile_id_mult_inv[4], ]


        #-- create dates sequence
        seq_dates <- seq.Date(from = as.Date(int_start(analysis_period)), to = as.Date(int_end(analysis_period)), by = "day")
        dates_dt <- data.table(date = seq_dates)
        setkey(dates_dt, date)

        setkey(b6, date)
        b7 <- b6[, .SD[dates_dt, roll = +Inf], .(profile_id)][, .(profile_id, status, min_investment, min_investment_currency, date)][!is.na(status)]
b7[profile_id %in% profile_id_mult_inv[4], ]

        return(b7)

#-- calculate bins for min_inv daily
b7_profile_id <- b7[, .N, .(profile_id, date)][N > 1, profile_id]    
b7[profile_id %in% b7_profile_id[1], ]
b7[, .N, .(status)]

b7[status == "Active" & date > "2020-12-16", .N, .(date, min_investment_currency)] %>%
  ggplot(aes(date, N, color = min_investment_currency)) +
  geom_line()


bin_from <- 10
bin_to <- 100
bin_by <- 20
b7[, `:=` (min_investment = min_investment / 1e3)]
b7[, `:=` (bins = cut(min_investment, breaks = c(-Inf, seq.int(from = bin_from, to = bin_to, by = bin_by), Inf), include.lowest = TRUE)), by = .(min_investment_currency, date)]
b7[, `:=` (bins = fct_rev(bins))]

currencies <- b7[, unique(min_investment_currency)]
b7 <- b7[status == "Active" & date > "2020-12-16" & min_investment_currency %in% currencies, ]
b7[, `:=` (n = 1)]
b7 <- b7[, `:=` (min_investment_currency = as.factor(min_investment_currency) %>% fct_reorder(.f = min_investment_currency, .x = n, .fun = sum) %>% fct_rev())]
b7 <- b7[, `:=` (min_investment_currency = fct_reorder(.f = min_investment_currency, .x = n, .fun = sum) %>% fct_rev())]
b7[, .(sum(n)), .(min_investment_currency)][order(-V1)]

b7 %>% str()

b7[status == "Active" & date > "2020-12-16" & min_investment_currency == "USD", .N, .(date, bins)] %>%
  ggplot(aes(date, N, color = bins, fill = bins)) +
  geom_area(alpha = 0.5) +
  scale_fill_brewer(palette = "Set1") +
  scale_color_brewer(palette = "Set1")


plot_01 <- b7[!is.na(min_investment_currency), .N, .(date, bins, min_investment_currency)] %>%
  ggplot(aes(date, N, color = bins, fill = bins)) +
  geom_area(alpha = 0.5) +
  scale_color_viridis_d(option = "") +
  scale_fill_viridis_d(option = "D") +
  # scale_fill_brewer(palette = "Set1") +
  # scale_color_brewer(palette = "Set1") +
  facet_wrap("min_investment_currency", scales = "free_y") +
  theme_bw() +
  labs(title = "Number of Profiles, binned, per currency.", subtitle = stringr::str_glue("Only Active Profiles."), x = "", y = "") +
  theme(legend.position = "right", legend.direction = "vertical")
get_profiles_investment_daily <- function(dt, analysis_period){

  #-- reduce changes to only those, having changed status, "flatten" all other changes, when status is not changed
  setkey(dt, tech_date_start, profile_id)
  dt[, `:=` (status_prev = shift(status, type = "lag", fill = NA), 
             min_investment_k_prev = shift(min_investment_k, type = "lag", fill = NA)), .(profile_id)]

  x3_dt <- dt[, .(profile_id, status, status_prev, tech_date_start, min_investment_k, min_investment_k_prev, min_investment_currency)
              ][status != status_prev | is.na(status_prev) | min_investment_k != min_investment_k_prev | (!is.na(min_investment_k) & is.na(min_investment_k_prev)),]

  #-- analysis window
  boundary_dates <- data.table(date = c(lubridate::int_start(analysis_period),
                                        lubridate::int_end(analysis_period)))
  boundary_dates[, `:=` (date = as.Date(date))]
  setkey(boundary_dates, date)

  #-- limit profile_id to only those, which had Active status, sometime within the analysis window
  setkey(x3_dt, tech_date_start)
  b1 <- boundary_dates[x3_dt, roll = -Inf]
  b2 <- x3_dt[, .SD[boundary_dates, roll = +Inf], .(profile_id)]
  b3 <- rbindlist(list(b1[, .(profile_id, status, min_investment_k, date)],
                       b2[, .(profile_id, status, min_investment_k, date = tech_date_start)]))

  setkey(b3, date)
  b4 <- b3[, .SD[date %within% analysis_period & status == "Active", ], .(profile_id) ]
  b5 <- b4[, .N, .(profile_id)]
  setkey(b5, profile_id)

  #-- select only these profile_id, which have at least one day active within the period (the pupose of b5)
  setkey(x3_dt, profile_id)
  b6 <- x3_dt[b5][, .(profile_id, status, min_investment_k, min_investment_currency, date = tech_date_start)]

  #-- create dates sequence
  seq_dates <- seq.Date(from = as.Date(int_start(analysis_period)), to = as.Date(int_end(analysis_period)), by = "day")
  dates_dt <- data.table(date = seq_dates)
  setkey(dates_dt, date)

  setkey(b6, date)
  b7 <- b6[, .SD[dates_dt, roll = +Inf], .(profile_id)][, .(profile_id, status, min_investment_k, min_investment_currency, date)][!is.na(status)]

  return(b7)
}


get_investment_bins_daily <- function(dt){

  bin_from <- 10
  bin_to <- 100
  bin_by <- 20
  currencies <- prof_inv_dt[, unique(min_investment_currency)]

  dt[, `:=` (bins = cut(min_investment_k, breaks = c(-Inf, seq.int(from = bin_from, to = bin_to, by = bin_by), Inf), include.lowest = TRUE)), by = .(min_investment_currency, date)]
  dt[, `:=` (bins = fct_rev(bins))]

  dt[, `:=` (min_investment_currency = fct_reorder(.f = min_investment_currency, .x = n, .fun = sum) %>% fct_rev())]

  return(dt)
}


#-- test functions
prof_raw_dt <- fd.dashboard::get_clean_profiles()
analysis_period <- "2020-12-16" %--% today()

prof_dt <- prof_raw_dt
prof_dt[, `:=` (min_investment_k = min_investment / 1e3)]

prof_inv_dt <- get_profiles_investment_daily(dt = prof_dt, analysis_period = analysis_period)
prof_inv_dt[, `:=` (n = 1)]

prof_inv_dt <- get_investment_bins_daily(prof_inv_dt)

plot_01 <- prof_inv_dt[!is.na(min_investment_currency) & status == "Active", .N, .(date, bins, min_investment_currency)] %>%
  ggplot(aes(date, N, color = bins, fill = bins)) +
  geom_area(alpha = 0.5) +
  scale_color_viridis_d(option = "D") +
  scale_fill_viridis_d(option = "D") +
  facet_wrap("min_investment_currency", scales = "free_y") +
  theme_bw() +
  labs(title = "Number of Profiles, binned, per currency.", subtitle = stringr::str_glue("Only Active Profiles."), x = "", y = "") +
  theme(legend.position = "bottom") +
  guides(colour = guide_legend(nrow = 1))

setkey(prof_inv_dt, date)
prof_inv_dt[, .N, .(min_investment_currency) ]

prof_inv_dt[date == max(date) & !is.na(min_investment_currency), lapply(.SD, fivenum), .(min_investment_currency), .SDcols = "min_investment_k"]

prof_inv_dt[date == max(date) & !is.na(min_investment_currency) & min_investment_k <= 100 & status == "Active", .(curr = min_investment_currency, min_investment_k)] %>%
  ggpubr::ggdensity(x = "min_investment_k", y = "..density..", color = "curr", fill = "curr", alpha = 0.3, add = "median") +
  theme_classic2() 
#-- trim leads
analysis_period <- "2020-12-01" %--% today()

lead_dt <- lead_raw_dt[Date %within% analysis_period]


prof_raw_dt[, .N, .(client_id, profile_id)][, .N, .(client_id)][N > 1]
prof_raw_dt[, .N, .(client_id, profile_id)][, .N, .(profile_id)][N > 1]

prof_raw_dt[profile_id == "0000015c-d713-d671-affd-d7bfba8c0000"]
lead_raw_dt[profile_id == "0000015c-d713-d671-affd-d7bfba8c0000"]

profile_x_id <- "0000015c-d713-d671-affd-d7bfe6fb0000"

prof_raw_dt[profile_id %in% profile_x_id]
lead_dt[profile_id %in% profile_x_id]

x1 <- get_active_profiles_daily(dt = prof_dt, analysis_period = analysis_period)


piotrgruszecki/fd.dashboard documentation built on March 21, 2021, 6:16 p.m.