#' Fast data.table left join by reference
#' @param match_vars Character vector containing columns names to join. If NULL,
#' it will use the keys of dt1, if dt1 is a keyed data.table
#' @return A data.table left join of dt1 and dt2
left_merge <- function(dt1, dt2, match_vars = NULL){
if(is.null(match_vars)){
if(!is.data.table(dt1) | is.null(key(dt1))){
stop('Found no key columns on dt1. Please specify the column names to join.')
}
if(!is.null(key(dt1))){
match_vars <- key(dt1)
}else{
match_vars <- colnames(dt1)[colnames(dt1) %in% colnames(dt2)]
}
}
setDT(dt1, key = match_vars)
setDT(dt2, key = match_vars)
cat('Joining tables by =', paste(match_vars, collapse = ', '), '; by reference over the left data.table. \n' )
cols_to_add <- colnames(dt2) %nin% match_vars
dt1[dt2, on = match_vars, names(dt2)[(cols_to_add)] := mget(paste0("i.", names(dt2)[(cols_to_add)]))]
return(dt1 %>% setkeyv(match_vars))
}
#' Z-Score scaling
fscale_zscore <- function(x){
if(is.numeric(x)){
1/(1 + exp(-x))
}else if(is.data.frame(x)){
categoricals <- discard(x, is.numeric)
numerics <- keep(as.data.table(x), is.numeric)[, lapply(.SD,
function(x) (x-mean(x, na.rm = TRUE))/sd(x, na.rm = TRUE))]
if(ncol(numerics) == 0){
warning('No numerical columns found.')
}
cbind(categoricals, numerics) %>% as.data.table()
}else{
stop('Must provide either a numeric verctor or data.frame')
}
}
#' 0-1 linear scaling
fscale_linear01 <- function(x){
if(is.numeric(x)){
1/(1 + exp(-x))
}else if(is.data.frame(x)){
categoricals <- discard(x, is.numeric)
numerics <- keep(as.data.table(x), is.numeric)[, lapply(.SD,
function(x) (x-min(x, na.rm = TRUE))/(max(x, na.rm = TRUE)-min(x, na.rm = TRUE)))]
if(ncol(numerics) == 0){
warning('No numerical columns found.')
}
cbind(categoricals, numerics) %>% as.data.table()
}
}
# 0-1 sigmoid scaling
fscale_sigmoid <- function(x){
if(is.numeric(x)){
1/(1 + exp(-x))
}else if(is.data.frame(x)){
categoricals <- discard(x, is.numeric)
numerics <- keep(as.data.table(x), is.numeric)[, lapply(.SD, function(x) 1/(1 + exp(-x)))]
if(ncol(numerics) == 0){
warning('No numerical columns found.')
}
cbind(categoricals, numerics) %>% as.data.table()
}
}
#' Gets year-month character from a date
ym <- function(date){
month <- data.table::month(date)
month <- ifelse(nchar(month) == 1, paste0('0', month), month)
paste(data.table::year(date), month, sep = '.')
}
scale_dt <- function(dt, numeric_only = F, scaling_function = dq::fscale_sigmoid()){
scale.cols <- colnames(keep(dt, is.numeric))
tabla_scale <- keep(dt, is.numeric)[,lapply(.SD, scaling_function), .SDcols = scale.cols]
if(!numeric_only){
tabla_scale <- cbind(discard(dt, is.numeric), tabla_scale)
}
return(tabla_scale)
}
rolling_trend <- function(dt, xvar, yvar, window_variable, window_length){
# identify x values belonging to each window
sorted_window_values <- suniq(dt[[window_variable]])
window_sets <- embed(sorted_window_values, window_length) %>% split(1:nrow(.))
rolled_trend <- window_sets %>% map( ~ dt[dt[[window_variable]] %in% .x] %>% lm(formula = as.formula(paste(xvar, yvar, sep = '~')))
) %>% map_dbl(~.x$coefficients[2])
return(rolled_trend)
}
zooming_trend <- function(dt, xvar, yvar, zoom_variable, direction = c('left-to-right', 'right-to-left')) {
sorted_zoom_values <- suniq(dt[[zoom_variable]])
# start off with all values until having only the last one
direction <- ifelse(direction[1] == 'left-to-right', 'backward', 'forward' )
zoom_sets <- sorted_zoom_values %>% purrr::accumulate(append, .dir = direction)
zoomed_trend <- zoom_sets %>% map(~ dt[dt[[zoom_variable]] %in% .x] %>% lm(formula = as.formula(paste(xvar, yvar, sep = '~')))) %>%
map_dbl( ~.x$coefficients[2])
return(zoomed_trend)
}
#' Segregated metrics for individuals based on dated observations
#'
#'
#'
#' @param dt data.frame to be analysed.
#' @param id_var_name name of the column in dt that contains the unique identifier.
#' @param num_var_name name of the column in dt that contains the numerical data to be analysed.
#' @param date_var_name name of the column in dt that contains the dates.
#' @param date_format chacater string detailing the format of the date column
#' @param grouping_variables character vector detailing which subgroups will be computed for all variables built from num_var
#' @param label prefix to add to all computed columns so the origin of them can be traced after table joining
var_factory <- function(dt, id_var_name, num_var_name, date_var_name, grouping_variables = NULL, label = NULL, write = FALSE, directory = getwd()) {
# data.table setup
# warning('var_factory is currently defined to only process daily data')
cat('\n\nVariable Factory \n\nCreating summary table for ', label,
', grouped by ', id_var_name, ifelse(test = !is.null(grouping_variables), paste0(' and segregated by ', grouping_variables, '.'),'.'), '\n', sep = '')
cat('Setting up data.table for calculations. \nComputing preliminary variables...\n')
if(class(dt[[date_var_name]]) %>% stringr::str_detect('Date|POSIX', negate = TRUE)){
stop(paste(date_var_name, 'must be a date vector.'))
}
tictoc::tic()
setDT(dt, key = c(id_var_name, grouping_variables))
#copy so data.table operations don't modify by reference over the original
dt1 <- data.table::copy(dt)
# preliminary variables
dt1[, id_var := as.character(eval(as.name(id_var_name)))]
dt1[, num_var := as.numeric(eval(as.name(num_var_name)))] #it's more stable and less expensive to create a duplicate column than using eval on each call
dt1[, date := eval(as.name(date_var_name))]
dt1[, max_date := max(date, na.rm = TRUE)]
dt1[, month := ym(date)]
dt1[, months_on_books := as.integer(floor((max_date - date)/30)+1), by = id_var]
dt1[, total_active_months := data.table::uniqueN(month), by = id_var]
dt1[, AMT_TOTAL_FOR_ID := sum(num_var, na.rm = TRUE), by = id_var]
dt1[, N_TOTAL_FOR_ID := .N, by = id_var]
setkey(dt1, 'id_var')
tictoc::toc()
# Variable computation
tictoc::tic()
cat('Computing table variables...\n')
# when there are no variables to segregate, just create the variables for each distinct value of id_var
if(is.null(grouping_variables)){
id_table <- dt1[, .(AMOUNT = sum(num_var, na.rm = TRUE) %>% as.double(),
N = .N %>% as.numeric(), #to prevent type discrepancies
AV_AMOUNT_PERIOD = sum(num_var, na.rm = TRUE)/total_active_months,
AV_N_PERIOD = .N/total_active_months,
AV_TKT = mean(num_var, na.rm = TRUE),
MAX_AMOUNT =ifelse(max(num_var, na.rm = TRUE) == -Inf, 0, max(num_var)),
MIN_AMOUNT = ifelse(min(num_var, na.rm = TRUE) == Inf, 0, min(num_var)),
RECENCY = max_date - max(date, na.rm = TRUE),
MONTHS_ON_BOOKS = max(months_on_books),
ACTIVE_MONTHS = uniqueN(month)),
by = 'id_var'] %>%
unique() #apparently because of the j definition, it returns duplicate rows
tictoc::toc()
}else{
# create variables for all combinations of grouping_variables
id_cube <- cube(dt1,
j = .(AMOUNT = sum(num_var, na.rm = TRUE),
N = .N %>% as.numeric(),
AV_AMOUNT_PERIOD = sum(num_var, na.rm = TRUE)/total_active_months,
AV_N_PERIOD = .N/total_active_months,
AV_TKT = mean(num_var, na.rm = TRUE),
AMT_PERC = sum(num_var, na.rm = TRUE)/AMT_TOTAL_FOR_ID,
N_PERC = .N/N_TOTAL_FOR_ID,
MAX_AMOUNT = ifelse(max(num_var, na.rm = TRUE) == -Inf, 0, max(num_var)),
MIN_AMOUNT = ifelse(min(num_var, na.rm = TRUE) == Inf, 0, min(num_var)),
RECENCY = max_date - max(date, na.rm = TRUE),
MONTHS_ON_BOOKS = max(months_on_books),
ACTIVE_MONTHS = uniqueN(month)),
by = c('id_var', grouping_variables)) %>%
unique() #apprently because of the j definition, it returns duplicate rows
# replace NAs for the word TOTAL on all grouping variables
walk(c('id_var', grouping_variables), ~id_cube[is.na(eval(parse(text = .x))), (.x) := 'TOTAL'])
# creates the formula for data.table::dcast (format long to wide)
formula <- paste('id_var', paste(grouping_variables, collapse = ' + ' ), sep = ' ~ ')
tictoc::toc()
tictoc::tic()
cat('Reshaping table from long to wide...\n')
#wide table
id_table <- data.table::dcast(id_cube,
formula = formula,
fun.aggregate = sum,
value.var = c('AMOUNT', 'N', 'AV_AMOUNT_PERIOD', 'AV_N_PERIOD', 'AV_TKT', 'AMT_PERC', 'N_PERC', 'MAX_AMOUNT', 'MIN_AMOUNT', 'RECENCY', 'MONTHS_ON_BOOKS', 'ACTIVE_MONTHS'))
# remove TOTAL percentage columns since it's always 100%
id_table <- id_table[, grep("PERC_TOTAL", colnames(id_table)):=NULL]
# remove TOTALs row since it doesn't belong to any id_var
id_table <- id_table[id_var != 'TOTAL']
tictoc::toc()
}
rm(dt1)
data.table::setkey(id_table, id_var)
# add prefixes to columns for future table joining
if(!is.null(label)){
id_table <- dq::colname_prefix(dt = id_table, prefix = label, exclude = 'id_var')
}
# save output table into specified directory
if(write){
directory <- ifelse(is.null(directory), getwd(), directory)
filename <- paste0(directory, '/', label, '_', id_var_name, '.csv')
cat('\nWriting', filename, '\n')
data.table::fwrite(id_table, filename, append = F, quote = F, row.names = F)
}
return(id_table)
}
#' Sets continuous values into bins.
#'
#' @param x Vector of continuous values.
#' @param bins numerical vector specifying the limits of each bin.
#' @examples
#' x <- runif(100)
#' bins <- seq(0,1, by = 0.2)
#' data.table(x = x, bin = bin(x, bins))
bin <- function(x, bins){
if(any(!between(x,min(bins), max(bins)))){
warning('x values outside of bin intervals. Adding extra bins to fit all values')
}
bins <- c(min(0, min(x)), sort(bins), max(x) + 1) %>% unique()
x %>% map_dbl(function(x, y) y[which.min(x >= y)-1], bins)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.