R/data_transformation.R

#' Fast data.table left join by reference
#' @param dt1 Left data.table. Columns from dt2 are added to it by reference.
#' @param dt2 Right data.table whose non-matching columns are added to dt1.
#' @param match_vars Character vector containing the column names to join by. If NULL,
#' the keys of dt1 are used when dt1 is a keyed data.table; otherwise the columns
#' common to both tables are used.
#' @return dt1, with the columns of dt2 joined in by reference.
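#' @examples
#' # Minimal sketch with hypothetical toy tables:
#' dt_left  <- data.table::data.table(id = 1:3, a = c('x', 'y', 'z'), key = 'id')
#' dt_right <- data.table::data.table(id = c(1L, 3L), b = c(10, 30))
#' left_merge(dt_left, dt_right)  # adds column b to dt_left by reference, NA where no match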
left_merge <- function(dt1, dt2, match_vars = NULL){
  if(is.null(match_vars)){
    if(is.data.table(dt1) && !is.null(key(dt1))){
      match_vars <- key(dt1)
    }else{
      match_vars <- intersect(colnames(dt1), colnames(dt2))
    }
    if(length(match_vars) == 0){
      stop('Found no key columns on dt1 and no columns in common between dt1 and dt2. Please specify the column names to join.')
    }
  }
  setDT(dt1, key = match_vars)
  setDT(dt2, key = match_vars)

  cat('Joining tables by =', paste(match_vars, collapse = ', '), '; by reference over the left data.table. \n'  )
  cols_to_add <- colnames(dt2) %nin% match_vars
  dt1[dt2, on = match_vars, names(dt2)[(cols_to_add)] := mget(paste0("i.", names(dt2)[(cols_to_add)]))]
  return(dt1 %>% setkeyv(match_vars))
}

#' Z-Score scaling
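#' @examples
#' # Minimal sketch with hypothetical data; non-numeric columns are kept untouched:
#' fscale_zscore(c(1, 2, 3, 4))
#' fscale_zscore(data.frame(grp = c('a', 'b', 'c'), val = c(10, 20, 40)))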
fscale_zscore <- function(x){
  if(is.numeric(x)){
    (x - mean(x, na.rm = TRUE))/sd(x, na.rm = TRUE)
  }else if(is.data.frame(x)){
    categoricals <- discard(x, is.numeric)
    numerics <- keep(as.data.table(x), is.numeric)[, lapply(.SD,
                                                            function(x) (x-mean(x, na.rm = TRUE))/sd(x, na.rm = TRUE))]
    if(ncol(numerics) == 0){
      warning('No numerical columns found.')
    }
    cbind(categoricals, numerics) %>% as.data.table()
  }else{
    stop('Must provide either a numeric vector or a data.frame')
  }
}

#' 0-1 linear scaling
fscale_linear01 <- function(x){
  if(is.numeric(x)){
    (x - min(x, na.rm = TRUE))/(max(x, na.rm = TRUE) - min(x, na.rm = TRUE))
  }else if(is.data.frame(x)){
    categoricals <- discard(x, is.numeric)
    numerics <- keep(as.data.table(x), is.numeric)[, lapply(.SD,
                                                            function(x) (x-min(x, na.rm = TRUE))/(max(x, na.rm = TRUE)-min(x, na.rm = TRUE)))]
    if(ncol(numerics) == 0){
      warning('No numerical columns found.')
    }
    cbind(categoricals, numerics) %>% as.data.table()
  }else{
    stop('Must provide either a numeric vector or a data.frame')
  }
}

#' 0-1 sigmoid scaling
fscale_sigmoid <- function(x){
  if(is.numeric(x)){
    1/(1 + exp(-x))
  }else if(is.data.frame(x)){
    categoricals <- discard(x, is.numeric)
    numerics <- keep(as.data.table(x), is.numeric)[, lapply(.SD, function(x) 1/(1 + exp(-x)))]
    if(ncol(numerics) == 0){
      warning('No numerical columns found.')
    }
    cbind(categoricals, numerics) %>% as.data.table()
  }else{
    stop('Must provide either a numeric vector or a data.frame')
  }
}

#' Gets year-month character from a date
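#' @examples
#' ym(as.Date('2020-03-12'))  # "2020.03"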
ym <- function(date){
  month <- data.table::month(date)
  month <- ifelse(nchar(month) == 1, paste0('0', month), month)
  paste(data.table::year(date), month, sep = '.')
}

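#' Scale all numeric columns of a data.table
#' @param dt data.table whose numeric columns will be scaled.
#' @param numeric_only if TRUE, non-numeric columns are dropped from the output.
#' @param scaling_function scaling function applied to each numeric column. Defaults to fscale_sigmoid.
#' @examples
#' # Minimal sketch with hypothetical data:
#' scale_dt(data.table::data.table(grp = c('a', 'b'), val = c(10, 20)),
#'          scaling_function = fscale_linear01)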
scale_dt <- function(dt, numeric_only = FALSE, scaling_function = dq::fscale_sigmoid){
  scale.cols <- colnames(keep(dt, is.numeric))
  tabla_scale <- keep(dt, is.numeric)[,lapply(.SD, scaling_function), .SDcols = scale.cols]
  if(!numeric_only){
    tabla_scale <- cbind(discard(dt, is.numeric), tabla_scale)
  }
  return(tabla_scale)
}

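#' Rolling linear trend
#'
#' Fits lm(xvar ~ yvar) on each rolling window of window_variable values and
#' returns the slope (second coefficient) of every fit.
#' @param dt data.table with the observations.
#' @param xvar,yvar names of the columns used in the formula xvar ~ yvar.
#' @param window_variable name of the column whose sorted unique values define the windows.
#' @param window_length number of consecutive window_variable values in each window.
#' @examples
#' # Minimal sketch with hypothetical data:
#' toy <- data.table::data.table(t = rep(1:6, each = 2), x = rnorm(12), y = rnorm(12))
#' rolling_trend(toy, xvar = 'x', yvar = 'y', window_variable = 't', window_length = 3)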
rolling_trend <- function(dt, xvar, yvar, window_variable, window_length){
  # identify x values belonging to each window
  sorted_window_values <- suniq(dt[[window_variable]])
  window_sets <- embed(sorted_window_values, window_length) %>% split(1:nrow(.))
  rolled_trend <- window_sets %>%
    map(~ dt[dt[[window_variable]] %in% .x] %>%
          lm(formula = as.formula(paste(xvar, yvar, sep = '~')))) %>%
    map_dbl(~ .x$coefficients[2])
  return(rolled_trend)
}

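#' Zooming linear trend
#'
#' Fits lm(xvar ~ yvar) on progressively smaller (or larger) subsets of the data,
#' defined by the sorted unique values of zoom_variable, and returns the slope of every fit.
#' @param dt data.table with the observations.
#' @param xvar,yvar names of the columns used in the formula xvar ~ yvar.
#' @param zoom_variable name of the column whose sorted unique values define the zoom sets.
#' @param direction 'left-to-right' starts with all values and shrinks towards the last one;
#' 'right-to-left' grows from the first value until covering all of them.
#' @examples
#' # Minimal sketch with hypothetical data:
#' toy <- data.table::data.table(t = rep(1:6, each = 2), x = rnorm(12), y = rnorm(12))
#' zooming_trend(toy, xvar = 'x', yvar = 'y', zoom_variable = 't')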
zooming_trend <- function(dt, xvar, yvar, zoom_variable, direction = c('left-to-right', 'right-to-left')) {
  sorted_zoom_values <- suniq(dt[[zoom_variable]])
  # left-to-right: start with all values and shrink to the last one; right-to-left: grow from the first value
  direction <- ifelse(direction[1] == 'left-to-right', 'backward', 'forward')
  zoom_sets <- sorted_zoom_values %>% purrr::accumulate(append, .dir = direction)

  zoomed_trend <- zoom_sets %>% map(~ dt[dt[[zoom_variable]] %in% .x] %>% lm(formula = as.formula(paste(xvar, yvar, sep = '~')))) %>%
    map_dbl( ~.x$coefficients[2])
  return(zoomed_trend)
}

#' Segregated metrics for individuals based on dated observations
#'
#' @param dt data.frame to be analysed.
#' @param id_var_name name of the column in dt that contains the unique identifier.
#' @param num_var_name name of the column in dt that contains the numerical data to be analysed.
#' @param date_var_name name of the column in dt that contains the dates (must be of class Date or POSIX).
#' @param grouping_variables character vector detailing which subgroups will be computed for all variables built from num_var.
#' @param label prefix to add to all computed columns so their origin can be traced after table joining.
#' @param write if TRUE, the resulting table is written as a csv file.
#' @param directory directory where the csv file is saved when write = TRUE. Defaults to getwd().
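#' @examples
#' # Minimal sketch with hypothetical transaction-like data:
#' txns <- data.table::data.table(client = rep(c('A', 'B'), each = 6),
#'                                amount = runif(12, 10, 100),
#'                                when = rep(seq(as.Date('2019-01-01'), by = 'month', length.out = 6), 2),
#'                                channel = sample(c('web', 'store'), 12, replace = TRUE))
#' var_factory(txns, id_var_name = 'client', num_var_name = 'amount',
#'             date_var_name = 'when', grouping_variables = 'channel', label = 'TXN')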
var_factory <- function(dt, id_var_name, num_var_name, date_var_name, grouping_variables = NULL, label = NULL, write = FALSE, directory = getwd()) {

  # data.table setup
  # warning('var_factory is currently defined to only process daily data')

  cat('\n\nVariable Factory \n\nCreating summary table for ', label,
      ', grouped by ', id_var_name,
      if(!is.null(grouping_variables)) paste0(' and segregated by ', paste(grouping_variables, collapse = ', '), '.') else '.',
      '\n', sep = '')

  cat('Setting up data.table for calculations. \nComputing preliminary variables...\n')

  if(!inherits(dt[[date_var_name]], c('Date', 'IDate', 'POSIXct', 'POSIXt'))){
    stop(paste(date_var_name, 'must be a date vector.'))
  }

  tictoc::tic()
  setDT(dt, key = c(id_var_name, grouping_variables))

  #copy so data.table operations don't modify by reference over the original
  dt1 <- data.table::copy(dt)

  # preliminary variables
  dt1[, id_var   := as.character(eval(as.name(id_var_name)))]
  dt1[, num_var  := as.numeric(eval(as.name(num_var_name)))] #it's more stable and less expensive to create a duplicate column than using eval on each call
  dt1[, date     := eval(as.name(date_var_name))]
  dt1[, max_date := max(date, na.rm = TRUE)]
  dt1[, month := ym(date)]
  dt1[, months_on_books := as.integer(floor((max_date - date)/30)+1), by = id_var]
  dt1[, total_active_months := data.table::uniqueN(month), by = id_var]
  dt1[, AMT_TOTAL_FOR_ID := sum(num_var, na.rm = TRUE), by = id_var]
  dt1[, N_TOTAL_FOR_ID := .N, by = id_var]
  setkey(dt1, 'id_var')
  tictoc::toc()

  # Variable computation
  tictoc::tic()
  cat('Computing table variables...\n')
  # when there are no variables to segregate, just create the variables for each distinct value of id_var
  if(is.null(grouping_variables)){
    id_table <- dt1[, .(AMOUNT = sum(num_var, na.rm = TRUE) %>% as.double(),
                        N = .N %>% as.numeric(), #to prevent type discrepancies
                        AV_AMOUNT_PERIOD = sum(num_var, na.rm = TRUE)/total_active_months,
                        AV_N_PERIOD = .N/total_active_months,
                        AV_TKT = mean(num_var, na.rm = TRUE),
                        MAX_AMOUNT = ifelse(max(num_var, na.rm = TRUE) == -Inf, 0, max(num_var, na.rm = TRUE)),
                        MIN_AMOUNT = ifelse(min(num_var, na.rm = TRUE) ==  Inf, 0, min(num_var, na.rm = TRUE)),
                        RECENCY = max_date - max(date, na.rm = TRUE),
                        MONTHS_ON_BOOKS = max(months_on_books),
                        ACTIVE_MONTHS = uniqueN(month)),
                    by = 'id_var'] %>%
      unique() #apparently because of the j definition, it returns duplicate rows
    tictoc::toc()
  }else{
    # create variables for all combinations of grouping_variables
    id_cube <- cube(dt1,
                    j =  .(AMOUNT = sum(num_var, na.rm = TRUE),
                           N = .N %>% as.numeric(),
                           AV_AMOUNT_PERIOD = sum(num_var, na.rm = TRUE)/total_active_months,
                           AV_N_PERIOD = .N/total_active_months,
                           AV_TKT = mean(num_var, na.rm = TRUE),
                           AMT_PERC = sum(num_var, na.rm = TRUE)/AMT_TOTAL_FOR_ID,
                           N_PERC = .N/N_TOTAL_FOR_ID,
                           MAX_AMOUNT = ifelse(max(num_var, na.rm = TRUE) == -Inf, 0, max(num_var, na.rm = TRUE)),
                           MIN_AMOUNT = ifelse(min(num_var, na.rm = TRUE) ==  Inf, 0, min(num_var, na.rm = TRUE)),
                           RECENCY = max_date - max(date, na.rm = TRUE),
                           MONTHS_ON_BOOKS = max(months_on_books),
                           ACTIVE_MONTHS = uniqueN(month)),
                    by = c('id_var', grouping_variables)) %>%
      unique() #apparently because of the j definition, it returns duplicate rows

    # replace NAs for the word TOTAL on all grouping variables
    walk(c('id_var', grouping_variables), ~id_cube[is.na(eval(parse(text = .x))), (.x) := 'TOTAL'])

    # creates the formula for data.table::dcast (format long to wide)
    formula <- paste('id_var', paste(grouping_variables, collapse = ' + ' ), sep = ' ~ ')
    tictoc::toc()

    tictoc::tic()
    cat('Reshaping table from long to wide...\n')

    #wide table
    id_table <- data.table::dcast(id_cube,
                                  formula = formula,
                                  fun.aggregate = sum,
                                  value.var = c('AMOUNT', 'N', 'AV_AMOUNT_PERIOD', 'AV_N_PERIOD', 'AV_TKT', 'AMT_PERC', 'N_PERC', 'MAX_AMOUNT', 'MIN_AMOUNT', 'RECENCY', 'MONTHS_ON_BOOKS', 'ACTIVE_MONTHS'))

    # remove TOTAL percentage columns since it's always 100%
    id_table <- id_table[, (grep("PERC_TOTAL", colnames(id_table), value = TRUE)) := NULL]

    # remove TOTALs row since it doesn't belong to any id_var
    id_table <- id_table[id_var != 'TOTAL']
    tictoc::toc()
  }
  rm(dt1)
  data.table::setkey(id_table, id_var)
  # add prefixes to columns for future table joining
  if(!is.null(label)){
    id_table <- dq::colname_prefix(dt = id_table, prefix = label, exclude = 'id_var')
  }

  # save output table into specified directory
  if(write){
    directory <- ifelse(is.null(directory), getwd(), directory)
    filename <- paste0(directory, '/', label, '_', id_var_name, '.csv')
    cat('\nWriting', filename, '\n')
    data.table::fwrite(id_table, filename, append = FALSE, quote = FALSE, row.names = FALSE)
  }
  return(id_table)
}

#' Sets continuous values into bins.
#'
#' @param x Vector of continuous values.
#' @param bins numerical vector specifying the limits of each bin.
#' @examples
#' x <- runif(100)
#' bins <- seq(0,1, by = 0.2)
#' data.table(x = x, bin = bin(x, bins))
bin <- function(x, bins){
  if(any(!between(x,min(bins), max(bins)))){
    warning('x values outside of bin intervals. Adding extra bins to fit all values')
  }
  # extend the bin limits so every x value falls inside an interval
  bins <- c(min(0, min(x)), sort(bins), max(x) + 1) %>% unique()
  # for each value, return the lower limit of the bin that contains it
  x %>% map_dbl(function(x, y) y[which.min(x >= y) - 1], bins)
}
}
pheymanss/dq documentation built on March 12, 2020, 1:29 a.m.