R/data_communicating.R


# Data Communicating ------------------------------------------------------

#' Writes a list of data.frames into a single Excel sheet
#' @param data_list A list of data.frames to be written on the same Excel sheet.
#' @param filename The name of the Excel file to be created (without extension).
#' @param rows_between_tables The number of Excel rows separating each data.frame written on the sheet. Default is 1.
write_list_on_sheet <- function(data_list, filename, rows_between_tables = 1){
  # selects the list elements that are data.frames or matrices
  data_mat_indx <- purrr::map_lgl(data_list, function(x) 'data.frame' %in% class(x) || 'matrix' %in% class(x))
  if(!all(data_mat_indx)){
    warning(paste('Some elements of the list provided are not rectangular data:',
                  paste(names(data_list)[!data_mat_indx], collapse = ', ')))
  }
  data_list <- data_list[data_mat_indx]
  tablenames <- names(data_list)
  wb <- openxlsx::createWorkbook()
  openxlsx::addWorksheet(wb, '1', gridLines = TRUE)
  startrow = 1
  for (table in data_list){

    openxlsx::writeDataTable(wb, sheet = 1, x = table, xy = c(1, startrow))
    startrow = startrow + nrow(table) + rows_between_tables + 1
  }
  openxlsx::saveWorkbook(wb, paste0(filename,'.xlsx'), overwrite = TRUE)
}
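
# Example usage (minimal sketch; assumes openxlsx is installed and writes
# 'iris_summaries.xlsx' to the working directory):
# tables <- list(head_iris      = head(iris),
#                species_counts = as.data.frame(table(iris$Species)))
# write_list_on_sheet(tables, 'iris_summaries', rows_between_tables = 2)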


# wrapper over data.table::fwrite adding date and/or time tags to the filename
save_to_csv <- function(data, filename, include_date = TRUE, include_time = FALSE, ...){
  # extract the file extension if the filename already contains one
  if(filename %>% stringr::str_detect('\\.[:alpha:]{3}$')){
    filetype <- filename %>% stringr::str_extract('\\.[:alpha:]{3}$')
    filename <- filename %>% stringr::str_remove(filetype)
  }else{ # if not, the file will be saved as .csv
    filetype <- '.csv'
  }

  # add date to filename
  if(include_date){
    filename <-  paste(filename,
                       Sys.time() %>% stringr::str_sub(1, 10) %>% stringr::str_remove_all('-'),
                       sep = '_')
  }
  # add time to filename
  if(include_time){
    filename <- paste(filename,
                      Sys.time() %>% stringr::str_sub(12, 16) %>% stringr::str_remove_all(':'),
                      sep = '_')
  }
  data.table::fwrite(data, paste0(filename, filetype), nThread = 30, ...)
}
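
# Example usage (minimal sketch; the output file names shown are illustrative):
# save_to_csv(iris, 'iris')                            # writes iris_<yyyymmdd>.csv
# save_to_csv(iris, 'iris.txt', include_time = TRUE)   # writes iris_<yyyymmdd>_<hhmm>.txt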

#' Writes a nested list of data.frames to an Excel workbook, one sheet per top-level list element.
write_nested_list_on_sheets <- function(data_lists, filename, rows_between_tables = 1, tbstyle = 'TableStyleLight1'){
  # create workbook object and add one sheet per element of data_lists
  wb <- openxlsx::createWorkbook()
  purrr::walk2(data_lists, names(data_lists), add_sheet_wb, wb = wb, tbstyle = tbstyle, rows_between_tables = rows_between_tables)
  openxlsx::saveWorkbook(wb, paste0(filename,'.xlsx'), overwrite = TRUE)
}
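
# Example usage (minimal sketch; writes 'report.xlsx' with one sheet per
# top-level list element, each sheet holding its tables stacked vertically):
# nested <- list(iris_tables   = list(head(iris), tail(iris)),
#                mtcars_tables = list(head(mtcars)))
# write_nested_list_on_sheets(nested, 'report', rows_between_tables = 2)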

# helper for write_nested_list_on_sheets: writes every table in data_list onto a new sheet named `name`
add_sheet_wb <- function(wb, data_list, name, tbstyle = 'TableStyleLight1', rows_between_tables = 1){
  openxlsx::addWorksheet(wb, name, gridLines = FALSE)
  startrow = 1
  for (table in data_list){
    openxlsx::writeDataTable(wb, sheet = name, x = table, xy = c(1, startrow), tableStyle = tbstyle)
    startrow = startrow + nrow(table) + rows_between_tables + 1
  }
  return(wb)
}

#' Unique sorted values of a vector
#'
#' @param x A vector
#' @param decreasing Logical, whether to sort in decreasing order. Default is FALSE.
#' @return A vector containing the unique values of x, sorted.
suniq <- function(x, decreasing = FALSE){x %>% unique() %>% sort(decreasing = decreasing)}

#' Masks an identification column with sequential integers.
#' @param data data.frame with the column to be masked.
#' @param column Character string specifying the name of the column to be masked.
#' @return A data.table with a new column \code{id_enc}, a sequential integer
#' mapped from the masked column. The original column is deleted.
masktercard <- function(data, column){
  data.table::setDT(data)[get(column) != '', id_enc := as.character(.GRP), by = column]
  data[, (column) := NULL]
  warning('masktercard() deleted the column with the actual ID values')
  return(data)
}
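
# Example usage (minimal sketch; 'card_number' is a hypothetical ID column):
# dt <- data.table::data.table(card_number = c('A1', 'B2', 'A1', ''),
#                              amount      = c(10, 25, 5, 40))
# masktercard(dt, 'card_number')
# #> card_number is dropped; id_enc maps each distinct non-empty ID to '1', '2', ...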

#' Counts and percentages of each combination of variables
#'
#' Inspired by \code{janitor::tabyl}, computes the counts and percentages
#' of all value combinations of the specified columns in a data.frame.
#' @param dt A data.frame or data.table.
#' @param vars Character vector specifying the columns to count.
#' @return A data.table with each combination of values, and the counts and percentages for each combination.
data.tabyl <- function(dt, vars){
  if(!data.table::is.data.table(dt)){
    setDT(dt, key = vars)
  }
  dt[,.N, by = vars][,percentage := N/sum(N)][order(-N)]
}
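
# Example usage (minimal sketch):
# dt <- data.table::as.data.table(mtcars)
# data.tabyl(dt, c('cyl', 'gear'))
# #> one row per (cyl, gear) combination with its count N and percentage, ordered by N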

# Counts the amount of observations from dt1 present in a subset of dt1 (dt2).
#
# @param by_cols Character vector specifying which columns will be considered for the matching
# @param percentage Boolean specifying if the function returns a percentage or the raw
#  counts of matches and non-matches
# @return
# match_coverage <- function(dt1, dt2, by_cols = NULL){
#   if(is.null(by_cols)){
#     by_cols <- intersect(colnames(dt1), colnames(dt2))
#     cat('Matching by ', paste(by_cols, collapse = ','), '\n')
#   }
#
#   data.table::setDT(dt1, key = by_cols)
#   data.table::setDT(dt2, key = by_cols)
#   return(nrow(dt1[dt2, on = by_cols, nomatch = 0]))
# }

#' Slope of a linear regression of y on x
trend <- function(x, y){ # 368% of the execution time
  lm(y ~ x)$coefficients[2]
}

#' Slope between the first and last observations
#'
#' This is identical to the average of the slopes between consecutive
#' observations, weighted by the x-axis distance between them.
slope <- function(x,y){
  as.numeric(tail(y, 1) - y[1])/as.numeric(tail(x, 1) - x[1])
}

#' Average slope between consecutive observations, weighted by the distance between x-axis observations
weighted_av_slope <- function(x, y){
  diff_x <- diff(x) %>% as.numeric()
  diff_y <- diff(y) %>% as.numeric()
  slopes <- diff_y/diff_x
  # the weights are normalised to sum to 1, so the weighted average is a plain sum
  weights <- diff_x[is.finite(slopes)]/sum(diff_x[is.finite(slopes)])
  sum(weights*(slopes[is.finite(slopes)]))
}
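
# Example (minimal sketch): on perfectly linear data the three estimates agree.
# x <- c(1, 2, 4, 8)
# y <- 2 * x + 1
# trend(x, y)             # 2
# slope(x, y)             # 2
# weighted_av_slope(x, y) # 2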


#' Computes group averages for all numerical variables specified.
#' @param dt Table to be analysed. It is expected to be a data.table.
#' @param group_col Categorical column mapping each observation into one group.
#' @param nums Character vector with the names of the columns to average. Default is all numerical variables.
#' @return A data.table with one row per group and the averages of all specified variables.
group_averages <- function(dt, group_col, nums = names(dt)[sapply(dt, is.numeric)]){
  # replace NAs before averaging
  dt1 <- data.table::copy(dt) %>% freplace_na()
  averages <- dt1[, lapply(.SD, mean, na.rm = TRUE), by = group_col, .SDcols = setdiff(nums, group_col)]
  return(averages)
}
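
# Example usage (minimal sketch; assumes the package helper freplace_na() is
# available and passes a table with no missing values through unchanged):
# dt <- data.table::as.data.table(iris)
# group_averages(dt, 'Species')
# #> one row per Species with the mean of each numeric column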

# group_stats <- function(dt, group_col, nums = colnames(keep(dt, is.numeric))){
#   # replace NAs
#   dt1 <- data.table::copy(dt) %>% freplace_na()
#   averages <- keep(dt1, colnames(dt1) %in% c(nums, group_col))[, map(.SD, mean, na.rm = TRUE), by = group_col]
#   return(averages)
# }

#' Computes averages of all specified numerical variables and distributions of categorical variables,
#' broken down by a specified grouping column.
#' @param dt Table containing the data.
#' @param group_col Character string specifying the name of the column to be used as grouping variable.
#' @param nums Character vector of all the numerical variables to be analysed. Default is all numerical variables found on the table.
#' @param cats Character vector of all the categorical variables to be analysed. Default is all categorical variables found on the table.
#' @param compare_with_total Logical variable specifying whether the function also returns the percentage difference between each group
#' value and the population average.
#' @return A data.table with a column for each group, with the average values of all numerical variables specified and the distributions of all categorical variables.
grouped_report <- function(dt, group_col,
                           nums = setdiff(names(dt)[sapply(dt, is.numeric)], group_col),
                           cats = setdiff(names(dt)[!sapply(dt, is.numeric)], group_col),
                           compare_with_total = FALSE){
  cat('Computing population distribution...\n')

  # categorical distributions ----------------------------------------------
  categorical_percentages <- NULL
  total_population_cat_perc <- cats %>%
    # i. use dq::data.tabyl to get the percentages of each value for each categorical variable
    purrr::map(~data.tabyl(dt, .x)[, c(1,3)]) %>%
    # ii. add the name of the column as prefix to each of its possible values
    purrr::map2(cats, ~data.table(var = paste(.y, .x[[1]], sep = '_'), percentage = .x[[2]])) %>%
    # iii. bind for easy merging with group-segregated percentages
    data.table::rbindlist() %>% purrr::set_names(c('var', 'total_population'))
  categorical_percentages <- dt %>% split(dt[[group_col]]) %>%
    # i. for each group, compute the distribution of each categorical column
    purrr::map(function(dt, cat, group) purrr::map(cat, ~data.tabyl(dt, c(.x, group))[,c(1,2,4)]), cat = cats, group = group_col) %>%
    # ii. add the name of the column as prefix to each of its possible values
    purrr::map(~purrr::map2(., cats, function(dt, prefix) data.table::data.table(var = paste(prefix, dt[[1]], sep = '_'),
                                                                                 group = dt[[2]],
                                                                                 percentage = dt[[3]]))) %>%
    # iii. bind all tables for reshaping
    purrr::map(rbindlist) %>% data.table::rbindlist() %>%
    # iv. reshape into report-friendly format
    data.table::dcast(var ~ group, value.var = 'percentage') %>%
    # v. join with total population distributions
    merge(total_population_cat_perc, by = 'var')

  # average values----------------------------------------------------------
  cat('Computing total population averages...\n')
  group_averages_dt <- NULL
  # build stats for each cluster and for the global population
  population_averages <- dt[, ..nums][, lapply(.SD, mean, na.rm = TRUE)][, (group_col) := 'population_average']

  cat('Computing group averages...\n')
  group_averages_dt <- group_averages(dt, group_col, nums)[, (group_col) := as.character(eval(as.name(group_col)))] %>%
    rbind(population_averages, use.names = TRUE)
  var_names <- colnames(group_averages_dt)[-1]
  group_averages_dt <- group_averages_dt[, -1] %>% data.table::transpose()
  group_averages_dt[, var := var_names]
  data.table::setcolorder(group_averages_dt, 'var')

  group_stats <- rbindlist(list(group_averages_dt, categorical_percentages), use.names = FALSE) %>% setnames(c('var', suniq(dt[[group_col]]), 'total_population'))

  if(compare_with_total){
    cat('Comparing group values against population averages...\n')
    vs_tot <- group_stats[, -1][, map(.SD, ~.x/group_stats$total_population-1), .SDcols = suniq(dt[[group_col]])] %>% colname_prefix('per_dist_av', sep = '.')
    group_stats <- group_stats %>% cbind(vs_tot)
  }

  return(group_stats)
}
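
# Example usage (minimal sketch; assumes the package helper freplace_na() is
# available; the 'size' column is created just for illustration):
# dt <- data.table::as.data.table(iris)
# dt[, size := ifelse(Sepal.Length > 5.8, 'large', 'small')]
# grouped_report(dt, 'Species')
# #> one row per variable (numeric averages and categorical percentages),
# #> one column per Species plus a total_population column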

#' data.table of the top n values of count_col by frequency, segregated by
#' grouping_col
#' @param dt Table with grouping column and column to count.
#' @param grouping_col String, name of the column to segregate by.
#' @param count_col String, name of the column to count by.
#' @param n Integer, number of top values to keep per group.
#' @return A data.table with the names, counts and percentages of the top n values of count_col, for each group and for the total population.
top_by_group_counts <- function(dt, grouping_col, count_col, n){
  cluster_names <- c(dq::suniq(dt[[grouping_col]]), 'total_population')
  data.table::setDT(dt)
  topn <- dt %>% split(dt[[grouping_col]]) %>%
    purrr::append(list(total_population = dt)) %>%
    purrr::map(~dq::data.tabyl(.x, count_col)[1:n])
  purrr::map2(topn, cluster_names, ~purrr::set_names(.x, c(count_col, .y, paste0(.y, '.perc')))) %>%
    purrr::reduce(merge, all = TRUE, by = count_col)
}
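
# Example usage (minimal sketch; assumes the dq package is installed so the
# dq:: calls resolve): the two most frequent gear values within each cyl
# group and in the total population, with their counts and percentages.
# dt <- data.table::as.data.table(mtcars)
# top_by_group_counts(dt, grouping_col = 'cyl', count_col = 'gear', n = 2)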

#' data.table of the top n values of count_col by summed amount, segregated by
#' grouping_col
#' @param dt Table with grouping column and column to sum.
#' @param grouping_col String, name of the column to segregate by.
#' @param count_col String, name of the numeric column whose values are grouped and summed.
#' @param n Integer, number of top values to keep per group.
#' @return A data.table with the names, total amounts and percentages of the top n values of count_col, for each group and for the total population.
top_by_group_amounts <- function(dt, grouping_col, count_col, n){
  cluster_names <- c(dq::suniq(dt[[grouping_col]]), 'total_population')
  data.table::setDT(dt)
  dt1 <- data.table::copy(dt)
  dt1[, amt := eval(as.name(count_col))]
  topamt <- dt1 %>% split(dt1[[grouping_col]]) %>%
    purrr::append(list(total_population = dt1)) %>%
    purrr::map(~.x[,.(amount = sum(amt)), by = count_col][, perc := amount/sum(amount)][order(-amount)][1:n])
  purrr::map2(topamt, cluster_names, ~purrr::set_names(.x, c(count_col, .y, paste0(.y, '.perc')))) %>%
    purrr::reduce(merge, all = TRUE, by = count_col)
}
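
# Example usage (minimal sketch; assumes the dq package is installed so the
# dq:: calls resolve): the two hp values contributing the largest summed hp
# within each cyl group and in the total population.
# dt <- data.table::as.data.table(mtcars)
# top_by_group_amounts(dt, grouping_col = 'cyl', count_col = 'hp', n = 2)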