R/data_communicating.R

Defines functions top_by_group_amounts top_by_group_counts grouped_report group_averages data.tabyl masktercard suniq write_list_on_sheet

# Data Communicating ------------------------------------------------------

#' Writes a list of data.frames into a single excel sheet
#' @param data_list A list of data.frames o be written on the same Esxcel sheet.
#' @param filename the name of the excel file to be created.
#' @param rows_between_tables the amount of excel rows separating each data.frame written on the sheet. Default is 1.
write_list_on_sheet <- function(data_list, filename, rows_between_tables = 1){
  # selects the list elements that are data.frames or matrices
  data_mat_indx <- purrr::map_lgl(data_list, function(x) 'data.frame' %in% class(x) || 'matrix' %in% class(x))
  if(!all(data_mat_indx)){
    warning(paste('Some elements of the list provided are not rectangular data:', names(data_list[!data_mat_indx])))
  }
  data_list <- data_list[data_mat_indx]
  wb <- openxlsx::createWorkbook()
  openxlsx::addWorksheet(wb, '1', gridLines = TRUE)
  startrow = 1
  for (table in data_list){
    openxlsx::writeDataTable(wb, sheet = 1, x = table, xy = c(1, startrow))
    startrow = startrow + nrow(table) + rows_between_tables + 1
  }
  openxlsx::saveWorkbook(wb, paste0(filename,'.xlsx'), overwrite = TRUE)
}

#' Unique sorted values of a vector
#'
#' @param x A vector
#' @return A vector containing the unique values of x, sorted.
suniq <- function(x){x %>% unique() %>% sort()}

#' Masks an identification column with sequantial integers.
#' @param data data.frame with the column to be masked
#' @param column Character vector specifying the name of the colum to be masked.
#' @return a data.table with a new colum \code{id_enc}, a sequential integer
#' mapped from the masked column
masktercard <- function(data, column){
  setDT(data)[column != '', id_enc := as.character(.GRP), by = column]
  data[, (column) := NULL]
  warning('masktercard() deleted the column with the actual ID values')
}

#' Counts and percentages of each combination of variables
#'
#' Inspired by \code{janitor::tabyl}, computes the counts and precentages
#' of all value combinations of the specified columns in a data.frame.
#' @param vars character vector specifying the columns to count.
#' @return a data.table with each combination of values, and the counts and percentages for each combination.
data.tabyl <- function(dt, vars){
  if(!is.data.table(dt)){
    setDT(dt, key = vars)
  }
  dt[,.N, by = vars][,percentage := N/sum(N)][order(-N)]
}

# Counts the amount of observations from dt1 present in a subset of dt1 (dt2).
#
# @param by_cols Character vector specifying which columns will be considered for the matching
# @param percentage Boolean specifying if the function returns a percentage or the raw
#  counts of matches an no-matches
# @return
# match_coverage <- function(dt1, dt2, by_cols = NULL){
#   if(is.null(by_cols)){
#     by_cols <- intersect(colnames(dt1), colnames(dt2))
#     cat('Matching by ', paste(by_cols, collapse = ','), '\n')
#   }
#
#   data.table::setDT(dt1, key = by_cols)
#   data.table::setDT(dt2, key = by_cols)
#   return(nrow(dt1[dt2, on = by_cols, nomatch = 0]))
# }

#' Computes averages for all numerical variables specified.
#' @param dt table to be analised. It is set to a data.table
#' @param groups categorical variable mapping each observation into one group
#' @param numeric_summary_columns character vector with the names of the columns to compute. Default is all numerical variables.
#' @return A data.table with the averages of all variables
group_averages <- function(dt, groups, numeric_summary_columns = colnames(keep(dt, is.numeric))){
  # replace NAs
  dt1 <- data.table::copy(dt) %>% freplace_na()
  dt1[, group :=  ..groups]
  averages <- dt1[, c(..numeric_summary_columns, 'group')][, lapply(.SD, mean, na.rm = TRUE), by = 'group']
  return(averages)
}


#' Computes averages of all specified numerical variables and distribuitions of categorical variables,
#' broken down by a specified column group.
#' @param dt Table containing the data.
#' @param group_col Character string specifying the nae of the column to be used as groupig variable.
#' @param nums Character vector of all the numerical variables to be analysed. Default is all numerical variables found on the table.
#' @param cats Character vector of all the categorical variables to be analysed. Default is all categorical variables found on the table.
#' @param compare_with_total Logical variable specifying wheteher the function also retunrs the percentage difference between each group
#' value and the population average
#' @return A data.table with a column for each group, with the average values of all numerical variables specified and the distribuitions of all categorical variables
grouped_report <- function(dt, group_col,
                           nums = colnames(keep(dt, is.numeric)),
                           cats = colnames(purrr::discard(dt, is.numeric)) %>% stringr::str_subset(group_col, negate = TRUE),
                           compare_with_total = FALSE){
  cat('Computing population distribuition...\n')

  # categorical distribuitions ----------------------------------------------
  categorical_percentages <- NUL
  total_population_cat_perc <- cats %>%
    # i. use dq::data.tabyl to get the percentages of each value for each categorical variable
    purrr::map(~data.tabyl(dt, .x)[, c(1,3)]) %>%
    # ii. add the name of the column as prefix to each of its possible values
    purrr::map2(cats, ~data.table(var = paste(.y, .x[[1]], sep = '_'), percentage = .x[[2]])) %>%
    # iii. bind for easy merging with group-segregated percentages
    data.table::rbindlist() %>% purrr::set_names(c('var', 'total_population'))
  categorical_percentages <- dt %>% split(dt[[group_col]]) %>%
    # i. for each group, compute the distribuition of each categorical column
    purrr::map(function(dt, cat, group) purrr::map(cat, ~data.tabyl(dt, c(.x, group))[,c(1,2,4)]), cat = cats, group = group_col) %>%
    # ii. add the name of the column as prefix to each of its possible values
    purrr::map(~purrr::map2(., cats, function(dt, prefix) data.table::data.table(var = paste(prefix, dt[[1]], sep = '_'),
                                                                                 group = dt[[2]],
                                                                                 percentage = dt[[3]]))) %>%
    # iii. bind alll tables for rechaping
    purrr::map(rbindlist) %>% data.table::rbindlist() %>%
    # iv. reshape into report-friendly format
    data.table::dcast(var ~ group, value.var = 'percentage') %>%
    # v. join with total population distribuitions
    merge(total_population_cat_perc, by = 'var')

  # average values----------------------------------------------------------
  cat('Computing total population averages...\n')
  group_averages_dt <- NULL
  # build stats for each cluster and for the global population
  population_averages <- dt[, ..nums][, lapply(.SD, mean, na.rm = TRUE)][, group := 'population_average']

  cat('Computing group averages...\n')
  group_averages_dt <- group_averages(dt, groups, nums =  nums)[order(group)] %>%
    rbind(population_averages, use.names = TRUE)
  var_names <- colnames(group_averages_dt[, -1])
  group_averages_dt <-  group_averages_dt[, -1] %>% data.table::transpose()
  group_averages_dt <-  group_averages_dt[, var := var_names][,c(ncol(..group_averages_dt),1:(ncol(..group_averages_dt)-1))]

  group_stats <- rbindlist(list(group_averages_dt, categorical_percentages), use.names = FALSE) %>% setnames(c('var', suniq(dt[[group_col]]), 'total_population'))

  if(compare_with_total){
    cat('Comparing group values against population averages...\n')
    vs_tot <- group_stats[, -1][, map(.SD, ~.x/group_stats$total_population-1), .SDcols = suniq(dt[[group_col]])] %>% colname_prefix('per_dist_av', sep = '.')
    group_stats <- group_stats %>% cbind(vs_tot)
  }

  return(group_stats)
}

#' data.table of the top n groups by count_col, segregated by
#' grouping_col
#' @param dt Table with grouping column and column to count.
#' @param grouping_col String, name of the column to segregate by.
#' @param count_col String, name of the column to count by.
#'

top_by_group_counts <- function(dt, grouping_col, count_col, n){
  cluster_names <- c(dq::suniq(dt[[grouping_col]]), 'total_population')
  data.table::setDT(dt)
  topn <- dt %>% split(dt[[grouping_col]]) %>%
    purrr::append(list(total_population = dt)) %>%
    purrr::map(~dq::data.tabyl(.x, count_col)[1:n])
  purrr::map2(topn, cluster_names, ~purrr::set_names(.x, c(count_col, .y, paste0(.y, '.perc')))) %>%
    purrr::reduce(merge, all = TRUE, by = count_col)
}

top_by_group_amounts <- function(dt, grouping_col, count_col, n){
  cluster_names <- c(dq::suniq(dt[[grouping_col]]), 'total_population')
  data.table::setDT(dt)
  dt1 <- data.table::copy(dt)
  dt1[, amt := eval(as.name(count_col))]
  topamt <- dt1 %>% split(dt1[[grouping_col]]) %>%
    purrr::append(list(total_population = dt1)) %>%
    purrr::map(~.x[,.(amount = sum(amt)), by = count_col][, perc := amount/sum(amount)][order(-amount)][1:n])
  purrr::map2(topamt, cluster_names, ~purrr::set_names(.x, c(count_col, .y, paste0(.y, '.perc')))) %>%
    purrr::reduce(merge, all = TRUE, by = count_col)
}
pheymanss/dq documentation built on Jan. 17, 2020, 1:09 p.m.