# Data Communicating ------------------------------------------------------
#' Writes a list of data.frames into a single excel sheet
#' @param data_list A list of data.frames o be written on the same Esxcel sheet.
#' @param filename the name of the excel file to be created.
#' @param rows_between_tables the amount of excel rows separating each data.frame written on the sheet. Default is 1.
write_list_on_sheet <- function(data_list, filename, rows_between_tables = 1){
# selects the list elements that are data.frames or matrices
data_mat_indx <- purrr::map_lgl(data_list, function(x) 'data.frame' %in% class(x) || 'matrix' %in% class(x))
if(!all(data_mat_indx)){
warning(paste('Some elements of the list provided are not rectangular data:', names(data_list[!data_mat_indx])))
}
data_list <- data_list[data_mat_indx]
tablenames <- names(data_list)
wb <- openxlsx::createWorkbook()
openxlsx::addWorksheet(wb, '1', gridLines = TRUE)
startrow = 1
for (table in data_list){
openxlsx::writeDataTable(wb, sheet = 1, x = table, xy = c(1, startrow))
startrow = startrow + nrow(table) + rows_between_tables + 1
}
openxlsx::saveWorkbook(wb, paste0(filename,'.xlsx'), overwrite = TRUE)
}
# wrapper over data.table::fwrite adding date and/or time tags to the filename
save_to_csv <- function(data, filename, include_date = TRUE, include_time = FALSE, ...){
# extract file type if filename already contains one
if(filename %>% stringr::str_detect('\\.[:alpha:]{3}')){
filetype <- filename %>% stringr::str_extract('\\.[:alpha:]{3}')
filename <- filename %>% stringr::str_remove(filetype)
}else{ # if not, file will be saved as .csv
filetype <- '.csv'
}
# add date to filename
if(include_date){
filename <- paste(filename,
Sys.time() %>% stringr::str_sub(1, 10) %>% stringr::str_remove_all('-'),
sep = '_')
}
# add time to filename
if(include_time){
filename <- paste(filename,
Sys.time() %>% stringr::str_sub(12, 16) %>% stringr::str_remove_all(':'),
sep = '_')
}
data.table::fwrite(data, paste0(filename, filetype), nThread = 30, ...)
}
write_nested_list_on_sheets <- function(data_lists, filename, rows_between_tables, tbstyle = 'TableStyleLight1'){
# create workbook object
wb <- openxlsx::createWorkbook()
walk2(data_lists, names(data_lists), add_sheet_wb, wb = wb, tbstyle = tbstyle, rows_between_tables = rows_between_tables)
openxlsx::saveWorkbook(wb, paste0(filename,'.xlsx'), overwrite = TRUE)
}
add_sheet_wb <- function(wb, data_list, name, tbstyle = 'TableStyleLight1', rows_between_tables = 1){
openxlsx::addWorksheet(wb, name, gridLines = FALSE)
startrow = 1
for (table in data_list){
openxlsx::writeDataTable(wb, sheet = name, x = table, xy = c(1, startrow), tableStyle = tbstyle)
startrow = startrow + nrow(table) + rows_between_tables + 1
}
return(wb)
}
#' Unique sorted values of a vector
#'
#' @param x A vector
#' @return A vector containing the unique values of x, sorted.
suniq <- function(x, decreasing = FALSE){x %>% unique() %>% sort(decreasing = FALSE)}
#' Masks an identification column with sequantial integers.
#' @param data data.frame with the column to be masked
#' @param column Character vector specifying the name of the colum to be masked.
#' @return a data.table with a new colum \code{id_enc}, a sequential integer
#' mapped from the masked column
masktercard <- function(data, column){
setDT(data)[column != '', id_enc := as.character(.GRP), by = column]
data[, (column) := NULL]
warning('masktercard() deleted the column with the actual ID values')
return(data)
}
#' Counts and percentages of each combination of variables
#'
#' Inspired by \code{janitor::tabyl}, computes the counts and precentages
#' of all value combinations of the specified columns in a data.frame.
#' @param vars character vector specifying the columns to count.
#' @return a data.table with each combination of values, and the counts and percentages for each combination.
data.tabyl <- function(dt, vars){
if(!data.table::is.data.table(dt)){
setDT(dt, key = vars)
}
dt[,.N, by = vars][,percentage := N/sum(N)][order(-N)]
}
# Counts the amount of observations from dt1 present in a subset of dt1 (dt2).
#
# @param by_cols Character vector specifying which columns will be considered for the matching
# @param percentage Boolean specifying if the function returns a percentage or the raw
# counts of matches an no-matches
# @return
# match_coverage <- function(dt1, dt2, by_cols = NULL){
# if(is.null(by_cols)){
# by_cols <- intersect(colnames(dt1), colnames(dt2))
# cat('Matching by ', paste(by_cols, collapse = ','), '\n')
# }
#
# data.table::setDT(dt1, key = by_cols)
# data.table::setDT(dt2, key = by_cols)
# return(nrow(dt1[dt2, on = by_cols, nomatch = 0]))
# }
#' Slope of a linear regression
trend <- function(x, y){ # 368% del tiempo de ejecuciĆ³n
lm(x ~ y)$coefficients[2]
}
#' Slope between the first and last observations
#'
#' This is identical to the average slope between all observations.
slope <- function(x,y){
as.numeric(tail(y, 1) - y[1])/as.numeric(tail(x, 1) - x[1])
}
#' Average slope between observations wieghted by the distance between x-axis observations
weighted_av_slope <- function(x, y){
diff_x <- diff(x) %>% as.numeric()
diff_y <- diff(y) %>% as.numeric()
slopes <- diff_y/diff_x
weights <- diff_x[is.finite(slopes)]/sum(diff_x[is.finite(slopes)])
mean(weights*(slopes[is.finite(slopes)]))
}
#' Computes averages for all numerical variables specified.
#' @param dt table to be analised. It is set to a data.table
#' @param groups categorical variable mapping each observation into one group
#' @param numeric_summary_columns character vector with the names of the columns to compute. Default is all numerical variables.
#' @return A data.table with the averages of all variables
group_averages <- function(dt, group_col, nums = colnames(keep(dt, is.numeric))){
# replace NAs
dt1 <- data.table::copy(dt) %>% freplace_na()
averages <- keep(dt1, colnames(dt1) %in% c(nums, group_col))[, map(.SD, mean, na.rm = TRUE), by = group_col]
return(averages)
}
# group_stats <- function(dt, group_col, nums = colnames(keep(dt, is.numeric))){
# # replace NAs
# dt1 <- data.table::copy(dt) %>% freplace_na()
# averages <- keep(dt1, colnames(dt1) %in% c(nums, group_col))[, map(.SD, mean, na.rm = TRUE), by = group_col]
# return(averages)
# }
#' Computes averages of all specified numerical variables and distribuitions of categorical variables,
#' broken down by a specified column group.
#' @param dt Table containing the data.
#' @param group_col Character string specifying the nae of the column to be used as groupig variable.
#' @param nums Character vector of all the numerical variables to be analysed. Default is all numerical variables found on the table.
#' @param cats Character vector of all the categorical variables to be analysed. Default is all categorical variables found on the table.
#' @param compare_with_total Logical variable specifying wheteher the function also retunrs the percentage difference between each group
#' value and the population average
#' @return A data.table with a column for each group, with the average values of all numerical variables specified and the distribuitions of all categorical variables
grouped_report <- function(dt, group_col,
nums = colnames(keep(dt, is.numeric)) %>% stringr::str_subset(group_col, negate = TRUE),
cats = colnames(purrr::discard(dt, is.numeric)) %>% stringr::str_subset(group_col, negate = TRUE),
compare_with_total = FALSE){
cat('Computing population distribuition...\n')
# categorical distribuitions ----------------------------------------------
categorical_percentages <- NULL
total_population_cat_perc <- cats %>%
# i. use dq::data.tabyl to get the percentages of each value for each categorical variable
purrr::map(~data.tabyl(dt, .x)[, c(1,3)]) %>%
# ii. add the name of the column as prefix to each of its possible values
purrr::map2(cats, ~data.table(var = paste(.y, .x[[1]], sep = '_'), percentage = .x[[2]])) %>%
# iii. bind for easy merging with group-segregated percentages
data.table::rbindlist() %>% purrr::set_names(c('var', 'total_population'))
categorical_percentages <- dt %>% split(dt[[group_col]]) %>%
# i. for each group, compute the distribuition of each categorical column
purrr::map(function(dt, cat, group) purrr::map(cat, ~data.tabyl(dt, c(.x, group))[,c(1,2,4)]), cat = cats, group = group_col) %>%
# ii. add the name of the column as prefix to each of its possible values
purrr::map(~purrr::map2(., cats, function(dt, prefix) data.table::data.table(var = paste(prefix, dt[[1]], sep = '_'),
group = dt[[2]],
percentage = dt[[3]]))) %>%
# iii. bind alll tables for rechaping
purrr::map(rbindlist) %>% data.table::rbindlist() %>%
# iv. reshape into report-friendly format
data.table::dcast(var ~ group, value.var = 'percentage') %>%
# v. join with total population distribuitions
merge(total_population_cat_perc, by = 'var')
# average values----------------------------------------------------------
cat('Computing total population averages...\n')
group_averages_dt <- NULL
# build stats for each cluster and for the global population
population_averages <- dt[, ..nums][, lapply(.SD, mean, na.rm = TRUE)][, (group_col) := 'population_average']
cat('Computing group averages...\n')
group_averages_dt <- group_averages(dt, group_col, nums)[, (group_col) := as.character(eval(as.name(group_col)))] %>%
rbind(population_averages, use.names = TRUE)
var_names <- colnames(group_averages_dt[, -1])
group_averages_dt <- group_averages_dt[, -1] %>% data.table::transpose()
group_averages_dt <- group_averages_dt[, var := var_names][,c(ncol(..group_averages_dt),1:(ncol(..group_averages_dt)-1))]
group_stats <- rbindlist(list(group_averages_dt, categorical_percentages), use.names = FALSE) %>% setnames(c('var', suniq(dt[[group_col]]), 'total_population'))
if(compare_with_total){
cat('Comparing group values against population averages...\n')
vs_tot <- group_stats[, -1][, map(.SD, ~.x/group_stats$total_population-1), .SDcols = suniq(dt[[group_col]])] %>% colname_prefix('per_dist_av', sep = '.')
group_stats <- group_stats %>% cbind(vs_tot)
}
return(group_stats)
}
#' data.table of the top n groups by count_col, segregated by
#' grouping_col
#' @param dt Table with grouping column and column to count.
#' @param grouping_col String, name of the column to segregate by.
#' @param count_col String, name of the column to count by.
#' @return A data.table with the names and counts of the top n observations from dt
top_by_group_counts <- function(dt, grouping_col, count_col, n){
cluster_names <- c(dq::suniq(dt[[grouping_col]]), 'total_population')
data.table::setDT(dt)
topn <- dt %>% split(dt[[grouping_col]]) %>%
purrr::append(list(total_population = dt)) %>%
purrr::map(~dq::data.tabyl(.x, count_col)[1:n])
purrr::map2(topn, cluster_names, ~purrr::set_names(.x, c(count_col, .y, paste0(.y, '.perc')))) %>%
purrr::reduce(merge, all = TRUE, by = count_col)
}
#' data.table of the top n groups by count_col, segregated by
#' grouping_col
#' @param dt Table with grouping column and column to count.
#' @param grouping_col String, name of the column to segregate by.
#' @param count_col String, name of the column to count by.
#' @return A data.table with the names and total amounts of the top n observations from dt
top_by_group_amounts <- function(dt, grouping_col, count_col, n){
cluster_names <- c(dq::suniq(dt[[grouping_col]]), 'total_population')
data.table::setDT(dt)
dt1 <- data.table::copy(dt)
dt1[, amt := eval(as.name(count_col))]
topamt <- dt1 %>% split(dt1[[grouping_col]]) %>%
purrr::append(list(total_population = dt1)) %>%
purrr::map(~.x[,.(amount = sum(amt)), by = count_col][, perc := amount/sum(amount)][order(-amount)][1:n])
purrr::map2(topamt, cluster_names, ~purrr::set_names(.x, c(count_col, .y, paste0(.y, '.perc')))) %>%
purrr::reduce(merge, all = TRUE, by = count_col)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.