distrr: Estimate and Manage Empirical Distributions

Documented in dcc5 dcc6

# Data preparation, before the creation of the data cube ------------
prepare_data <- function(.data, 
                         .variables, 
                         .total = "Totale", 
                         order_type = extract_unique4) {
  l <- length(.variables)
  .other_vars <- setdiff(colnames(.data), .variables)
  
  # Estrazione dei valori unici (in ordine di apparizione nei dati)
  # in una lista di lunghezza l (l_lev)
  data_vars <- select2_(.data, .variables)
  
  l_lev <- order_type(data_vars)
  
  l_lev <- lapply(l_lev, function(x) x <- unique(c(.total, x)))
  
  # Trasformo le .variables in character
  data_vars <- dplyr::mutate_all(
    data_vars,
    .funs = "as.character"
  )
  
  data_others <- select2_(.data, .other_vars)
  
  data_new <- dplyr::bind_cols(data_vars, data_others)
  
  list(data_new = data_new, l_lev = l_lev, l = l)
}


# Last operations, after the creation of the cube. --------
# Reordering columns, creating factors, arranging rows, ...
finish_cube <- function(joint_all, .variables, l_lev, l) {
  # Reorder columns
  dots_all <- colnames(joint_all)
  measures <- setdiff(dots_all, .variables)
  dots_all <- c(.variables, measures)
  joint_all <- select2_(joint_all, dots_all)
  
  # Completamento delle righe per le quali non 
  # esistono combinazioni di variabili nei dati
  joint_all <- complete2_(
    joint_all, 
    .variables = .variables
  )

  # Trasformazione delle .variables in factor e 
  # assegnamento dei livelli in base a l_lev
  options(drop = FALSE)
  joint_all[, .variables] <- dplyr::mutate_all(
    joint_all[ , .variables],
    .funs = "as.factor"
  )
    
  for (i in 1:l) {
    joint_all[[i]] <- factor(
      joint_all[[i]], 
      levels = l_lev[[i]]
    )  # funziona
  }
  
  # Ordinamento dei risultati in funzione dei 
  # livelli dei factor appena impostati
  # (.total per primo, ....)
  joint_all <- arrange2_(joint_all, .variables)
  
  return(joint_all)
}



# Computations for the cube creation ------------
joint_all_ <- function(.data, 
                       .variables, 
                       .fun = jointfun_, 
                       .total = "Totale",
                       .all = TRUE, ...) {
  l <- length(.variables)
  m_comb <- combn_l(l)
  
  joint_all <- vector(mode = "list", length = l)
  
  pb <- utils::txtProgressBar(style = 3, max = l)
  for (i in 1:l) {
    joint <- vector(mode = "list", length = ncol(m_comb[[i]]))
    
    for (j in seq_along(joint)) {
      joint[[j]] <- .fun(
        .data, .variables[m_comb[[i]][ , j]], ...
      ) # ...
    }
    
    joint_all[[i]] <- joint
    
    utils::setTxtProgressBar(pb, i)
  }
  close(pb)
  
  # trasforma una nested list in una lista classica
  # joint_all <- dplyr::combine(joint_all)
  joint_all <- unlist(joint_all, recursive = FALSE)
  
  # Aggiunge le colonne mancanti alle distribuzioni marginali e 
  # joint-conditionals (impostando il valore alla stringa .total). 
  for (k in seq_along(joint_all)) {
    vars_missing <- setdiff(.variables, colnames(joint_all[[k]]))
    joint_all[[k]][ , vars_missing] <- .total
  }
  
  # Unisce i data frame della lista joint_all in 
  # uno solo (uno sotto l'altro)
  joint_all <- dplyr::ungroup(
    dplyr::bind_rows(joint_all)
  )
  
  if (.all) {
    joint <- summarise2_(.data, n = ~n(), ...)
    
    joint[, .variables] <- .total  
    
    joint_all <- dplyr::ungroup(
      dplyr::bind_rows(
        joint, joint_all
      )
    )
  }
  
  return(joint_all)
}



remove_total <- function(.cube, .variables, .total = "Totale") {
  tmp <- paste0(.variables, " == '", .total, "'")
  dots <- paste(tmp, collapse = " & ")
  dots <- paste0("!(", dots, ")")
  filter2_(.cube, .dots = dots)
}




#' @param .total character string with the name to give to the subset of data
#'  that includes all the observations of a variable (default: \code{"Totale"}).
#' @param .all logical, indicating if functions' have to be evaluated on the 
#'   complete dataset.
#'  
#' @examples 
#' # dcc5 works like dcc2, but has an additional optional argument, .total,
#' # that can be added to give a name to the groups that include all the 
#' # observations of a variable.
#' tmp5 <- dcc5(
#'   .data = invented_wages, 
#'   .variables = c("gender", "education"),
#'   .fun = jointfun_,
#'   .total = "TOTAL",
#'   order_type = extract_unique2
#' )
#' tmp5
#' 
#' @rdname dcc
#' @export
dcc5 <- function(.data, 
                 .variables, 
                 .fun = jointfun_, 
                 .total = "Totale", 
                 order_type = extract_unique4, 
                 .all = TRUE, 
                 ...) {
  # Data preparation, before the computations for the cube creation
  d <- prepare_data(
    .data = .data, 
    .variables = .variables, 
    .total = .total, 
    order_type = order_type
  )
  
  # Computations for the cube creation
  l <- d[["l"]]; data_new <- d[["data_new"]]; l_lev <- d[["l_lev"]]
  
  joint_all <- joint_all_(
    data_new, 
    .variables = .variables, 
    .fun = .fun, 
    .total = .total, 
    .all = .all, 
    ...
  )
  
  # Last operations, after the creation of the cube.
  # Reordering columns, creating factors, arranging rows, ...
  joint_all_final <- finish_cube(
    joint_all = joint_all, 
    .variables = .variables,
    l_lev = l_lev, 
    l = l
  )
  
  attributes(joint_all_final)[[".variables"]] <- .variables
  
  if (!.all) {
    joint_all_final <- remove_total(
      joint_all_final, 
      .variables = .variables,
      .total = .total
    )
  }
  
  return(joint_all_final)
}








# Argument .fun is replaced by argument .funs_list, which is a list in the form 
# list(n = ~n()). In addition, argument "..." is removed. 
# The .funs_list argument will contain all the statistics to be estimated in 
# the data cube.
# For example, 
# .funs_list = list(
#   n = ~n(), 
#   p50 = ~wq(wage, sample_weights, probs = 0.5), 
#   p25 = ~wq(wage, sample_weights, probs = 0.25))

# Computations for the cube creation
joint_all_funs_ <- function(.data, 
                            .variables, 
                            .funs_list = list(n = ~dplyr::n()), 
                            .total = "Totale", 
                            .all = TRUE,
                            showProgress = TRUE) {
  l <- length(.variables)
  m_comb <- combn_l(l)
  
  joint_all <- vector(mode = "list", length = l)
  
  if (showProgress) {
    pb <- utils::txtProgressBar(style = 3, max = l)
  }
  
  for (i in 1:l) {
    joint <- vector(mode = "list", length = ncol(m_comb[[i]]))
    
    for (j in seq_along(joint)) {
      joint[[j]] <- gby_(
        .data, .variables[m_comb[[i]][, j]]
      )
      joint[[j]] <- summarise2_dots_(
        joint[[j]], .funs_list
      )
      joint[[j]] <- stats::na.omit(joint[[j]])
    }
    
    if (showProgress) utils::setTxtProgressBar(pb, i)
    
    joint_all[[i]] <- joint
  }
  if (showProgress) close(pb)

  # trasforma una nested list in una lista classica  
  # joint_all <- dplyr::combine(joint_all)
  joint_all <- unlist(joint_all, recursive = FALSE)

  # Aggiunge le colonne mancanti alle distribuzioni marginali e 
  # joint-conditionals
  # (impostando il valore a "Totale"). 
  for (k in seq_along(joint_all)) {
    vars_missing <- setdiff(.variables, colnames(joint_all[[k]]))
    joint_all[[k]][, vars_missing] <- .total
  }
  
  # Unisce i data frame della lista joint_all 
  # in uno solo (uno sotto l'altro ...)
  joint_all <- dplyr::ungroup(
    dplyr::bind_rows(joint_all)
  )
  
  if (.all) {
    joint <- summarise2_dots_(
      .data, .funs_list
    )
    
    joint[, .variables] <- .total
    
    joint_all <- dplyr::ungroup(
      dplyr::bind_rows(joint, joint_all)
    )
  }
  
  return(joint_all)
}




#' Data cube creation
#' 
#' @param .data data frame to be processed.
#' @param .variables variables to split data frame by, as a character vector 
#'     (\code{c("var1", "var2")}).
#' @param .funs_list a list of function calls in the form of right-hand formula.
#' @param .total character string with the name to give to the subset of data
#'  that includes all the observations of a variable (default: \code{"Totale"}).
#' @param order_type a function like \code{\link{extract_unique}} or 
#'    \code{\link{extract_unique2}}.
#' @param .all logical, indicating if functions have to be evaluated on the 
#'   complete dataset.
#'   
#' @examples 
#' dcc6(
#'   invented_wages,
#'   .variables = c("gender", "sector"), 
#'   .funs_list = list(n = ~dplyr::n()),
#'   .all = TRUE
#' )
#'      
#' dcc6(
#'   invented_wages,
#'   .variables = c("gender", "sector"), 
#'   .funs_list = list(n = ~dplyr::n()),
#'   .all = FALSE
#' )
#' 
#' @export
dcc6 <- function(.data, 
                 .variables, 
                 .funs_list = list(n = ~dplyr::n()), 
                 .total = "Totale", 
                 order_type = extract_unique4, 
                 .all = TRUE) {
  # Data preparation, before the computations for the cube creation
  d <- prepare_data(
    .data = .data, 
    .variables = .variables, 
    .total = .total, 
    order_type = order_type
  )
  
  # Computations for the cube creation
  l <- d[["l"]]; data_new <- d[["data_new"]]; l_lev <- d[["l_lev"]]
  
  joint_all <- joint_all_funs_(
    data_new, 
    .variables = .variables, 
    .funs_list = .funs_list, 
    .total = .total, .all = .all
  )
  
  # Last operations, after the creation of the cube.
  # Reordering columns, creating factors, arranging rows, ...
  joint_all_final <- finish_cube(
    joint_all = joint_all, 
    .variables = .variables,
    l_lev = l_lev, 
    l = l
  )
  
  attributes(joint_all_final)[[".variables"]] <- .variables
  
  if (!.all) {
    joint_all_final <- remove_total(
      joint_all_final, 
      .variables = .variables,
      .total = .total
    )
  }
  
  return(joint_all_final)
}




# Computations for the cube creation
# With the choice of combinations of variables
joint_all_funs2_ <- function(.data, 
                             .list_variables, 
                             .funs_list = list(n = ~n()), 
                            .total = "Totale", 
                            .all = TRUE) {
  l <- length(.list_variables)
  # m_comb <- combn_l(l)
  
  joint_all <- vector(mode = "list", length = l)
  
  for (i in 1:l) {
    joint_all[[i]] <- gby_(
      .data, .list_variables[[i]]
    )
    joint_all[[i]] <- summarise2_dots_(
      joint_all[[i]], .funs_list
    )
    joint_all[[i]] <- stats::na.omit(joint_all[[i]])
  }
  
  # trasforma una nested list in una lista classica  
  # joint_all <- dplyr::combine(joint_all)
  
  # Aggiunge le colonne mancanti alle distribuzioni 
  # marginali e joint-conditionals
  # (impostando il valore a "Totale"). 
  .variables <- unique(Reduce(c, .list_variables))
  for (k in seq_along(joint_all)) {
    vars_missing <- setdiff(.variables, colnames(joint_all[[k]]))
    joint_all[[k]][ , vars_missing] <- .total
  }
  
  # Unisce i data frame della lista joint_all in 
  # uno solo (uno sotto l'altro ...)
  joint_all <- dplyr::bind_rows(joint_all)
  
  if (.all) {
    joint <- summarise2_dots_(.data, .funs_list)
    
    joint[, .variables] <- .total
    joint_all <- dplyr::bind_rows(joint, joint_all)
  }
  
  return(joint_all)
}


# Last operations, after the creation of the cube.
# Reordering columns, creating factors, arranging rows, ...
finish_cube2 <- function(joint_all, .variables, l_lev, l) {
  # Reorder columns
  dots_all <- colnames(joint_all)
  measures <- setdiff(dots_all, .variables)
  dots_all <- c(.variables, measures)
  joint_all <- select2_(joint_all, dots_all)
  
  # Completamento delle righe per le quali non 
  # esistono combinazioni di variabili nei dati
  # joint_all <- joint_all |> 
  #   tidyr::complete_(cols = .variables, fill = as.list(rep(NA, l)))
  
  # Trasformazione delle .variables in factor e 
  # assegnamento dei livelli
  # in base a l_lev
  options(drop = FALSE)
  
  joint_all[, .variables] <- dplyr::mutate_all(
    joint_all[, .variables],
    .funs = "as.factor"
  )
  
  for (i in 1:l) {
    joint_all[[i]] <- factor(
      joint_all[[i]], levels = l_lev[[i]]
    )  # funziona
  }
  
  # Ordinamento dei risultati in funzione dei 
  # livelli dei factor appena impostati
  # (.total per primo, ....)
  joint_all <- arrange2_(joint_all, .variables)
  
  return(joint_all)
}


dcc7 <- function(.data, 
                 .list_variables, 
                 .funs_list = list(n = ~n()), 
                 .total = "Totale", 
                 order_type = extract_unique4, 
                 .all = TRUE) {
  # Data preparation, before the computations for the cube creation
  .variables <- unique(Reduce(c, .list_variables))
  
  d <- prepare_data(
    .data = .data, 
    .variables = .variables, 
    .total = .total, 
    order_type = order_type
  )
  
  # Computations for the cube creation
  l <- d[["l"]]; data_new <- d[["data_new"]]; l_lev <- d[["l_lev"]]
  
  joint_all <- joint_all_funs2_(
    data_new, 
    .list_variables = .list_variables, 
    .funs_list = .funs_list, 
    .total = .total, .all = .all
  )
  
  # Last operations, after the creation of the cube.
  # Reordering columns, creating factors, arranging rows, ...
  joint_all_final <- finish_cube2(
    joint_all = joint_all, 
    .variables = .variables,
    l_lev = l_lev, 
    l = l
  )
  
  return(joint_all_final)
}