R/utils.R
In gustave: A User-Oriented Statistical Toolkit for Analytical Variance Estimation

Documented in add_zero sum_by

#' Efficient by-group (weighted) summation
#' 
#' @description \code{sum_by} performs an efficient and optionally weighted 
#' by-group summation by using linear algebra and the Matrix package 
#' capabilities. The by-group summation is performed through matrix cross-product
#' of the \code{y} parameter (coerced to a matrix if needed) with a (very) sparse
#' matrix built up using the \code{by} and the (optional) \code{w} parameters. 
#' 
#' Compared to base R, dplyr or data.table alternatives, this implementation 
#' aims at being easier to use in a matrix-oriented context and can yield 
#' efficiency gains when the number of columns becomes high.
#' 
#' @param y A (sparse) vector, a (sparse) matrix or a data.frame. 
#' The object to perform by-group summation on. 
#' @param by The factor variable defining the by-groups. Character variables
#' are coerced to factors.
#' @param w The optional row weights to be used in the summation. 
#' @param na_rm Should \code{NA} values in \code{y} be removed (ie treated as 0 in the summation) ? 
#' Similar to \code{na.rm} argument in \code{\link[base]{sum}}, but \code{TRUE} by default. 
#' If \code{FALSE}, \code{NA} values in \code{y} produce \code{NA} values in the result.
#' @param keep_sparse When \code{y} is a sparse vector or a sparse matrix, should the result
#' also be sparse ? \code{FALSE} by default. As \code{\link[Matrix]{sparseVector-class}} does
#' not have a name attribute, when \code{y} is a sparseVector the result does not have any
#' name (and a warning is cast).
#' 
#' @return A vector, a matrix or a data.frame depending on the type of \code{y}. If \code{y} is
#' sparse and \code{keep_sparse = TRUE}, then the result is also sparse (without names
#' when it is a sparse vector, see keep_sparse argument for details).
#' 
#' @author Martin Chevalier
#' 
#' @examples # Data generation
#' set.seed(1)
#' n <- 100
#' p <- 10
#' H <- 3
#' y <- matrix(rnorm(n*p), ncol = p, dimnames = list(NULL, paste0("var", 1:10)))
#' y[1, 1] <- NA
#' by <- letters[sample.int(H, n, replace = TRUE)]
#' w <- rep(1, n)
#' w[by == "a"] <- 2
#' 
#' # Standard use
#' sum_by(y, by)
#' 
#' # Keeping the NAs
#' sum_by(y, by, na_rm = FALSE)
#' 
#' # With a weight
#' sum_by(y, by, w = w)
#' 
#' @export
#' @import Matrix

sum_by <- function(y, by, w = NULL, na_rm = TRUE, keep_sparse = FALSE){

  # y <- V
  
  # Type of y
  class_y <- class(y)
  is_data.frame_y <- is.data.frame(y)
  if(is_data.frame_y) y <- as.matrix(y)
  is_sparse_y <- inherits(y, c("Matrix", "sparseVector"))
  is_vector_y <- is.null(dim(y))
  is_numeric_y <- is.numeric(if(!is_sparse_y) y else y@x)
  if(!is_numeric_y) stop("y is not numeric (or not entirely).")
  if(!is_sparse_y | is_vector_y) y <- methods::as(y, "sparseMatrix")
  
  # Weight, NA in y
  if(is.null(w)) w <- rep(1, NROW(y))
  if(!is.numeric(w)) stop("w is not numeric")
  if(na_rm) y[is.na(y)] <- 0
  
  # NA in by
  NA_in_by <- is.na(by)
  if(any(NA_in_by)){
    y <- y[!NA_in_by, , drop = FALSE]
    by <- by[!NA_in_by]
    w <- w[!NA_in_by]
  }
  
  # Matrix cross-product
  by <- as.factor(by)
  x <- make_block(w, by)
  colnames(x) <- levels(by)
  r <- crossprod(x, y)

  # Type of r
  if(!is_sparse_y | !keep_sparse){
    r <- if(is_vector_y) stats::setNames(as.vector(r), rownames(r)) else as.matrix(r)
  }else{
    if(is_vector_y) warn("sparseVector can't have names, hence the output won't have names.")
    r <- methods::as(r, class_y)
  }
  if(is_data.frame_y) r <- as.data.frame(r)
  
  r
  
}


#' Expand a matrix or a data.frame with zeros based on rownames matching
#'
#' @description For a given two-dimensional object with rownames and a character
#'   vector, \code{add_zero} produces a corresponding object whose rownames match
#'   the character vector, with zeros on the additional rows.
#'
#'   This function is an easy-to-use and reliable way to reintroduce
#'   non-responding units in the variance estimation process (after the
#'   non-response phase is taken into account).
#'
#' @param y A (sparse) matrix or a data.frame. The object to add zeros to.
#' @param rownames A character vector (other types are coerced to character).
#'   The character vector giving the rows of the produced object.
#' @param remove Should rows of \code{y} whose name do not appear in the rownames
#'   argument be removed ? TRUE by default, a warning is shown when rows are
#'   removed.
#'
#' @return A (sparse) matrix or data.frame depending on the type of \code{y}.
#'
#' @author Martin Chevalier
#'
#' @examples # Data generation
#' set.seed(1)
#' n <- 10
#' p <- 2
#' y <- matrix(1:(n*p), ncol = p, dimnames = list(sample(letters, n)))
#' y[c(3, 8, 12)] <- NA
#' rownames <- letters
#'
#' # Standard use
#' add_zero(y, rownames)
#'
#' # Use when rownames in y do not match
#' # any element in the rownames argument
#' rownames(y)[1:3] <- toupper(rownames(y)[1:3])
#' add_zero(y, rownames)
#' add_zero(y, rownames, remove = FALSE)
#'
#' @import Matrix
#' @export
#' 
add_zero <- function(y, rownames, remove = TRUE){
  
  # y <- m; rownames <- letters
  
  # Type of y
  class_y <- class(y)
  is_data.frame_y <- is.data.frame(y)
  if(is_data.frame_y) y <- as.matrix(y)
  if(is.null(dim(y)))
    stop("y must be a (sparse) matrix or a data.frame.")
  if(is.null(rownames(y)))
     stop("y must have rownames in order to be used in add_zero().")
  is_sparse_y <- inherits(y, c("Matrix", "sparseVector"))
  is_numeric_y <- is.numeric(if(!is_sparse_y) y else y@x)
  if(!is_numeric_y) stop("y is not numeric (or not entirely).")
  
  # Prepare rownames argument
  rownames <- rownames[!is.na(rownames)]
  rownames <- as.character(rownames)
  
  # Expand y with 0 in order to get an object whose rownames 
  # are the character argument rownames (in the same order)
  compl <- setdiff(rownames, rownames(y))
  if(!is_sparse_y){
    r <- rbind(y, matrix(0, nrow = length(compl), ncol = NCOL(y), dimnames = list(compl)))
    if(is_data.frame_y) r <- as.data.frame(r)
  }else{
    r <- rbind(y, Matrix(0, nrow = length(compl), ncol = NCOL(y), dimnames = list(compl, NULL)))
  }

  # Remove rows that do not match any element in rownames 
  # if remove is TRUE
  if(remove){
    if(length(setdiff(rownames(y), rownames)))
      warn("The name of some rows in y do not match any element in the rownames argument. These rows are removed from the result (use remove = FALSE to change this behaviour).")
    o <- rownames    
  }else o <- order(rownames(r))
  
  r[o, , drop = FALSE]
  
}


# TODO: Export and document make_block()
make_block <- function(y, by){
  
  # Step 1: Prepare the by argument
  by <- droplevels(as.factor(by))
  H <- length(levels(by))
  if(H == 1) return(y)
  
  # Step 2: Coerce y to a TsparseMatrix and remove NA values
  res <- coerce_to_TsparseMatrix(y)
  if(any(is.na(by))){
    na <- is.na(res@j)
    res@x <- res@x[!na]
    res@i <- res@i[!na]
    res@j <- res@j[!na]
  }
  
  # Step 3: Adjust the y and Dim slots in order to obtain the block matrix
  p <- NCOL(res)
  res@Dimnames[2] <- list(NULL)
  res@j <- as.integer(((as.numeric(by) - 1) * p)[res@i + 1] + res@j)
  res@Dim <- c(res@Dim[1], as.integer(res@Dim[2] * H))
  
  # Step 4: Export the result with relevant attributes
  attr(res, "rowby") <- as.character(by)
  attr(res, "colby") <- as.character(rep(levels(by), each = p))
  res
  
}


# Unexported (and undocumented) functions

# From devtools (https://github.com/r-lib/devtools/blob/master/R/utils.r)
"%||%" <- function(a, b) if (!is.null(a)) a else b

coerce_to_TsparseMatrix <- function(y){
  if(is.null(dim(y))){
    names_y <- names(y)
    res <- Matrix::sparseMatrix(
      x = unname(y), i = seq_along(y), j = rep(1, length(y)), repr = "T"
    )
    if(!is.null(names_y)) rownames(res) <- names_y
  }else if(!methods::is(y,"TsparseMatrix")){
    dimnames_y <- dimnames(y)
    res <- methods::as(y, "TsparseMatrix")
    if(!is.null(dimnames_y)) dimnames(res) <- dimnames_y
  }else res <- y
  res
}

detect_block <- function(y, by){
  by <- droplevels(as.factor(by))
  y_bool <- coerce_to_TsparseMatrix(y) != 0
  by_bool <- make_block(rep(TRUE, NROW(y)), by)
  prod <- crossprod(by_bool, y_bool)
  prod_bool <- prod > 0
  if(!all(colSums(prod_bool) <= 1)) return(NULL)
  attr(y, "rowby") <- as.character(by)
  attr(y, "colby") <- rep(levels(by), NCOL(prod_bool))[as.vector(prod_bool)]
  y
}


change_enclosing <- function(FUN, envir = environment(FUN)){
  eval(parse(text = deparse(FUN)), envir = envir)
}

assign_all <- function(objects, to, from = parent.frame(), not_closure = c(list(globalenv()), sys.frames())){
  for(n in objects){
    get_n <- get(n, from)
    if(!is.function(get_n)){
      assign(n, get_n, envir = to)
    }else{
      tmp <- new.env(parent = to)
      env_n <- environment(get_n)
      not_closure <- c(not_closure, from)
      is_closure <- !any(sapply(not_closure, identical, env_n))
      if(is_closure)
        assign_all(ls(env_n, all.names = TRUE), to = tmp, from = env_n, not_closure = not_closure)
      assign(n, change_enclosing(get_n, envir = tmp), envir = to)
    }
  }
}

is_error <- function(expr) 
  inherits(try(expr, silent = TRUE), "try-error")

is_variable_name <- function(param, data = NULL, max_length = 1)
  is.character(param) && 
  (is.null(data) || length(setdiff(param, names(data))) == 0) && 
  length(param) > 0 && length(param) <= max_length

variable_not_in_data <- function(var, data){
  if(is.null(var)) return(NULL)
  tmp <- var[!(var %in% names(data))]
  if(length(tmp) == 0) return(NULL)
  tmp
} 

replace_variable_name_with_symbol <- function(arg_list, data, single = TRUE){
  # TODO: Allow consistent evaluation through parent frames
  # TODO: Handle the case of apparent name without match in data variable names
  tmp <- lapply(arg_list, function(a){
    if(is_error(a_eval <- eval(a, envir = data))){
      a_out <- list(a)
    }else if(is_variable_name(a_eval, data = data, max_length = Inf)){
      if(single && !is_variable_name(a_eval, data = data, max_length = 1))
        stop("Only single variable names are allowed for the by argument.")
      a_out <- lapply(a_eval, as.symbol)
    }else a_out <- list(a)
    a_out
  })
  if(!single){
    tmp_length <- sapply(tmp, length)
    if(!all(tmp_length %in% c(1, max(tmp_length))))
      stop("Some arguments have longer variable vectors than others.")
    tmp[tmp_length == 1] <- 
      lapply(tmp[tmp_length == 1], `[`, rep(1, max(tmp_length)))
  }else if(length(tmp) == 1) tmp[1] <- tmp[[1]]
  tmp
}

warn <- function(...) warning(..., "\n", call. = FALSE, immediate. = TRUE)
note <- function(...) message("Note: ", ..., "\n")

is_statistic_wrapper <- function(x) inherits(x, "gustave_statistic_wrapper")

names_else_NA <- function(x){
  if(is.null(names(x))) rep(NA, length(x)) else{
    tmp <- names(x)
    tmp[tmp %in% ""] <- NA
    tmp
  }
}

discretize_qualitative_var <- function(var, logical  = FALSE){
  var <- droplevels(as.factor(var))
  result <- Matrix(nrow = length(var), ncol = length(levels(var)))
  if(length(levels(var)) == 1){
    result[!is.na(var), ] <- 1L
  }else{
    result[!is.na(var), ] <- Matrix::sparse.model.matrix(~ var - 1)  
  }
  result[is.na(var), ] <- NA
  if(!logical) result <- result * 1
  rownames(result) <- names(var)
  colnames(result) <- levels(var)
  result
}

get_through_parent_frame <- function(x){
  n <- 0
  found <- NULL
  while(is.null(found) || identical(baseenv(), parent.frame(n))){
    n <- n + 1
    found <- get0("execution_envir", parent.frame(n))
  }
  found
}

display_only_n_first <- function(x, 
                                 n = 10, 
                                 collapse = ", ", 
                                 final_text = paste0(" and ", length(x) - n, " more")
){
  if(length(x) <= n){
    paste(x, collapse = collapse)
  }else{
    paste0(paste(x[1:n], collapse = collapse), final_text)
  }
}

rbind_output_df <- function(list_output_df){
  names <- unique(do.call(base::c, lapply(list_output_df, names)))
  output_df <- do.call(rbind, lapply(list_output_df, function(i){
    i[, setdiff(names, names(i))] <- NA
    i[, names]
  }))
  output_df <- output_df[, sapply(output_df, function(i) !all(is.na(i)))]
  rownames(output_df) <- NULL
  output_df
}