R/max_contribution.R

Defines functions n_contributors max_contribution

Documented in max_contribution n_contributors

#' Find Major Contributions to Aggregates and Count Contributors
#'
#' These functions analyze contributions to aggregates, assuming that the aggregates are calculated 
#' using a dummy matrix with the formula: \code{z = t(x) \%*\% y}.
#' 
#' The `max_contribution` function identifies the largest contributions to these aggregates, while 
#' the wrapper function `n_contributors` is designed specifically to count the number of contributors 
#' for each aggregate. 
#'
#' @param x A (sparse) dummy matrix 
#' @param y A numeric vector of input values (contributions).
#' @param n Integer. The number of largest contributors to identify for each aggregate.
#'          Default is 1.
#' @param id An optional vector for grouping. When non-NULL, major contributions are 
#'           found after aggregation within each group specified by `id`. 
#'           Aggregates with missing `id` values are excluded.
#' @param output A character vector specifying the desired output. Possible values:
#'   - `"y"`: A matrix with the largest contributions in the first column, the second largest in the second column, and so on.
#'   - `"id"`: A matrix of IDs associated with the largest contributions. If an `id` vector is provided, it returns these IDs; otherwise, it returns indices.
#'   - `"n_contr"`: An integer vector indicating the number of contributors to each aggregate.
#'   - `"n_0_contr"`: An integer vector indicating the number of contributors that contribute a value of 0 to each aggregate.
#'   - `"n_non0_contr"`: An integer vector indicating the number of contributors that contribute a nonzero value to each aggregate.
#'   - `"sums"`: A numeric vector containing the aggregate sums of `y`.
#'   - `"n_contr_all"`, `"n_0_contr_all"`, `"n_non0_contr_all"`, `"sums_all"`: 
#'         Same as the corresponding outputs above, but without applying the `remove_fraction` parameter.
#' @param drop Logical. If TRUE (default) and `output` has length 1, 
#'   the function returns the single list element directly instead of a list containing one element. 
#' @param decreasing Logical. If TRUE (default), finds the largest contributors. 
#'                   If FALSE, finds the smallest contributors.
#' @param remove_fraction A numeric vector containing values in the interval `[0, 1]`, specifying contributors to be removed when identifying the largest contributions. 
#'   - If an `id` vector is provided, `remove_fraction` must be named according to the IDs of the contributors to be removed.
#'   - If no `id` vector is provided, the length of `remove_fraction` must match the length of `y`. In this case, contributors not to be removed should have a value of `NA` in `remove_fraction`.
#'   - The actual values in `remove_fraction` are used for calculating `"sums"` (see description above).
#' @param do_abs Logical. If TRUE (default), uses the absolute values of the summed contributions. 
#'   The summation is performed for all contributions from the same contributor, 
#'   within each aggregate being computed.
#'
#' @return A list or a single element, depending on the values of the `output` and `drop` parameters.
#' 
#' @export
#' @importFrom Matrix Diagonal
#' @importFrom methods new
#' 
#' @examples
#' 
#' z <- SSBtoolsData("magnitude1")
#' a <- ModelMatrix(z, formula = ~sector4 + geo, crossTable = TRUE)
#' 
#' cbind(a$crossTable, 
#'       y =  max_contribution(x = a$modelMatrix, y = z$value, n = 2), 
#'       id = max_contribution(x = a$modelMatrix, y = z$value, n = 2, output = "id"),
#'       n =  n_contributors(  x = a$modelMatrix, y = z$value, n = 2))
#' 
#' cbind(a$crossTable, 
#'       y = max_contribution(x = a$modelMatrix, y = z$value, n = 3, id = z$company), 
#'       id = max_contribution(a$modelMatrix, z$value, 3, id = z$company, output = "id"))
#' 
#' max_contribution(x = a$modelMatrix, 
#'                  y = z$value, 
#'                  n = 3, 
#'                  id = z$company, 
#'                  output = c("y", "id", "n_contr", "sums"))
#' 
#' as.data.frame(
#'   max_contribution(x = a$modelMatrix, 
#'                    y = z$value, 
#'                    n = 3, 
#'                    id = z$company, 
#'                    output = c("y", "id", "n_contr", "sums", "n_contr_all", "sums_all"), 
#'                    remove_fraction = c(B = 1)))
#' 
max_contribution <- function(x, 
                             y, 
                             n = 1, 
                             id = NULL, 
                             output = "y", 
                             drop = TRUE,
                             decreasing = TRUE,
                             remove_fraction = NULL,
                             do_abs = TRUE) {
  
  
  out_names <- c("y", "id", 
                 "n_contr",     "n_0_contr",     "n_non0_contr",     "sums", 
                 "n_contr_all", "n_0_contr_all", "n_non0_contr_all", "sums_all")
  out <- vector("list", length( out_names))
  names(out) <-  out_names
  
  output <- names(out) %in% output
  names(output) <- names(out)
  
  try_return <- function() {
    ok_out <- !sapply(out, is.null)
    if (all(ok_out[output])) {
      if (drop & sum(output) == 1) {
        return(out[[which(output)]])
      }
      return(out[which(output)])
    }
    NULL
  }
  
  
  if (!is.null(remove_fraction)) {
    if (length(remove_fraction[!is.na(remove_fraction)])) {
      if (min(remove_fraction, na.rm = TRUE) < 0 | max(remove_fraction, na.rm = TRUE) >
          1) {
        stop("remove_fraction must be within the interval [0, 1]")
      }
    }
  }
  
  if (is.null(id)) {
    id <- seq_len(nrow(x))
    fid <- id
    id_input <- FALSE
    if (!is.null(remove_fraction)) {
      if (length(remove_fraction) != nrow(x)) {
        stop("wrong length of remove_fraction")
      }
    }
  } else {
    id_input <- TRUE
    if (!is.null(remove_fraction)) {
      if (!length(names(remove_fraction))) {
        stop("remove_fraction must be named")
      }
      if (any(duplicated(names(remove_fraction)))) {
        stop("duplicated remove_fraction names")
      }
      if (!all(names(remove_fraction) %in% unique(id))) {
        warning("remove_fraction names not in id")
      }
    }
    if (length(id) != nrow(x)) {
      stop("Incorrect length of id")
    }
    
    if (anyNA(id)) {
      rows <- !is.na(id)
      id <- id[rows]
      x <- x[rows, , drop = FALSE]
      y <- y[rows]
    }
    
    fid <- factor(id)
    id <- as.integer(fid)
    fid <- levels(fid)
    
    if (!is.null(remove_fraction)) {
      remove_fraction_input <- remove_fraction
      remove_fraction <- rep(NA_real_, max(id))
      ma <- match(fid, names(remove_fraction_input))
      remove_fraction[!is.na(ma)] <- remove_fraction_input[ma[!is.na(ma)]]
    }
    
  }
  
  if (!is.null(remove_fraction)) {
    keep <- is.na(remove_fraction)
    remove_fraction[keep] <- 0
  }
  
  
  xT <- As_TsparseMatrix(x) 
  
  if (id_input) {
    gT <- new("dgTMatrix", i = 0:(nrow(x) - 1L), j = id - 1L, x = y, Dim = c(nrow(xT), max(id)))
    gT <- As_TsparseMatrix(crossprod(gT, xT),  do_drop0 = FALSE)
    if(do_abs) {
      gT <- abs(gT)
    }
    xM <- data.frame(y = gT@x, col = gT@j + 1, gr = gT@i + 1)
  } else {  # same but simpler calculation
    if(do_abs) {
      y <- abs(y)
    }
    xM <- data.frame(y = y[xT@i + 1], col = xT@j + 1, gr = id[xT@i + 1])
  }
  
  if (output[["sums_all"]] | (output[["sums"]] & is.null(remove_fraction))) {
    if (id_input) {
      out$sums_all <- colSums(gT)
    } else {
      out$sums_all <- as.matrix(crossprod(xT, y))[, 1]
    }
    if (output[["sums"]] & is.null(remove_fraction)) {
      out$sums <- out$sums_all
    }
  }
  
  if (!is.null(remove_fraction) & output[["sums"]]) {
    if (id_input) {
      gT <- Diagonal(x = 1-remove_fraction)  %*% gT
      out$sums <- colSums(gT) 
    }
    else {
      out$sums <- as.matrix(crossprod(xT, y * (1-remove_fraction)))[, 1]
    }
  }
  
  if (!is.null(tr <- try_return())) return(tr)
  
  
  if (output[["n_contr_all"]]) {
    out$n_contr_all <- as.vector(table_all_integers(xM[, "col"], ncol(x)))
  }
  if (output[["n_0_contr_all"]]) {
    out$n_0_contr_all <- as.vector(table_all_integers(xM[, "col"][xM[, "y"]==0], ncol(x)))
  }
  if (output[["n_non0_contr_all"]]) {
    out$n_non0_contr_all <- as.vector(table_all_integers(xM[, "col"][xM[, "y"]!=0], ncol(x)))
  }
  
  if (!is.null(tr <- try_return())) return(tr)
  
  
  if (!is.null(remove_fraction)) {
    xM <- xM[keep[xM[, "gr"]], , drop = FALSE] 
  }
  
  xM <- as.matrix(xM)  # Needed since empty index below
  
  xM[, "y"] <- -decreasing * xM[, "y"]
  xM <- SortRows(xM) 
  xM[, "y"] <- -decreasing * xM[, "y"]
  
  seqCol <- seq_len(ncol(x))
  
  if (output[["n_contr"]]) {
    out$n_contr <- as.vector(table_all_integers(xM[, "col"], ncol(x)))
  }
  if (output[["n_0_contr"]]) {
    out$n_0_contr <- as.vector(table_all_integers(xM[, "col"][xM[, "y"]==0], ncol(x)))
  }
  if (output[["n_non0_contr"]]) {
    out$n_non0_contr <- as.vector(table_all_integers(xM[, "col"][xM[, "y"]!=0], ncol(x)))
  }
  
  if (!is.null(tr <- try_return())) return(tr)
  
  
  out$y <- matrix(NA_integer_, ncol(x), n)
  
  if (output[["id"]]) {
    out$id <- matrix(ifelse(id_input, NA_character_, NA_integer_), ncol(x), n)
  }
  
  for (i in seq_len(n)) {
    if (i > 1) {
      xM[ma, "col"] <- 0
    }
    ma <- match(seqCol, xM[, "col"])
    out$y[, i] <- xM[ma, "y"]
    if (output[["id"]]) {
      out$id[, i] <- fid[xM[ma, "gr"]]
    }
  }
  
  if (!is.null(tr <- try_return())) return(tr)
  
  warning("Something went wrong while generating output.")
  
  out
}



#' @rdname max_contribution
#' @param ... Further arguments to `max_contribution`  (used by `n_contributors`).
#' @export
n_contributors <- function(x, 
                           y = rep(1L, nrow(x)), 
                           id = NULL, 
                           output = "n_contr", 
                           ...) {
  max_contribution(x = x, y = y, id = id, output = output, ...)
}

Try the SSBtools package in your browser

Any scripts or data that you put into this service are public.

SSBtools documentation built on April 3, 2025, 5:38 p.m.