R/utils.R

Defines functions translateCounts maeSummary subsetWithMissing applyOverDFList isTRUEorFALSE design2factor mae mse rsq estimateStat applyOverColumnGroups simplifyInteractionMatrix getCoverage

Documented in applyOverColumnGroups applyOverDFList design2factor estimateStat getCoverage isTRUEorFALSE mae maeSummary mse rsq simplifyInteractionMatrix subsetWithMissing translateCounts

#' re-export magrittr pipe operator
#'
#' @name %>%
#' @rdname pipe
#'
#' @importFrom magrittr %>%
#'
#' @export
NULL

# #' Intersect GRanges objects
# #'
# #' \code{intersectGR} intersect two GRanges objects and return ranges from
# #' \code{a} that were overlapped by ranges in \code{b}.
# #'
# #' @param a GRanges object.
# #' @param b GRanges object.
# #' @param ... other arguments internally passed to
# #'   \code{\link[GenomicRanges]{findOverlaps}}.
# #'
# #' @return GRanges object which is a subset of \code{a}.
# #'
# intersectGR <- function(a, b, ...) {
#   warning("deprecated! use IRanges::subsetByOverlaps instead") # TODO drop from the package
#   stopifnot(is(a, "GRanges"))
#   stopifnot(is(b, "GRanges"))
#
#   IRanges::subsetByOverlaps(x = a, ranges = b, ...)
# }

#' Calculate regions coverage
#'
#' \code{getCoverage} calculates coverage of regions (rows in interaction
#' matrix) by features (columns). It is possible to specify features grouping
#' variable \code{gr} then coverage tells how many distinct groups the region
#' overlap with.
#'
#' @param mat dgCMatrix interaction matrix such as produced by
#'   \code{\link{getInteractionMatrix}}.
#' @param gr factor specifying features groups. Must have length equal
#'   to number of columns in \code{mat}.
#'
#' @return Numeric vector.
#'
#' @examples
#' data("remap_mini")
#' y <- colnames(remap_mini)
#'
#' # simple coverage
#' gr <- seq_along(y) %>% as.factor()
#' getCoverage(remap_mini, gr)
#'
#' # per cell type coverage
#' gr <- sub(".*\\.", "", y) %>% as.factor()
#' getCoverage(remap_mini, gr)
#'
#' @importFrom DelayedArray colsum
#'
#' @export
getCoverage <- function(mat, gr) {
  stopifnot("mat must be an instance of class 'dgCMatrix'" = is(mat, "dgCMatrix"))
  stopifnot("gr must be a factor" = is.factor(gr))
  stopifnot("gr length must equal to number of columns in mat" = length(gr) == ncol(mat))

  DelayedArray::colsum(x = mat, group = gr) %>%
    `>=`(1) %>%
    rowSums()
}

#' Simplify Interaction Matrix
#'
#' @param mat dgCMatrix interaction matrix such as produced by
#'   \code{\link{getInteractionMatrix}}.
#' @param alpha Number between 0 and 1 specifying voting threshold. Eg. for 3
#'   column matrix alpha 0.5 will give voting criteria >= 2.
#' @param colname character giving new column name.
#'
#' @return dgCMatrix
#'
#' @importFrom Matrix rowSums sparseMatrix
#'
simplifyInteractionMatrix <- function(mat, alpha = 0.5, colname = NA) {
  stopifnot(is(mat, "dgCMatrix"))
  stopifnot(is.numeric(alpha) && (length(alpha) == 1))
  stopifnot((alpha > 0) && (alpha <= 1))

  crit <- ceiling(ncol(mat) * alpha)
  i <- which(Matrix::rowSums(mat) >= crit)
  Matrix::sparseMatrix(i = i,
                       j = rep(1L, length(i)),
                       x = 1,
                       dims = c(nrow(mat), 1L),
                       dimnames = list(rownames(mat), colname))
}

#' Apply function over groups of columns
#'
#' Returns a array obtained by applying a function to rows of submatrices of the
#' input matrix, where the submatrices are divided into specified groups of
#' columns.
#'
#' @param mat a matrix.
#' @param groups a vector giving columns grouping.
#' @param f function to be applied.
#' @param ... optional arguments to \code{f}.
#'
#' @return a matrix of dimensions \code{nrow(mat) x nlevels(groups)}.
#'
applyOverColumnGroups <- function(mat, groups, f, ...) {
  stopifnot(is.matrix(mat))
  stopifnot(is.vector(groups) || is.factor(groups))
  stopifnot(length(groups) == ncol(mat))
  stopifnot(is.function(f))

  if (is.vector(groups)) {
    groups <- factor(groups)
  }

  group_f <- lapply(X = levels(groups),
                    FUN = function(g) {
                      apply(X = mat,
                            MARGIN = 1,
                            FUN = function(row) f(row[groups == g], ...))
                    })
  new_mat <- do.call(cbind, group_f)
  colnames(new_mat) <- levels(groups)

  return(new_mat)
}

#' Estimate linear models goodness of fit statistic
#'
#' Estimate goodness of fit statistic of penalized linear regression models.
#' Works with different goodness of fit statistic functions.
#'
#' @inheritParams glmnet::glmnet
#' @param u offset vector as in \code{\link{glmnet}}. \code{"U"} experiment in
#'   mae.
#' @param s user supplied lambda.
#' @param method currently only cross-validation is implemented.
#' @param nfold number of fold to use in cross-validation.
#' @param statistic function computing goodness of fit statistic. Should accept
#'   \code{y}, \code{x}, \code{offset} arguments and return
#'   a numeric vector of the same length. See \code{rsq}, \code{mse} for examples.
#' @param ...  Other arguments that can be passed to glmnet.
#'
#' @return numeric vector of \code{statistic} estimates.
#'
#' @importFrom foreach foreach
#'
estimateStat <- function(x, y, u, s, method = "cv", nfold = 10, statistic = rsq, alpha = 0, ...) {
  if (method == "cv") {
    out <- c()
    part <- sample(seq_len(nfold), size = length(y), replace = TRUE)

    out <- foreach::foreach(p_ = seq_len(nfold), .combine = c) %do%
      {
        py <- y[part != p_]
        px <- x[part != p_, ]
        poffset <- u[part != p_, ]
        mod <- glmnet::glmnet(x = px, y = py, offset = poffset, lambda = s, alpha = alpha, ...)

        # evaluate on held-out fold
        py <- y[part == p_]
        px <- x[part == p_, ]
        poffset <- u[part == p_, ]
        yhat <- stats::predict(mod, newx = px, newoffset = poffset, s = s)
        stat <- statistic(py, yhat, px, poffset)
        out <- c(out, stat)
      }
  }

  return(out)
}

# declare estimateStat foreach variables
utils::globalVariables("p_")

#' Calculate $R^2$
#'
#' @param y numeric vector of observed expression values.
#' @param yhat numeric vector of predicted expression values.
#' @param offset numeric vector giving basal expression level.
#'
#' @return numeric vector
#'
rsq <- function(y, yhat, px, offset) {
  y <- y - offset
  yhat <- yhat - offset
  1 - (sum((y - yhat)^2) / (var(y) * (length(y) - 1)))
}

#' Calculate Mean Squared Error
#'
#' @param y numeric vector of observed expression values.
#' @param yhat numeric vector of predicted expression values.
#' @param ... not used.
#'
#' @return numeric vector
#'
mse <- function(y, yhat, ...) mean((y - yhat)^2)

#' Calculate Mean Absolute Error
#'
#' @param y numeric vector of observed expression values.
#' @param yhat numeric vector of predicted expression values.
#' @param ... not used.
#'
#' @return numeric vector
#'
mae <- function(y, yhat, ...) mean(abs(y - yhat))

#' Transform design matrix to factor
#'
#' @param design design matrix
#'
#' @return factor
#'
#' @examples
#' \dontrun{
#' design <- matrix(data = c(1, 1, 0, 0, 0, 0, 1, 1),
#'                  nrow = 4,
#'                  ncol = 2,
#'                  dimnames = list(c(paste("sample", 1:4)), c("gr1", "gr2")))
#' design2factor(design)
#' }
#'
design2factor <- function(design) {
  # based on edgeR::designAsFactor, but jokes aside
  groups <- factor(rowMeans(design * col(design) * ncol(design)))
  samples_to_keep <- groups != 0
  groups <- groups[samples_to_keep] # omit empty groups
  groups <- droplevels(groups)
  levels(groups) <- colnames(design)[colSums(design) > 0]

  return(groups)
}

#' Check if argument is a binary flag
#'
#' @param x object to test
#'
#' @return binary flag
#'
isTRUEorFALSE <- function(x) {
  (length(x) == 1) && is.logical(x) && (! is.na(x))
}

#' Apply function over selected column in list of data frames
#'
#' \code{applyOverDFList} operates on a list of data frames where all data frames
#' has the same size and columns. Column of interest is extracted from each data
#' frame and column binded in \code{groups}, next \code{fun} is applied over
#' rows. Final result is a matrix with result for each group on a separate column.
#' Function is parallelized over groups.
#'
#' @param list_of_df list of \code{data.frame}s.
#' @param col_name string specifying column in \code{data.frame}s to apply
#'   \code{fun} on.
#' @param fun function to apply, should take a single vector as a argument.
#' @param groups factor defining how elements of \code{list_of_df} should be
#'   grouped.
#'
#' @return matrix with \code{nrow(list_of_df[[1]])} rows and
#'   \code{nlevels(groups)} columns.
#'
applyOverDFList <- function(list_of_df, col_name, fun, groups) {
  stopifnot("all list_of_df names must be included in groups" = setequal(names(list_of_df), names(groups)))
  stopifnot("groups must not have unused levels" = setdiff(levels(groups), groups) == character(0))

  col_fun_mat <- foreach::foreach(gr_ = levels(groups), .combine = cbind) %dopar% # parallel version is bit faster
    {
      i <- groups == gr_
      attr <- lapply(list_of_df[i], function(df) df[[col_name]])
      attr <- do.call(cbind, attr)
      matrix(
        data = apply(X = attr, MARGIN = 1, FUN = fun),
        ncol = 1L,
        dimnames = list(rownames(attr), gr_)
      )
    }

  return(col_fun_mat)
}

# declare applyOverDFList foreach variables
utils::globalVariables("gr_")

#' Subset keeping missing
#'
#' Subset matrix keeping unmatched rows as NA.
#'
#' @param mat matrix
#' @param rows character
#'
#' @return a matrix
#'
subsetWithMissing <- function(mat, rows) {
  i <- match(x = rows, table = rownames(mat), nomatch = 0)
  i[i == 0] <- NA
  smat <- mat[i, ]
  rownames(smat) <- rows
  smat
}

#' Helper summarizing MAE object
#'
#' @param mae MultiAssayExperiment object.
#'
#' @return named list giving number of rows and columns, overall mean and
#'   standard deviation in \code{mae}'s experiments.
#'
#' @importFrom MultiAssayExperiment experiments
#' @importFrom Matrix mean
#' @importFrom stats sd
#'
#'
maeSummary <- function(mae) {
  lapply(MultiAssayExperiment::experiments(mae), function(x) c(dim(x), Matrix::mean(x), sd(x)))
}


#' Translate counts matrix rownames
#'
#' \code{translateCounts} renames counts matrix rownames according to supplied
#' \code{dict}ionary. Function can handle many to one assignments by taking a
#' sum or an average over \code{counts} rows. Other types of ambiguous
#' assignments are not supported.
#'
#' @param counts matrix of expression values.
#' @param dict named character vector mapping \code{counts} rownames to new
#'   values. Values of vector should correspond to new desired rownames, and its
#'   names to current rownames.
#'
#' @return matrix of expression values with new rownames.
#'
#' @examples
#' counts <- matrix(
#' data = c(5, 4, 3, 2),
#' nrow = 2,
#'    dimnames = list(
#'      c("ENSG00000130700", "ENSG00000089225"),
#'      c("treatment", "control")
#'    )
#'  )
#' dict <- c(ENSG00000130700 = "GATA5", ENSG00000089225 = "TBX5")
#' translateCounts(counts, dict)
#'
#' @export
translateCounts <- function(counts, dict) {
  stopifnot("counts must be an instance of class 'matrix'" = is(counts, "matrix"))
  stopifnot("counts must have its rownames defined" = ! is.null(rownames(counts)))
  stopifnot("counts rownames must be unique" = ! anyDuplicated(rownames(counts)))
  stopifnot("dict must be an instance of class 'character'" = is(dict, "character"))
  stopifnot("dict must be a named character vector" = ! is.null(names(dict)))
  stopifnot("dict names must be unique" = ! anyDuplicated(names(dict)))
  stopifnot("counts rownames must correspond to dict names" = any(rownames(counts) %in% names(dict)))

  new_counts <- counts[rownames(counts) %in% names(dict), ]
  new_counts <- rowsum(x = new_counts, group = dict[rownames(new_counts)]) # TODO a simple and useful extension would be to implement option to take an average instead of sum, this could be easly done by dividing by the groups size

  return(new_counts)
}
bkaczkowski/xcore documentation built on Jan. 26, 2024, 6:24 p.m.