R/cell_sampler.R

Defines functions cell_sampler sample_cells_apply sample_cells sample_from_levels avg_molecules total_molecules

Documented in avg_molecules cell_sampler sample_cells sample_cells_apply sample_from_levels total_molecules

#' Sample cells and apply function
#'
#' Sample a certain number of cells from a DGE data.frame and apply a specific
#' function to the subset.
#'
#' @param dge A data.frame containing DGE data, with rows being genes and
#' columns cells. The first column is assumed to provide the gene identifiers.
#'
#' @param size Number of cells that should be sampled. If different sizes should
#' be sampled each specified group, \code{size} needs to be a vector of length
#' \code{length(levels(groups))} providing sizes for each group of cells. If
#' \code{groups} is used it's recommendend that \code{size} is a named vector
#' with group ids as names, as this will ensure correct sizes per group.
#'
#' @param fun Function to apply to sampled cells. Needs to take a DGE data.frame
#' as it's first positional argument.
#'
#' @param groups Optional; a factor of the same length as the number of cells
#' providing group assignments for each cell in dge. If \code{groups = NULL} all
#' cells are assigned to the same group.
#'
#' @param n Number of times a sample is drawn and the function is applied.
#'
#' @param simplify Logical or character string specifying if and how the output
#' should be simplified. See help for \link[base]{sapply} for more information.
#'
#' @param replace Should samples be drawn with replacement?
#'
#' @param funArgs Optional; a list with additional arguments for the applied
#' function \code{fun}.
#'
#' @return A list of output objects from each sampling iteration of the selected
#' function \code{fun} if \code{simplify = FALSE}. If \code{simplify = TRUE} a
#' vector, matrix, or, if \code{simplify = "array"}, an array if appropriate.
#'
#' @export
cell_sampler <- function(dge, size, fun, groups = NULL, n = 1, simplify = TRUE,
                         replace = TRUE, funArgs = list()){

  # assign all cells to one group if not specified otherwise
  if (is.null(groups)){

    groups <- factor(rep(1, times = ncol(dge) - 1))

  }

  # verify input ---------------------------------------------------------------

  # check that groups is a factor of length equal to the number of cells
  if (!is.factor(groups) | length(groups) != ncol(dge) - 1){

    stop("groups needs to be a factor of length equal to the number of cells")

  }

  # group ids, number of groups and sizes
  group_ids <- levels(groups)
  n_groups <- length(group_ids)
  n_size <- length(size)

  # make sure that each group has a size
  if(n_size == 1){

    # use the same size for all groups if only one size is provided
    size <- rep(size, times = n_groups)

  }else if (n_size != n_groups){

    # abort if more than one size is provided, but the number of sizes do not
    # match the number of groups
    stop("size needs to be of the same length as the number of groups")

  }

  # order size according to group ids, if size has group ids as names
  if (!is.null(names(size))){

    # get group ids present in both groups and size
    common_ids <- intersect(group_ids, names(size))

    # sort size in the same order as group ids
    if (length(common_ids) == n_groups) {

      size <- size[group_ids]

    }else{

      stop("group ids and names of size do not agree")

    }

  }

  # sample cells and apply function n times ------------------------------------

  # build list of arguments for sample_cells_apply()
  argList <- list(dge, size, groups, fun, funArgs, replace)

  # sample cells and apply function fun n times using sample_cells_apply()
  sapply(1:n,
         FUN = function(i, args){

           # call sample_cells_apply() with argument list args
           do.call(sample_cells_apply, args)

         },
         args = argList,
         simplify = simplify)

}

#' Sample cells and apply function
#'
#' Sample a certain number of cells from a DGE data.frame and apply a specific
#' function to the subset.
#'
#' @param dge A data.frame containing DGE data, with rows being genes and
#' columns cells. The first column is assumed to provide the gene identifiers.
#'
#' @param sizes Number of cells that should be sampled from each group.
#'
#' @param groups A factor of length \code{ncol(dge) - 1} providing group
#' assignments for each cell in dge.
#'
#' @param fun Function to apply to sampled cells.
#'
#' @param funArgs A list with additional arguments for the applied function
#' \code{fun}.
#'
#' @param replace Should samples be drawn with replacement?
#'
#' @return Output of function \code{fun} applied to sampled cells.
sample_cells_apply <- function(dge, sizes, groups, fun, funArgs,
                               replace = TRUE){

  # sample cells from each group of cells
  dge_sample <- sample_cells(dge, sizes = sizes, groups = groups,
                             replace = replace)

  # build arguments list for function fun (basically upack funArgs)
  args <- c(list(dge_sample), funArgs)

  # apply function to sampled cells
  do.call(fun, args)

}

#' Randomly sample cells
#'
#' Sample a specific numbers of cells different subgroups of cells in a DGE
#' data.frame.
#'
#' @param dge A data.frame containing DGE data, with rows being genes and
#' columns cells. The first column is assumed to provide the gene identifiers.
#'
#' @param sizes Number of cells that should be sampled from each group.
#'
#' @param groups A factor of length \code{ncol(dge) - 1} providing group
#' assignments for each cell in dge.
#'
#' @param replace Should samples be drawn with replacement?
#'
#' @return A data.frame with sampled DGE data in the same format as \code{dge}.
sample_cells <- function(dge, sizes, groups, replace = TRUE){

  # randomly select indices of cells (cols in dge) from each group (levels)
  cells <- sample_from_levels(groups, sizes = sizes,
                              replace = replace) + 1  # + 1 for gene id in dge

  # get dge for these cells
  dge[, c(1, cells)]

}

#' Sample from levels of a factor
#'
#' Randomly select a specified number elements from each level (group) of a
#' factor.
#'
#' @param x A factor from whose levels should be sampled.
#'
#' @param sizes Number of elements that should be sampled from each level of
#' \code{x}. Assumed to have the same length as the number of elements.
#'
#' @param replace Should samples be drawn with replacement?
#'
#' @return An numeric vector containing indices of the elements sampled from
#' each level.
sample_from_levels <- function(x, sizes, replace = TRUE){

  # function to sample from a specific level of a vector
  sample_from_level <- function(level, size, vector, replace){

    # get indices of elements of a specific level
    indices <- which(vector == level)

    # sample from these indices
    sample(indices, size = size, replace = replace)

  }

  # sample from each level of x
  sampled_x <- mapply(FUN = sample_from_level, levels(x), sizes,
                      MoreArgs = list(x, replace),
                      SIMPLIFY = FALSE)

  # remove names
  names(sampled_x) <- NULL

  # convert to one vector
  unlist(sampled_x)

}

# example functions to apply ---------------------------------------------------

#' Average molecules per gene
#'
#' Calculate the average number of molecules (UMIs) per gene across all cells.
#'
#' @param dge A data.frame containing DGE data, with rows being genes and
#' columns cells. The first column is assumed to provide the gene identifiers.
#'
#' @return A named vector containing average UMI counts per gene.
#'
#' @export
avg_molecules <- function(dge){

  # calculate average expression per gene
  avg_mols <- rowMeans(dge[, -1])

  # set names
  structure(avg_mols, names = dge[, 1])

}

#' Number of molecules per gene
#'
#' Calculate the total number of molecules (UMIs) per gene across all cells.
#'
#' @param dge A data.frame containing DGE data, with rows being genes and
#' columns cells. The first column is assumed to provide the gene identifiers.
#'
#' @return A named vector containing total UMI counts per gene.
#'
#' @export
total_molecules <- function(dge){

  # calculate total molecules per gene
  total_mols <- rowSums(dge[, -1])

  # set names
  structure(total_mols, names = dge[, 1])

}
argschwind/dropseqr documentation built on May 23, 2019, 4:24 p.m.