#' Sample cells and apply function
#'
#' Sample a certain number of cells from a DGE data.frame and apply a specific
#' function to the subset.
#'
#' @param dge A data.frame containing DGE data, with rows being genes and
#' columns cells. The first column is assumed to provide the gene identifiers.
#'
#' @param size Number of cells that should be sampled. If different sizes should
#' be sampled each specified group, \code{size} needs to be a vector of length
#' \code{length(levels(groups))} providing sizes for each group of cells. If
#' \code{groups} is used it's recommendend that \code{size} is a named vector
#' with group ids as names, as this will ensure correct sizes per group.
#'
#' @param fun Function to apply to sampled cells. Needs to take a DGE data.frame
#' as it's first positional argument.
#'
#' @param groups Optional; a factor of the same length as the number of cells
#' providing group assignments for each cell in dge. If \code{groups = NULL} all
#' cells are assigned to the same group.
#'
#' @param n Number of times a sample is drawn and the function is applied.
#'
#' @param simplify Logical or character string specifying if and how the output
#' should be simplified. See help for \link[base]{sapply} for more information.
#'
#' @param replace Should samples be drawn with replacement?
#'
#' @param funArgs Optional; a list with additional arguments for the applied
#' function \code{fun}.
#'
#' @return A list of output objects from each sampling iteration of the selected
#' function \code{fun} if \code{simplify = FALSE}. If \code{simplify = TRUE} a
#' vector, matrix, or, if \code{simplify = "array"}, an array if appropriate.
#'
#' @export
cell_sampler <- function(dge, size, fun, groups = NULL, n = 1, simplify = TRUE,
replace = TRUE, funArgs = list()){
# assign all cells to one group if not specified otherwise
if (is.null(groups)){
groups <- factor(rep(1, times = ncol(dge) - 1))
}
# verify input ---------------------------------------------------------------
# check that groups is a factor of length equal to the number of cells
if (!is.factor(groups) | length(groups) != ncol(dge) - 1){
stop("groups needs to be a factor of length equal to the number of cells")
}
# group ids, number of groups and sizes
group_ids <- levels(groups)
n_groups <- length(group_ids)
n_size <- length(size)
# make sure that each group has a size
if(n_size == 1){
# use the same size for all groups if only one size is provided
size <- rep(size, times = n_groups)
}else if (n_size != n_groups){
# abort if more than one size is provided, but the number of sizes do not
# match the number of groups
stop("size needs to be of the same length as the number of groups")
}
# order size according to group ids, if size has group ids as names
if (!is.null(names(size))){
# get group ids present in both groups and size
common_ids <- intersect(group_ids, names(size))
# sort size in the same order as group ids
if (length(common_ids) == n_groups) {
size <- size[group_ids]
}else{
stop("group ids and names of size do not agree")
}
}
# sample cells and apply function n times ------------------------------------
# build list of arguments for sample_cells_apply()
argList <- list(dge, size, groups, fun, funArgs, replace)
# sample cells and apply function fun n times using sample_cells_apply()
sapply(1:n,
FUN = function(i, args){
# call sample_cells_apply() with argument list args
do.call(sample_cells_apply, args)
},
args = argList,
simplify = simplify)
}
#' Sample cells and apply function
#'
#' Sample a certain number of cells from a DGE data.frame and apply a specific
#' function to the subset.
#'
#' @param dge A data.frame containing DGE data, with rows being genes and
#' columns cells. The first column is assumed to provide the gene identifiers.
#'
#' @param sizes Number of cells that should be sampled from each group.
#'
#' @param groups A factor of length \code{ncol(dge) - 1} providing group
#' assignments for each cell in dge.
#'
#' @param fun Function to apply to sampled cells.
#'
#' @param funArgs A list with additional arguments for the applied function
#' \code{fun}.
#'
#' @param replace Should samples be drawn with replacement?
#'
#' @return Output of function \code{fun} applied to sampled cells.
sample_cells_apply <- function(dge, sizes, groups, fun, funArgs,
replace = TRUE){
# sample cells from each group of cells
dge_sample <- sample_cells(dge, sizes = sizes, groups = groups,
replace = replace)
# build arguments list for function fun (basically upack funArgs)
args <- c(list(dge_sample), funArgs)
# apply function to sampled cells
do.call(fun, args)
}
#' Randomly sample cells
#'
#' Sample a specific numbers of cells different subgroups of cells in a DGE
#' data.frame.
#'
#' @param dge A data.frame containing DGE data, with rows being genes and
#' columns cells. The first column is assumed to provide the gene identifiers.
#'
#' @param sizes Number of cells that should be sampled from each group.
#'
#' @param groups A factor of length \code{ncol(dge) - 1} providing group
#' assignments for each cell in dge.
#'
#' @param replace Should samples be drawn with replacement?
#'
#' @return A data.frame with sampled DGE data in the same format as \code{dge}.
sample_cells <- function(dge, sizes, groups, replace = TRUE){
# randomly select indices of cells (cols in dge) from each group (levels)
cells <- sample_from_levels(groups, sizes = sizes,
replace = replace) + 1 # + 1 for gene id in dge
# get dge for these cells
dge[, c(1, cells)]
}
#' Sample from levels of a factor
#'
#' Randomly select a specified number elements from each level (group) of a
#' factor.
#'
#' @param x A factor from whose levels should be sampled.
#'
#' @param sizes Number of elements that should be sampled from each level of
#' \code{x}. Assumed to have the same length as the number of elements.
#'
#' @param replace Should samples be drawn with replacement?
#'
#' @return An numeric vector containing indices of the elements sampled from
#' each level.
sample_from_levels <- function(x, sizes, replace = TRUE){
# function to sample from a specific level of a vector
sample_from_level <- function(level, size, vector, replace){
# get indices of elements of a specific level
indices <- which(vector == level)
# sample from these indices
sample(indices, size = size, replace = replace)
}
# sample from each level of x
sampled_x <- mapply(FUN = sample_from_level, levels(x), sizes,
MoreArgs = list(x, replace),
SIMPLIFY = FALSE)
# remove names
names(sampled_x) <- NULL
# convert to one vector
unlist(sampled_x)
}
# example functions to apply ---------------------------------------------------
#' Average molecules per gene
#'
#' Calculate the average number of molecules (UMIs) per gene across all cells.
#'
#' @param dge A data.frame containing DGE data, with rows being genes and
#' columns cells. The first column is assumed to provide the gene identifiers.
#'
#' @return A named vector containing average UMI counts per gene.
#'
#' @export
avg_molecules <- function(dge){
# calculate average expression per gene
avg_mols <- rowMeans(dge[, -1])
# set names
structure(avg_mols, names = dge[, 1])
}
#' Number of molecules per gene
#'
#' Calculate the total number of molecules (UMIs) per gene across all cells.
#'
#' @param dge A data.frame containing DGE data, with rows being genes and
#' columns cells. The first column is assumed to provide the gene identifiers.
#'
#' @return A named vector containing total UMI counts per gene.
#'
#' @export
total_molecules <- function(dge){
# calculate total molecules per gene
total_mols <- rowSums(dge[, -1])
# set names
structure(total_mols, names = dge[, 1])
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.