#' Aggregate/expand a gene matrix by gene mappings
#'
#' Aggregate/expand a gene matrix (\code{gene_df}) using a gene mapping
#' \link[base]{data.frame} (\code{gene_map}).
#' Importantly, mappings can be performed across a variety of scenarios that
#' can occur during within-species and between-species gene mapping:
#' \itemize{
#' \item{1 gene : 1 gene}
#' \item{many genes : 1 gene}
#' \item{1 gene : many genes}
#' \item{many genes : many genes}
#' }
#' For more details on how aggregation/expansion is performed,
#' please see: \link[orthogene]{many2many_rows}.
#'
#' @param gene_df Input matrix where row names are genes.
#' @param agg_fun Aggregation function (\emph{DEFAULT:} \code{"sum"}).
#' @param agg_method Aggregation method.
#' @param agg_fun Aggregation function.
#' @param transpose Transpose \code{gene_df} before mapping genes.
#' @param gene_map A \link[base]{data.frame} that maps the current gene names
#' to new gene names.
#' This function's behaviour will adapt to different situations as follows:
#' \itemize{
#' \item{\code{gene_map=<data.frame>} :\cr}{ When a data.frame containing the
#' gene key:value columns
#' (specified by \code{input_col} and \code{output_col}, respectively)
#' is provided, this will be used to perform aggregation/expansion.}
#' \item{\code{gene_map=NULL} and \code{input_species!=output_species} :\cr}{
#' A \code{gene_map} is automatically generated by
#' \link[orthogene]{map_orthologs} to perform inter-species
#' gene aggregation/expansion.}
#' \item{\code{gene_map=NULL} and \code{input_species==output_species} :\cr}{
#' A \code{gene_map} is automatically generated by
#' \link[orthogene]{map_genes} to perform within-species
#' gene gene symbol standardization and aggregation/expansion.}
#' }
#' @param as_sparse Convert aggregated matrix to sparse matrix.
#' @param as_DelayedArray Convert aggregated matrix to
#' \link[DelayedArray]{DelayedArray}.
#' @param dropNA Drop genes assigned to \code{NA} in \code{groupings}.
#' @param sort_rows Sort \code{gene_df} rows alphanumerically.
#' @param as_integers Force all values in the matrix to become integers,
#' by applying \link[base]{floor} (default: \code{FALSE}).
#' @param verbose Print messages.
#'
#' @inheritParams map_genes
#' @inheritParams convert_orthologs
#' @inheritParams many2many_rows
#' @inheritParams gprofiler2::gconvert
#'
#' @return Aggregated matrix
#' @export
#' @importFrom Matrix t
#' @importFrom stats setNames
#' @examples
#' #### Aggregate within species: gene synonyms ####
#' data("exp_mouse_enst")
#' X_agg <- aggregate_mapped_genes(gene_df = exp_mouse_enst,
#' input_species = "mouse")
#'
#' #### Aggregate across species: gene orthologs ####
#' data("exp_mouse")
#' X_agg2 <- aggregate_mapped_genes(gene_df = exp_mouse,
#' input_species = "mouse",
#' output_species = "human",
#' method="homologene")
aggregate_mapped_genes <- function(gene_df,
gene_map = NULL,
input_col = "input_gene",
output_col = "ortholog_gene",
input_species = "human",
output_species = input_species,
method = c(
"gprofiler",
"homologene",
"babelgene"
),
agg_fun = "sum",
agg_method = c("monocle3", "stats"),
aggregate_orthologs = TRUE,
transpose = FALSE,
mthreshold = 1,
target = "ENSG",
numeric_ns = "",
as_integers = FALSE,
as_sparse = TRUE,
as_DelayedArray = FALSE,
dropNA = TRUE,
sort_rows = FALSE,
verbose = TRUE) {
# echoverseTemplate:::source_all(packages = "dplyr")
# devoptera::args2vars(aggregate_mapped_genes)
#### Transpose matrix first (optional) ####
if (isTRUE(transpose)) {
gene_df <- Matrix::t(gene_df)
}
og_dim <- dim(gene_df)
### Create gene_map if none provided ####
input_genes <- rownames(gene_df)
if (is.null(gene_map)) {
#### Translate species ####
if(input_species!=output_species){
gene_map <- map_orthologs(genes = input_genes,
standardise_genes = FALSE,
input_species = input_species,
output_species = output_species,
method = method,
mthreshold = mthreshold,
verbose = verbose)
} else {
#### Standardize genes ####
gene_map <- map_genes(
genes = input_genes,
species = output_species,
run_map_species = TRUE,
## Important!:
# mthreshold=1 means gene can only appear once in input col.
# Ensures that nrow(gene_map)==nrow(X)
mthreshold = mthreshold,
drop_na = FALSE,
target = target,
numeric_ns = numeric_ns,
verbose = verbose
)
input_col <- "input"
output_col <- "name"
}
}
#### Check input_col/output_col ####
check_gene_map(gene_map = gene_map,
input_col = input_col,
output_col = output_col)
output_genes <- gene_map[[output_col]]
#### Check if numbers of UNIQUE old/new genes are equal ####
## If so, just rename rows and exit early.
if(length(unique(input_genes)) ==
length(unique(output_genes)) ){
messager("gene_map has the same number of rows as gene_df",
"and thus there is nothing to aggregate.",
"Returning gene_df with gene names from gene_map instead",
v=verbose)
gene_dict <- stats::setNames(output_genes,input_genes)
rownames(gene_df) <- gene_dict[rownames(gene_df)]
is_na <- find_all_nas(v = rownames(gene_df))
if(dropNA) gene_df <- gene_df[!is_na,]
return(gene_df)
}
#### Aggregate genes ####
if(nrow(gene_map) == nrow(gene_df)){
## Round 1: map input gene names onto smaller subset of
## standardized gene names.
if(length(unique(input_genes))!=nrow(gene_df)){
gene_df <- aggregate_rows(
X = gene_df,
groupings = gene_map[[input_col]],
agg_fun = agg_fun,
agg_method = agg_method,
as_sparse = as_sparse,
as_DelayedArray = as_DelayedArray,
dropNA = dropNA,
verbose = verbose
)
}
## Round 2: map input gene names onto output genes..
gene_df <- aggregate_rows(
X = gene_df,
groupings = gene_map[[output_col]],
agg_fun = agg_fun,
agg_method = agg_method,
as_sparse = as_sparse,
as_DelayedArray = as_DelayedArray,
dropNA = dropNA,
verbose = verbose
)
} else {
#### Less efficient but more flexible aggregation/expansion method ####
## Rounds 1 and 2 are wrapped within this one function.
gene_df <- many2many_rows(
X = gene_df,
gene_map = gene_map,
agg_fun = agg_fun,
agg_method = agg_method,
input_col = input_col,
output_col = output_col,
aggregate_orthologs = aggregate_orthologs,
dropNA = dropNA,
as_sparse = as_sparse,
as_DelayedArray = as_DelayedArray,
verbose = verbose
)
}
if(as_integers){
gene_df <- to_sparse(as.matrix(floor(gene_df)))
}
if (sort_rows) {
gene_df <- sort_rows_func(
gene_df = gene_df,
verbose = verbose
)
}
messager("Matrix aggregated:\n",
" - Input:", paste(formatC(og_dim, big.mark = ","),
collapse = " x "
), "\n",
" - Output:", paste(formatC(dim(gene_df), big.mark = ","),
collapse = " x "
),
v = verbose
)
return(gene_df)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.