R/aggregate_mapped_genes.R

Defines functions aggregate_mapped_genes

Documented in aggregate_mapped_genes

#' Aggregate/expand a gene matrix by gene mappings
#'
#' Aggregate/expand a gene matrix (\code{gene_df}) using a gene mapping 
#' \link[base]{data.frame} (\code{gene_map}). 
#' Importantly, mappings can be performed across a variety of scenarios that
#' can occur during within-species and between-species gene mapping:  
#' \itemize{
#' \item{1 gene : 1 gene}
#' \item{many genes : 1 gene}
#' \item{1 gene : many genes}
#' \item{many genes : many genes}
#' }
#' For more details on how aggregation/expansion is performed, 
#' please see: \link[orthogene]{many2many_rows}. 
#' 
#' @param gene_df Input matrix where row names are genes.
#' @param agg_fun Aggregation function (\emph{DEFAULT:} \code{"sum"}). 
#' @param agg_method Aggregation method.
#' @param agg_fun Aggregation function. 
#' @param transpose Transpose \code{gene_df} before mapping genes.
#' @param gene_map A \link[base]{data.frame} that maps the current gene names
#' to new gene names. 
#' This function's behaviour will adapt to different situations as follows: 
#' \itemize{
#' \item{\code{gene_map=<data.frame>} :\cr}{ When a data.frame containing the
#' gene key:value columns 
#' (specified by \code{input_col} and \code{output_col}, respectively)
#' is provided, this will be used to perform aggregation/expansion.}
#' \item{\code{gene_map=NULL} and \code{input_species!=output_species} :\cr}{
#' A \code{gene_map} is automatically generated by
#'  \link[orthogene]{map_orthologs} to perform inter-species 
#'  gene aggregation/expansion.}
#' \item{\code{gene_map=NULL} and \code{input_species==output_species} :\cr}{
#' A \code{gene_map} is automatically generated by
#'  \link[orthogene]{map_genes} to perform within-species 
#'  gene gene symbol standardization and aggregation/expansion.}
#' }
#' @param as_sparse Convert aggregated matrix to sparse matrix.
#' @param as_DelayedArray Convert aggregated matrix to
#' \link[DelayedArray]{DelayedArray}.
#' @param dropNA Drop genes assigned to \code{NA} in \code{groupings}.
#' @param sort_rows Sort \code{gene_df} rows alphanumerically.
#' @param as_integers Force all values in the matrix to become integers, 
#' by applying \link[base]{floor} (default: \code{FALSE}). 
#' @param verbose Print messages.
#'
#' @inheritParams map_genes
#' @inheritParams convert_orthologs
#' @inheritParams many2many_rows
#' @inheritParams gprofiler2::gconvert
#'
#' @return Aggregated matrix
#' @export
#' @importFrom Matrix t
#' @importFrom stats setNames 
#' @examples 
#' #### Aggregate within species: gene synonyms ####
#' data("exp_mouse_enst")                                
#' X_agg <- aggregate_mapped_genes(gene_df = exp_mouse_enst, 
#'                                 input_species = "mouse")  
#'                                  
#' #### Aggregate across species: gene orthologs ####               
#' data("exp_mouse")
#' X_agg2 <- aggregate_mapped_genes(gene_df = exp_mouse, 
#'                                  input_species = "mouse",
#'                                  output_species = "human",
#'                                  method="homologene")                                                     
aggregate_mapped_genes <- function(gene_df,
                                   gene_map = NULL,
                                   input_col = "input_gene",
                                   output_col = "ortholog_gene",
                                   input_species = "human",
                                   output_species = input_species,
                                   method = c(
                                       "gprofiler",
                                       "homologene",
                                       "babelgene"
                                   ),
                                   agg_fun = "sum",
                                   agg_method = c("monocle3", "stats"),
                                   aggregate_orthologs = TRUE,
                                   transpose = FALSE, 
                                   mthreshold = 1,
                                   target = "ENSG",
                                   numeric_ns = "",
                                   as_integers = FALSE,
                                   as_sparse = TRUE,
                                   as_DelayedArray = FALSE,
                                   dropNA = TRUE,
                                   sort_rows = FALSE,
                                   verbose = TRUE) {
    
    # echoverseTemplate:::source_all(packages = "dplyr")
    # devoptera::args2vars(aggregate_mapped_genes)
    
    #### Transpose matrix first (optional) ####
    if (isTRUE(transpose)) {
        gene_df <- Matrix::t(gene_df)
    } 
    og_dim <- dim(gene_df)
    ### Create gene_map if none provided #### 
    input_genes <- rownames(gene_df)
    if (is.null(gene_map)) { 
        #### Translate species ####
        if(input_species!=output_species){
            gene_map <- map_orthologs(genes = input_genes,
                                      standardise_genes = FALSE, 
                                      input_species = input_species, 
                                      output_species = output_species, 
                                      method = method,
                                      mthreshold = mthreshold,
                                      verbose = verbose)
        } else {
            #### Standardize genes ####
            gene_map <- map_genes(
                genes = input_genes,
                species = output_species,
                run_map_species = TRUE,
                ## Important!:
                # mthreshold=1 means gene can only appear once in input col.
                # Ensures that nrow(gene_map)==nrow(X)
                mthreshold = mthreshold,
                drop_na = FALSE,
                target = target, 
                numeric_ns = numeric_ns, 
                verbose = verbose
            )
            input_col <- "input"
            output_col <- "name"
        } 
    }  
    #### Check input_col/output_col ####
    check_gene_map(gene_map = gene_map, 
                   input_col = input_col,
                   output_col = output_col)    
    output_genes <- gene_map[[output_col]]
    #### Check if numbers of UNIQUE old/new genes are equal ####
    ## If so, just rename rows and exit early. 
    if(length(unique(input_genes)) == 
       length(unique(output_genes)) ){
        messager("gene_map has the same number of rows as gene_df", 
                 "and thus there is nothing to aggregate.",
                 "Returning gene_df with gene names from gene_map instead",
                 v=verbose)
        gene_dict <- stats::setNames(output_genes,input_genes)
        rownames(gene_df) <- gene_dict[rownames(gene_df)]
        is_na <- find_all_nas(v = rownames(gene_df))
        if(dropNA) gene_df <- gene_df[!is_na,]
        return(gene_df)
    }
    #### Aggregate genes #### 
    if(nrow(gene_map) == nrow(gene_df)){
        ## Round 1: map input gene names onto smaller subset of 
        ## standardized gene names. 
        if(length(unique(input_genes))!=nrow(gene_df)){
            gene_df <- aggregate_rows(
                X = gene_df,
                groupings = gene_map[[input_col]],
                agg_fun = agg_fun,
                agg_method = agg_method,
                as_sparse = as_sparse,
                as_DelayedArray = as_DelayedArray,
                dropNA = dropNA,
                verbose = verbose
            )
        } 
        ## Round 2: map input gene names onto output genes.. 
        gene_df <- aggregate_rows(
            X = gene_df,
            groupings = gene_map[[output_col]],
            agg_fun = agg_fun,
            agg_method = agg_method,
            as_sparse = as_sparse,
            as_DelayedArray = as_DelayedArray,
            dropNA = dropNA,
            verbose = verbose
        )
    } else {
        #### Less efficient but more flexible aggregation/expansion method ####
        ## Rounds 1 and 2 are wrapped within this one function. 
        gene_df <- many2many_rows(
            X = gene_df, 
            gene_map = gene_map,
            agg_fun = agg_fun, 
            agg_method = agg_method,
            input_col = input_col,
            output_col = output_col,
            aggregate_orthologs = aggregate_orthologs,
            dropNA = dropNA,
            as_sparse = as_sparse, 
            as_DelayedArray = as_DelayedArray,
            verbose = verbose
            )
    }  
    if(as_integers){
        gene_df <- to_sparse(as.matrix(floor(gene_df)))
    }
    if (sort_rows) {
        gene_df <- sort_rows_func(
            gene_df = gene_df,
            verbose = verbose
        )
    }
    messager("Matrix aggregated:\n",
        " - Input:", paste(formatC(og_dim, big.mark = ","),
            collapse = " x "
        ), "\n",
        " - Output:", paste(formatC(dim(gene_df), big.mark = ","),
            collapse = " x "
        ),
        v = verbose
    )
    return(gene_df)
}
neurogenomics/orthogene documentation built on Jan. 30, 2024, 4:44 a.m.