mia: Microbiome analysis

#' Split \code{TreeSummarizedExperiment} column-wise or row-wise based on grouping variable
#'
#' @param x A
#'   \code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}}
#'   object or a list of 
#'   \code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}}
#'   objects.
#'
#' @param f A single character value for selecting the grouping variable
#'   from \code{rowData} or \code{colData} or a \code{factor} or \code{vector} 
#'   with the same length as one of the dimensions. If \code{f} matches with both
#'   dimensions, \code{MARGIN} must be specified. 
#'   Split by cols is not encouraged, since this is not compatible with 
#'   storing the results in \code{altExps}.
#'
#' @param keep_reducedDims \code{TRUE} or \code{FALSE}: Should the
#'   \code{reducedDims(x)} be transferred to the result? Please note, that this
#'   breaks the link between the data used to calculate the reduced dims.
#'   (By default: \code{keep_reducedDims = FALSE})
#'   
#' @param update_rowTree \code{TRUE} or \code{FALSE}: Should the rowTree be updated
#'   based on splitted data? Option is enabled when \code{x} is a 
#'   \code{TreeSummarizedExperiment} object or a list of such objects. 
#'   (By default: \code{update_rowTree = FALSE})
#'   
#' @param altExpNames a \code{character} vector specifying the alternative experiments
#'   to be unsplit. (By default: \code{altExpNames = names(altExps(x))})
#'   
#' @param ... Arguments passed to \code{agglomerateByVariable} function for
#'   \code{SummarizedExperiment} objects and other functions.
#'   See \code{\link[=agglomerate-methods]{agglomerateByVariable}} for more 
#'   details.
#'   \itemize{
#'     \item{\code{use_names} A single boolean value to select whether to name elements of
#'     list by their group names.}
#'   }
#'
#'
#' @details
#' \code{splitOn} split data based on grouping variable. Splitting can be done
#' column-wise or row-wise. The returned value is a list of
#' \code{SummarizedExperiment} objects; each element containing members of each
#' group.
#'
#' @return
#' For \code{splitOn}: \code{SummarizedExperiment} objects in a \code{SimpleList}.
#'
#' For \code{unsplitOn}: \code{x}, with \code{rowData} and \code{assay}
#' data replaced by the unsplit data. \code{colData} of x is kept as well
#' and any existing \code{rowTree} is dropped as well, since existing
#' \code{rowLinks} are not valid anymore.
#' 
#' @name splitOn
#' @seealso
#' \code{\link[=splitByRanks]{splitByRanks}}
#' \code{\link[=unsplitByRanks]{unsplitByRanks}}
#' \code{\link[=agglomerate-methods]{agglomerateByVariable}},
#' \code{\link[scuttle:sumCountsAcrossFeatures]{sumCountsAcrossFeatures}},
#' \code{\link[=agglomerate-methods]{agglomerateByRank}},
#' \code{\link[SingleCellExperiment:altExps]{altExps}},
#' \code{\link[SingleCellExperiment:splitAltExps]{splitAltExps}}
#'
#' @export
#' @author Leo Lahti and Tuomas Borman. Contact: \url{microbiome.github.io}
#' 
#' @examples
#' data(GlobalPatterns)
#' tse <- GlobalPatterns
#' # Split data based on SampleType. 
#' se_list <- splitOn(tse, f = "SampleType")
#' 
#' # List of SE objects is returned. 
#' se_list
#' 
#' # Create arbitrary groups
#' rowData(tse)$group <- sample(1:3, nrow(tse), replace = TRUE)
#' colData(tse)$group <- sample(1:3, ncol(tse), replace = TRUE)
#' 
#' # Split based on rows
#' # Each element is named based on their group name. If you don't want to name
#' # elements, use use_name = FALSE. Since "group" can be found from rowdata and colData
#' # you must use MARGIN.
#' se_list <- splitOn(tse, f = "group", use_names = FALSE, MARGIN = 1)
#' 
#' # When column names are shared between elements, you can store the list to altExps
#' altExps(tse) <- se_list
#' 
#' altExps(tse)
#' 
#' # If you want to split on columns and update rowTree, you can do
#' se_list <- splitOn(tse, f = colData(tse)$group, update_rowTree = TRUE)
#' 
#' # If you want to combine groups back together, you can use unsplitBy
#' unsplitOn(se_list)
#' 
NULL

#' @rdname splitOn
#' @export
setGeneric("splitOn",
            signature = "x",
            function(x, ...)
                standardGeneric("splitOn"))

# This function collects f (grouping variable), MARGIN, and 
# use_names and returns them as a list.
.norm_args_for_split_by <- function(x, f, MARGIN = NULL, use_names = TRUE, ...){
    # input check
    # Check f
    if(is.null(f)){
        stop("'f' must either be a single non-empty character value or",
            " vector coercible to factor alongside the one of the dimensions of 'x'",
            call. = FALSE)
    }
    # Check MARGIN
    if( !(is.null(MARGIN) || (is.numeric(MARGIN) && (MARGIN == 1 || MARGIN == 2 ))) ){
        stop("'MARGIN' must be NULL, 1, or 2.", call. = FALSE )
    }
    # If f is a vector containing levels
    if( !.is_non_empty_string(f) ){
        # Convert into factors
        f <- factor(f, unique(f))
        # Check if the length of f matches with one of the dimensions
        if(!length(f) %in% dim(x)){
            stop("'f' must either be a single non-empty character value or",
                " vector coercible to factor alongside the on of the ",
                "dimensions of 'x'.",
                call. = FALSE)
        # If it matches with both dimensions, give error if MARGIN is not specified
        } else if( is.null(MARGIN) && all(length(f) == dim(x)) ){
            stop("The length of 'f' matches with nrow and ncol. ",
                "Please specify 'MARGIN'.", call. = FALSE)
        # If MARGIN is specified but it does not match with length of f
        } else if( !is.null(MARGIN) && (length(f) !=  dim(x)[[MARGIN]]) ){
            stop("'f' does not match with ", 
                ifelse(MARGIN==1, "nrow", "ncol"), ". Please check 'MARGIN'.",
                call. = FALSE)
        # IF f matches with nrow
        } else if(length(f) == dim(x)[[1]] && is.null(MARGIN)  ){
            MARGIN <- 1L
        # If f matches with ncol
        } else if( is.null(MARGIN) ){
            MARGIN <- 2L
        }
    # Else if f is a character specifying column from rowData or colData  
    } else {
        # If MARGIN is specified
        if( !is.null(MARGIN) ){
            # Search from rowData or colData based on MARGIN
            dim_name <- switch(MARGIN,
                                "1" = "rowData",
                                "2" = "colData")
            # Specify right function
            dim_FUN <- switch(MARGIN,
                                "1" = retrieveFeatureInfo,
                                "2" = retrieveCellInfo)
            # Try to get information
            tmp <- try({dim_FUN(x, f, search = dim_name)},
                        silent = TRUE)
            # Give error if it cannot be found
            if(is(tmp,"try-error")){
                stop("'f' is not found. ",
                    "Please check that 'f' specifies a column from ", dim_name, ".", 
                    call. = FALSE)
            }
            # Get values
            f <- tmp$value
        # Else if MARGIN is not specified
        } else{
            # Try to get information from rowData
            tmp_row <- try({retrieveFeatureInfo(x, f, search = "rowData")},
                            silent = TRUE)
            # Try to get information from colData
            tmp_col <- try({retrieveCellInfo(x, f, search = "colData")}, 
                            silent = TRUE)
            
            # If it was not found 
            if( is(tmp_row, "try-error") && is(tmp_col, "try-error") ){
                stop("'f' is not found. ",
                    "Please check that 'f' specifies a column from ",
                    "rowData or colData.", 
                    call. = FALSE)
                # If f was found from both
            } else if( !is(tmp_row, "try-error") && !is(tmp_col, "try-error") ){
                stop("'f' can be found from both rowData and colData. ",
                    "Please specify 'MARGIN'.",
                    call. = FALSE)
                # If it was found from rowData
            } else if( !is(tmp_row, "try-error") ){
                MARGIN <- 1L
                # Get values
                f <- tmp_row$value
                # Otherwise, it was found from colData
            } else{
                MARGIN <- 2L
                # Get values
                f <- tmp_col$value
            }
        }
        # Convert values into factors
        f <- factor(f, unique(f))
        
        # If there are NAs, add NA as level
        if( any(is.na(f)) ){
            f <- addNA(f)
        }
    }
    # Check use_names
    if( !.is_a_bool(use_names) ){
        stop("'use_names' must be TRUE or FALSE.",
            call. = FALSE)
    }
    # Create a list from arguments
    list(f = f,
        MARGIN = MARGIN,
        use_names = use_names)
}

# PErform the split
.split_on <- function(x, args, ...){
    # Get grouping variable and its values
    f <- args[["f"]]
    # Choose nrow or ncol based on MARGIN
    dim_FUN <- switch(args[["MARGIN"]],
                        "1" = nrow,
                        "2" = ncol)
    # Get indices from 1 to nrow/ncol
    idx <- seq_len(dim_FUN(x))
    # Split indices into groups based on grouping variable
    idxs <- split(idx, f)
    # Subset function takes SE and list of groups which have indices
    # It divides the data into groups
    subset_FUN <- function(x, i = TRUE, j = TRUE){
        x[i, j]
    }
    # Based on MARGIN, divide data in row-wise or column-wise
    if(args[["MARGIN"]] == 1){
        ans <- SimpleList(lapply(idxs, subset_FUN, x = x))
    } else {
        ans <- SimpleList(lapply(idxs, subset_FUN, x = x, i = TRUE))
    }
    # If user do not want to use names, unname
    if(!args[["use_names"]]){
        ans <- unname(ans)
    # Otherwise convert NAs to "NA", if there is a level that do not have name
    } else{
        names(ans)[ is.na(names(ans)) ] <- "NA"
    }
    ans
}

#' @rdname splitOn
#' @export
setMethod("splitOn", signature = c(x = "SummarizedExperiment"),
    function(x, f = NULL,  ...){
        # Get arguments
        args <- .norm_args_for_split_by(x, f = f, ...)
        # Split data
        .split_on(x, args, ...)
    }
)

#' @rdname splitOn
#' @export
setMethod("splitOn", signature = c(x = "SingleCellExperiment"),
    function(x, f = NULL, ...){
        # Get arguments
        args <- .norm_args_for_split_by(x, f = f, ...)
        # Should alternative experiment be removed? --> yes
        args[["strip_altexp"]] <- TRUE
        # Split data
        .split_on(x, args, ...)
    }
)

#' @rdname splitOn
#' @export
setMethod("splitOn", signature = c(x = "TreeSummarizedExperiment"),
    function(x, f = NULL, update_rowTree = FALSE,
            ...){
        # Input check
        # Check update_rowTree
        if( !.is_a_bool(update_rowTree) ){
            stop("'update_rowTree' must be TRUE or FALSE.",
                call. = FALSE)
        }
        # Input check end
        # Split data
        x <- callNextMethod()
        # Manipulate rowTree or not?
        if( update_rowTree ){
            # If the returned value is a list, go through all of them
            if( is(x, 'SimpleList') ){
                x <- SimpleList(lapply(x, .agglomerate_trees))

            } else {
                # Otherwise, the returned value is TreeSE
                x <- .agglomerate_trees(x)
            }
        }
        x
    }
)

################################################################################
# unsplitOn

#' @rdname splitOn
#' @export
setGeneric("unsplitOn",
            signature = c("x"),
            function(x, ...)
                standardGeneric("unsplitOn"))

# Perform the unsplit
.list_unsplit_on <- function(ses, update_rowTree = FALSE, MARGIN = NULL, ...){
    # Input check
    is_check <- vapply(ses,is,logical(1L),"SummarizedExperiment")
    if(!all(is_check)){
        stop("Input must be a list of SummarizedExperiment or derived objects ",
            "only.",
            call. = FALSE)
    }
    # Check update_rowTree
    if( !.is_a_bool(update_rowTree) ){
        stop("'update_rowTree' must be TRUE or FALSE.",
            call. = FALSE)
    }
    if( !(is.null(MARGIN) || (is.numeric(MARGIN) && (MARGIN == 1 || MARGIN == 2 ))) ){
        stop("'MARGIN' must be NULL, 1, or 2.", call. = FALSE )
    }
    # Input check end
    # If list contains only one element, return it
    if( length(ses) == 1 ){
        return(ses[[1]])
    }
    # Get dimensions of each SE in the list
    dims <- vapply(ses, dim, integer(2L))
    # Based on which dimension SE objects share, select MARGIN.
    # If they share rows, then MARGIN is col, and vice versa
    if( is.null(MARGIN) ){
        if( length(unique(dims[1L,])) == 1 && length(unique(dims[2L,])) == 1 ){
            stop("The dimensions match with row and column-wise. ",
                "Please specify 'MARGIN'.", call. = FALSE)
        } else if(length(unique(dims[1L,])) == 1L){
            MARGIN <- 2L
        } else if(length(unique(dims[2L,])) == 1L) {
            MARGIN <- 1L
        } else {
            stop("The dimensions are not equal across all elements. ", 
                "Please check that either number of rows or columns match.", 
                call. = FALSE)
        }
    } else{
        # Get correct dimension, it is opposite of MARGIN
        dim <- ifelse(MARGIN == 1, 2, 1)
        if( length(unique(dims[dim,])) != 1L ){
            stop("The dimensions are not equal across all elements.", call. = FALSE)
        }
    }
    
    # Get the class of objects SCE, SE or TreeSE
    class_x <- class(ses[[1L]])
    # Combine assays
    args <- list(assays = .unsplit_assays(ses, MARGIN = MARGIN))
    # Combine rowData if data share columns
    if(MARGIN == 1L){
        rd <- .combine_rowData(ses)
        # Add rownames since they are missing after using combining
        rownames(rd) <- unlist(unname(lapply(ses, rownames)))
        rr <- .combine_rowRanges(ses)
        args$rowRanges <- rr
        args$colData <- colData(ses[[1L]])
    # Combine colData if data share rows
    } else {
        args$colData <- .combine_colData(ses)
        args$rowRanges <- rowRanges(ses[[1L]])
        rd <- rowData(ses[[1L]])
    }
    # Create a object specified by class_x from the data
    ans <- do.call(class_x, args)
    # Add rowData
    rowData(ans) <- rd
    # Update rownames
    rownames(ans) <- rownames(rd)
    
    # IF the object is TreeSE. add rowTree
    if( class_x == "TreeSummarizedExperiment" ){
        # Update or add old tree from the first element of list
        if( update_rowTree ){
            ans <- addHierarchyTree(ans)
        } else{
            rowTree(ans) <- rowTree(ses[[1L]])
        }
    }
    ans
}

#' @importFrom SummarizedExperiment colData
#' @importFrom BiocGenerics rbind
.combine_colData <- function(ses) {
    # Get colDatas of objects
    cds <- lapply(ses, colData)
    # Bind them together row-wise
    cd <- do.call(rbind,unname(cds))
    # Add sample names
    rownames(cd) <- unlist(unname(lapply(ses, colnames)))
    cd
}


#' @rdname splitOn
#' @importFrom SingleCellExperiment altExpNames altExp altExps
#' @export
setMethod("unsplitOn", signature = c(x = "list"),
    function(x, update_rowTree = FALSE, ...){
        # Unsplit list and create SCE, SE, or TreeSE from it
        .list_unsplit_on(x, update_rowTree, ...)
    }
)
#' @rdname splitOn
#' @importFrom SingleCellExperiment altExpNames altExp altExps
#' @export
setMethod("unsplitOn", signature = c(x = "SimpleList"),
    function(x, update_rowTree = FALSE, ...){
        unsplitOn(as.list(x), update_rowTree, ...)
    }
)

#' @rdname splitOn
#' @importFrom SingleCellExperiment altExpNames altExp altExps reducedDims<-
#' @export
setMethod("unsplitOn", signature = c(x = "SingleCellExperiment"),
    function(x, altExpNames = names(altExps(x)), keep_reducedDims = FALSE, ...){
        # input check
        if(!.is_a_bool(keep_reducedDims)){
            stop("'keep_reducedDims' must be TRUE or FALSE.", call. = FALSE)
        }
        # Get alternative experiment names since data is located there
        ae_names <- names(altExps(x))
        # Get only those experiments that user has specified
        ae_names <- ae_names[ae_names %in% altExpNames]
        if(length(ae_names) == 0L){
            stop("No altExp matching 'altExpNames' in name.", call. = FALSE)
        }
        # Get alternative experiments as a list
        ses <- altExps(x)[ae_names]
        # And unsplit the data
        se <- .list_unsplit_on(ses, ...)
        # Add reducedDims if specified
        if( keep_reducedDims ){
            reducedDims(se) <- reducedDims(x)
        }
        return(se)
    }
)