mia: Microbiome analysis

.norm_f <- function(i, f, dim.type = c("rows","columns")){
    dim.type <- match.arg(dim.type)
    if(!is.character(f) && !is.factor(f)){
        stop("'f' must be a factor or character vector coercible to a ",
            "meaningful factor.",
            call. = FALSE)
    }
    if(i != length(f)){
        stop("'f' must have the same number of ",dim.type," as 'x'",
            call. = FALSE)
    }
    if(is.character(f)){
        f <- factor(f)
    }
    f
}

.norm_archetype <- function(f, archetype){
    if(length(archetype) > 1L){
        if(length(levels(f)) != length(archetype)){
            stop("length of 'archetype' must have the same length as ",
                "levels('f')",
                call. = FALSE)
        }
    }
    f_table <- table(f)
    if(!is.null(names(archetype))){
        if(anyNA(names(archetype)) || anyDuplicated(names(archetype))){
            stop("If 'archetype' is named, names must be non-NA and unqiue.",
                call. = FALSE)
        }
        archetype <- archetype[names(f_table)]
    }
    if(any(f_table < archetype)){
        stop("'archetype' out of bounds for some levels of 'f'. The maximum of",
            " 'archetype' is defined as table('f')", call. = FALSE)
    }
    if(length(archetype) == 1L){
        archetype <- rep(archetype,length(levels(f)))
    }
    archetype
}

#' @importFrom S4Vectors splitAsList
.get_element_pos <- function(f, archetype){
    archetype <- as.list(archetype)
    f_pos <- seq.int(1L, length(f))
    f_pos_split <- S4Vectors::splitAsList(f_pos, f)
    f_pos <- unlist(f_pos_split[archetype])
    f_pos
}

#' @importFrom S4Vectors SimpleList
#' @importFrom scuttle sumCountsAcrossFeatures
.merge_rows <- function(x, f, archetype = 1L, 
                        average = FALSE,
                        BPPARAM = SerialParam(),
                        check.assays = TRUE,
                        ...){
    # input check
    if( !.is_a_bool(average) ){
        stop("'average' must be TRUE or FALSE.", call. = FALSE)
    }
    if( !.is_a_bool(check.assays) ){
        stop("'check.assays' must be TRUE or FALSE.", call. = FALSE)
    }
    #
    if(is.character(f) && length(f)==1 && f %in% colnames(rowData(x))){
        f <- factor(as.character(rowData(x)[, f]))
    }
    else if(is.character(f) && length(f)==1 && f %in% colnames(colData(x))){
        f <- factor(as.character(colData(x)[, f]))
    } else 
    {
        f <- .norm_f(nrow(x), f)  
    }
    if(length(levels(f)) == nrow(x)){
        return(x)
    }
    
    archetype <- .norm_archetype(f, archetype)
    # merge assays
    assays <- assays(x)
    if( check.assays ){
        mapply(.check_assays_for_merge, names(assays), assays)
    }
    assays <- S4Vectors::SimpleList(lapply(assays,
                                            scuttle::sumCountsAcrossFeatures,
                                            ids = f, 
                                            subset.row = NULL, 
                                            subset.col = NULL,
                                            average = average,
                                            BPPARAM = BPPARAM))
    names(assays) <- names(assays(x))
    # merge to result
    x <- x[.get_element_pos(f, archetype = archetype),]
    assays(x, withDimnames = FALSE) <- assays
    # Change rownames to group names 
    rownames(x) <- rownames(assays[[1]])
    x
}

#' @importFrom scuttle sumCountsAcrossFeatures
.check_assays_for_merge <- function(assay.type, assay){
    # Check if assays include binary or negative values
    if( all(assay == 0 | assay == 1) ){
        warning("'",assay.type,"'", " includes binary values.",
                "\nAgglomeration of it might lead to meaningless values.", 
                "\nCheck the assay, and consider doing transformation again",
                "manually with agglomerated data.",
                call. = FALSE)
    }
    if( !all( assay >= 0 | is.na(assay) ) ){
        warning("'",assay.type,"'", " includes negative values.",
                "\nAgglomeration of it might lead to meaningless values.",
                "\nCheck the assay, and consider doing transformation again",
                "manually with agglomerated data.",
                call. = FALSE)
    }
}

#' @importFrom S4Vectors SimpleList
#' @importFrom scuttle summarizeAssayByGroup
.merge_cols <- function(x, f, archetype = 1L, ...){
    # input check
    if(is.character(f) && length(f)==1 && f %in% colnames(rowData(x))){
        f <- factor(as.character(rowData(x)[, f]))
    }
    else if(is.character(f) && length(f)==1 && f %in% colnames(colData(x))){
        f <- factor(as.character(colData(x)[, f]))
    } else 
    {
        f <- .norm_f(ncol(x), f, "columns")  
    }
    if(length(levels(f)) == ncol(x)){
        return(x)
    }
    archetype <- .norm_archetype(f, archetype)
    # merge col data
    element_pos <- .get_element_pos(f, archetype = archetype)
    col_data <- colData(x)[element_pos,,drop=FALSE]
    # merge assays
    assays <- assays(x)
    mapply(.check_assays_for_merge, names(assays), assays)
    FUN <- function(mat, ...){
        temp <- scuttle::summarizeAssayByGroup(mat,
                                                statistics = "sum",
                                                ...)
        # "sum" includes agglomerated (summed up) data
        mat <- assay(temp, "sum")
        return(mat)
    }
    assays <- S4Vectors::SimpleList(lapply(assays,
                                            FUN = FUN,
                                            ids = f, 
                                            subset.row = NULL, 
                                            subset.col = NULL,
                                            ...))
    names(assays) <- names(assays(x))
    # merge to result
    x <- x[,.get_element_pos(f, archetype = archetype)]
    assays(x, withDimnames = FALSE) <- assays
    # Change colnames to group names 
    colnames(x) <- colnames(assays[[1]])
    x
}

.merge_rows_SE <- function(x, f, archetype = 1L, ...){
    .merge_rows(x, f, archetype = archetype, ...)
}

.merge_cols_SE <- function(x, f, archetype = 1L, ...){
    .merge_cols(x, f, archetype = archetype, ...)
}

.merge_tree <- function(tree, links){
    tips <- sort(setdiff(tree$edge[, 2], tree$edge[, 1]))
    drop_tip <- tips[!(tips %in% unique(links$nodeNum[links$isLeaf]))]
    oldTree <- tree
    newTree <- ape::drop.tip(oldTree, tip = drop_tip)
    track <- trackNode(oldTree)
    track <- ape::drop.tip(track, tip = drop_tip)
    #
    oldAlias <- links$nodeLab_alias
    newNode <- convertNode(tree = track, node = oldAlias)
    newAlias <- convertNode(tree = newTree, node = newNode)
    #
    list(newTree = newTree, newAlias = newAlias)
}

# Merge trees, MARGIN specifies if trees are rowTrees or colTrees
.merge_trees <- function(x, mergeTree, MARGIN){
    # Get rowtrees or colTrees based on MARGIN
    if( MARGIN == 1 ){
        trees <- x@rowTree
        links <- rowLinks(x)
    } else{
        trees <- x@colTree
        links <- colLinks(x)
    }
    # If trees exist and mergeTree is TRUE
    if(!is.null(trees) && mergeTree){
        # Loop over trees and replace them one by one
        for( i in seq_len(length(trees)) ){
            # Get tree
            tree <- trees[[i]]
            # Get the name of the tree
            tree_name <- names(trees)[[i]]
            # Subset links by taking only those rows that are included in tree
            links_sub <- links[ links$whichTree == tree_name, , drop = FALSE ]
            # Merge tree
            tmp <- .merge_tree(tree, links_sub)
            # Based on MARGIN, replace ith rowTree or colTree
            if( MARGIN == 1 ){
                x <- changeTree(x = x,
                                rowTree = tmp$newTree,
                                rowNodeLab = tmp$newAlias,
                                whichRowTree = i
                )
            } else{
                x <- changeTree(x = x,
                                colTree = tmp$newTree,
                                colNodeLab = tmp$newAlias,
                                whichColTree = i
                )
            }
        }
    }
    return(x)
}

#' @importFrom Biostrings DNAStringSetList
.merge_refseq_list <- function(sequences_list, f, names, ...){
    threshold <- list(...)[["threshold"]]
    if(is.null(threshold)){
        threshold <- 0.05
    }
    if(!is(sequences_list,"DNAStringSetList")){
        return(.merge_refseq(sequences_list, f, names, threshold))
    }
    names <- names(sequences_list)
    seqs <- DNAStringSetList(lapply(sequences_list, .merge_refseq, f, names,
                                    threshold))
    names(seqs) <- names
    seqs
}

#' @importFrom Biostrings DNAStringSetList
#' @importFrom DECIPHER ConsensusSequence
.merge_refseq <- function(sequences, f, names, threshold){
    sequences <- split(sequences,f)
    seq <- unlist(DNAStringSetList(lapply(sequences, ConsensusSequence,
                                            threshold = threshold)))
    seq
}

.merge_rows_TSE <- function(x, f, archetype = 1L, mergeTree = FALSE, 
                            mergeRefSeq = FALSE, ...){
    # input check
    if(!.is_a_bool(mergeTree)){
        stop("'mergeTree' must be TRUE or FALSE.", call. = FALSE)
    }
    if(!.is_a_bool(mergeRefSeq)){
        stop("'mergeRefSeq' must be TRUE or FALSE.", call. = FALSE)
    }
    # for optionally merging referenceSeq
    refSeq <- NULL
    if(mergeRefSeq){
        refSeq <- referenceSeq(x)
    }
    #
    x <- .merge_rows_SE(x, f, archetype = 1L, ...)
    # optionally merge rowTree
    x <- .merge_trees(x, mergeTree, 1)
    # optionally merge referenceSeq
    if(!is.null(refSeq)){
        referenceSeq(x) <- .merge_refseq_list(refSeq, f, rownames(x), ...)
    }
    x
}



.merge_cols_TSE <- function(x, f, archetype = 1L, mergeTree = FALSE, ...){
    # input check
    if(!.is_a_bool(mergeTree)){
        stop("'mergeTree' must be TRUE or FALSE.", call. = FALSE)
    }
    #
    x <- .merge_cols_SE(x, f, archetype = 1L, ...)
    # optionally merge colTree
    x <- .merge_trees(x, mergeTree, 2)
    return(x)
}