R/allClass.R

Defines functions .linkFun TreeSummarizedExperiment LinkDataFrame

Documented in LinkDataFrame TreeSummarizedExperiment

# ======================
### class
# ======================

#-------------------------------------------------------------------------------
#' phylo: A S3 class for the phylogenetic tree
#-------------------------------------------------------------------------------
#'
#' The \pkg{ape} package does not export its phylo class, probably because it
#' is not really defined formally anywhere. Technically, it is an S3 class
#' extended from the class list. Any exported definitions from the \pkg{ape}
#' package would be preferred to use if available.
#' @importFrom methods setOldClass
#' @keywords internal
phylo <- structure(list(), class = "phylo")
setOldClass("phylo")

setClassUnion("list_Or_NULL", c("list", "NULL"))
#-------------------------------------------------------------------------------
#' LinkDataFrame: A S4 class extended from DataFrame
#-------------------------------------------------------------------------------
#' An S4 class LinkDataFrame
#'
#' The \strong{LinkDataFrame} is extended from the class \strong{DataFrame} to
#' include at least four columns \code{nodeLab}, \code{nodeLab_alias},
#' \code{nodeNum}, and \code{isLeaf}.
#'
#' @importClassesFrom S4Vectors DataFrame
#' @exportClass LinkDataFrame
#'
#' @section Constructor:
#' See \code{\link{LinkDataFrame-constructor}} for constructor
#' functions.
#'
setClass("LinkDataFrame",
         contains = "DFrame")

setClassUnion("LinkDataFrame_Or_NULL", c("LinkDataFrame", "NULL"))
#-------------------------------------------------------------------------------
#' Construct a LinkDataFrame
#-------------------------------------------------------------------------------
#' Construct a LinkDataFrame object
#' @param nodeLab A character vector
#' @param nodeLab_alias A character vector
#' @param nodeNum A numeric vector
#' @param isLeaf A logical vector
#' @param whichTree A character vector
#' @param ... All arguments accepted by
#'   \code{\link[S4Vectors:DataFrame-class]{DataFrame-class}}.
#'
#' @importFrom S4Vectors DataFrame
#' @name LinkDataFrame-constructor
#' @export
#' @return A LinkDataFrame object
#' @seealso \code{\link[=LinkDataFrame-class]{LinkDataFrame}}
#'   \code{\link[S4Vectors:DataFrame-class]{DataFrame}}
#' @examples
#'
#'
#' (ld <- LinkDataFrame(nodeLab = letters[1:5],
#'                      nodeLab_alias = LETTERS[1:5],
#'                      nodeNum = 1:5,
#'                      isLeaf = TRUE,
#'                      whichTree = LETTERS[1:5],
#'                      right = 1:5))
LinkDataFrame <- function(nodeLab, nodeLab_alias, nodeNum,
                          isLeaf, whichTree, ...) {
    df <- DataFrame(nodeLab, nodeLab_alias, nodeNum,
                    isLeaf, whichTree, ...)

    new("LinkDataFrame", df)
}

#-------------------------------------------------------------------------------
# TreeSummarizedExperiment
#-------------------------------------------------------------------------------
#' An S4 class TreeSummarizedExperiment
#'
#' The class \strong{TreeSummarizedExperiment} is an extension class of standard
#' \code{\link[SingleCellExperiment:SingleCellExperiment]{SingleCellExperiment}} class. It has
#' four more slots that are not in
#' \code{\link[SingleCellExperiment:SingleCellExperiment]{SingleCellExperiment}} class:
#' \code{rowTree}, \code{rowLinks} \code{colTree} and \code{colLinks}. The
#' hierarchical information of rows (columns) is stored in \code{rowTree}
#' (\code{colTree}) and the link between the rows (columns) of \code{assays}
#' tables and nodes of the tree is given in \code{rowLinks} (\code{colLinks}).
#'
#' @slot rowTree A phylo object or NULL. It gives information about the
#'   hiearchical structure of rows of \code{assays} tables.
#' @slot colTree A phylo object or NULL. It gives information about the
#'   hiearchical structure of columns of \code{assays} tables.
#' @slot rowLinks A LinkDataFrame. It gives information about the link between
#'   the nodes of the \code{rowTree} and the rows of \code{assays} tables.
#' @slot colLinks A LinkDataFrame. It gives information about the link between
#'   the nodes of the \code{colTree} and the columns of \code{assays} tables.
#' @slot referenceSeq A \code{DNAStringSet}/\code{DNAStringSetList} object or
#'   some object coercible to a \code{DNAStringSet}/\code{DNAStringSetList}
#'   object. See \code{\link[Biostrings:XStringSet-class]{DNAStringSet}} for
#'   more details.
#' @slot ... Other slots from
#'   \code{\link[SingleCellExperiment:SingleCellExperiment]{SingleCellExperiment}}
#'
#' @section Constructor:
#' See \code{\link{TreeSummarizedExperiment-constructor}} for constructor
#' functions.
#'
#' @section Accessor:
#' See \code{\link{TreeSummarizedExperiment-accessor}} for accessor functions.
#'
#' @details The class \strong{TreeSummarizedExperiment} is designed to store
#'   rectangular data for entities (e.g., microbes or cell types)
#'   (\code{assays}), information about the hiearchical structure
#'   (\code{rowTree} on rows; \code{colTree} on columns), and the mapping
#'   information between the tree nodes and the rows or the columns of the
#'   rectangular data. Users could provide the hiearchical structure of the
#'   rows, columns or both) of the \code{assays} tables, and the link data will
#'   be automatically generated in \code{rowLinks}, \code{colData} or both,
#'   respectively. It's required that the object in \code{rowLinks} or
#'   \code{colLinks} has the \code{LinkDataFrame} class. Please see the page
#'   \code{\link{LinkDataFrame}} for more details.
#'
#' @importFrom methods setClass
#' @importClassesFrom SingleCellExperiment SingleCellExperiment
#' @importClassesFrom S4Vectors DataFrame
#' @name TreeSummarizedExperiment-class
#' @seealso \code{\link{TreeSummarizedExperiment}}
#'   \code{\link{TreeSummarizedExperiment-accessor}}
#'   \code{\link[SingleCellExperiment:SingleCellExperiment]{SingleCellExperiment}}
NULL

#' @importFrom Biostrings DNAStringSet DNAStringSetList
setClassUnion("DNAStringSetList_OR_DNAStringSet_OR_NULL",
              c("DNAStringSetList", "DNAStringSet","NULL"))

#' @name TreeSummarizedExperiment-class
#' @exportClass TreeSummarizedExperiment
setClass("TreeSummarizedExperiment",
         contains = "SingleCellExperiment",
         slots = c(rowTree = "list_Or_NULL",
                   colTree = "list_Or_NULL",
                   rowLinks = "LinkDataFrame_Or_NULL",
                   colLinks = "LinkDataFrame_Or_NULL",
                   referenceSeq = "DNAStringSetList_OR_DNAStringSet_OR_NULL"),
         prototype = list(referenceSeq = NULL))

#' @rdname TreeSummarizedExperiment-internal
#' @export
setMethod("vertical_slot_names", "TreeSummarizedExperiment",
          function(x) c("referenceSeq", callNextMethod())
)



# ==========================================================================
### Constructor
# ==========================================================================

#' Construct a TreeSummarizedExperiment object
#'
#' \code{TreeSummarizedExperiment} constructs a TreeSummarizedExperiment object.
#'
#' @inheritParams SingleCellExperiment::SingleCellExperiment
#' @param rowTree A phylo object that provides hiearchical information of rows
#'   of assay tables.
#' @param colTree A phylo object that provides hiearchical information of
#'   columns of assay tables.
#' @param rowNodeLab A character string. It provides the labels of nodes that
#'   the rows of \code{assays} tables corresponding to. If NULL (default), the
#'   row names of the \code{assays} tables are used.
#' @param colNodeLab A character string. It provides the labels of nodes that
#'   the columns of \code{assays} tables corresponding to. If NULL (default),
#'   the column names of the \code{assays} tables are used.
#' @param referenceSeq A \code{DNAStringSet}/\code{DNAStringSetList} object or
#'   some object coercible to a \code{DNAStringSet}/\code{DNAStringSetList}
#'   object. See \code{\link[Biostrings:XStringSet-class]{DNAStringSet}} for
#'   more details.
#'
#' @details The output TreeSummarizedExperiment object has very similar
#'   structure as the
#'   \code{\link[SingleCellExperiment:SingleCellExperiment]{SingleCellExperiment}}. The
#'   differences are summarized be as below.
#'   \itemize{
#'   \item \strong{rowTree} A slot exists in \code{TreeSummarizedExperiment}
#'   but not in \code{SingleCellExperiment}. It stores the tree structure(s)
#'   that provide(s) hierarchical information of \code{assays} rows or columns
#'   or both.
#'   \item \strong{rowData} If a \code{phylo} object is available in the slot
#'   \code{treeData} to provide the hiearchical information about the rows of
#'   the \code{assays} table, the \code{rowData} would be a
#'   \code{\link{LinkDataFrame-class}} instead of
#'   \code{\link[S4Vectors:DataFrame-class]{DataFrame}}. The data on the right side of the
#'   vertical line provides the link information between the \code{assays} rows
#'   and the tree \code{phylo} object, and could be accessed via
#'   \code{linkData}; The data on the left side is the original \code{rowData}
#'   like \code{SingleCellExperiment} object.
#'   \item \strong{colData} Similar to the explanaition for \strong{rowData} as
#'   above.
#'  }
#'  More details about the \code{LinkDataFrame} in the \code{rowData} or
#'  \code{colData}.
#'  \itemize{
#'  \item nodeLab The labels of nodes on the tree.
#'  \item nodeLab\_alias The alias of node labels on the tree.
#'  \item nodeNum The numbers of nodes on the tree.
#'  \item isLeaf It indicates whether the node is a leaf node or internal node.
#'  }
#'
#' @importFrom SummarizedExperiment assays colData<- rowData<-
#' @importFrom S4Vectors DataFrame
#' @importFrom methods new is as
#' @export
# #' @include classValid.R
#' @return a TreeSummarizedExperiment object
#' @name TreeSummarizedExperiment-constructor
#' @author Ruizhu HUANG
#' @seealso \code{\link[=TreeSummarizedExperiment-class]{TreeSummarizedExperiment}}
#'   \code{\link[=TreeSummarizedExperiment-accessor]{TreeSummarizedExperiment-accessor}}
#'   \code{\link[SingleCellExperiment:SingleCellExperiment]{SingleCellExperiment}}
#' @examples
#'
#' data("tinyTree")
#'
#' # the count table
#' count <- matrix(rpois(100, 50), nrow = 10)
#' rownames(count) <- c(tinyTree$tip.label)
#' colnames(count) <- paste("C_", 1:10, sep = "_")
#'
#' # The sample information
#' sampC <- data.frame(condition = rep(c("control", "trt"), each = 5),
#'                     gender = sample(x = 1:2, size = 10, replace = TRUE))
#' rownames(sampC) <- colnames(count)
#'
#' # build a TreeSummarizedExperiment object
#' tse <- TreeSummarizedExperiment(assays = list(count),
#'                                 colData = sampC,
#'                                 rowTree = tinyTree)
#'
TreeSummarizedExperiment <- function(..., rowTree = NULL, colTree = NULL,
                                     rowNodeLab = NULL, colNodeLab = NULL,
                                     referenceSeq = NULL) {

    if (!is.null(rowNodeLab)) {
        if (!is.character(rowNodeLab)) {
            stop("rowNodeLab should be a character vector", call. = FALSE)
        }
        if (is.null(rowTree)) {
            stop("rowTree is not available", call. = FALSE)
        }
    }

    if (!is.null(colNodeLab)) {
        if (!is.character(colNodeLab)) {
            stop("colNodeLab should be a character vector", call. = FALSE)
        }
        if (is.null(colTree)) {
            stop("colTree is not available", call. = FALSE)
        }
    }


    # -------------------------------------------------------------------------
    ## create a SummarizedExperiment object
    sce <- SingleCellExperiment(...)

    # -------------------------------------------------------------------------
    ## Indicate whether hiearchical information is available for rows and
    ## columns of assay tables
    isRow <- !is.null(rowTree)
    isCol <- !is.null(colTree)

    # # if not tree information are given, return a SingleCellExperiment
    # if(!isRow && !isCol){
    #     return(sce)
    # }

    # -------------------------------------------------------------------------
    ## check whether the input tree has the correct form
    if (isRow) {
        # the tree should be a phylo object
        if(!is(rowTree, "phylo")) {
            stop("A phylo object is required for the rowTree", "\n",
                 call. = FALSE)
        }

        # the tree should have unique leaf labels
        tipLab <- rowTree$tip.label
        anyDp <- any(duplicated(tipLab))
        if (anyDp) {
            stop("rowTree should have unique leaf labels. \n",
                 call. = FALSE)
        }
    }

    if (isCol) {
        # the tree should be a phylo object
        if(!is(colTree, "phylo")) {
            stop("A phylo object is required for the colTree", "\n",
                 call. = FALSE)
        }

        # the tree should have unique leaf labels
        tipLab <- colTree$tip.label
        anyDp <- any(duplicated(tipLab))
        if (anyDp) {
            stop("colTree should have unique leaf labels. \n",
                 call. = FALSE)
        }
    }

    # -------------------------------------------------------------------------
    ## create the link data
    tse <- new("TreeSummarizedExperiment",
               sce,
               referenceSeq = referenceSeq)

    # the rows:
    if (isRow) {
        rowData(tse) <- rowData(sce)
        out <- .linkFun(tree = rowTree, sce = sce,
                                 nodeLab = rowNodeLab, onRow = TRUE)
        tse <- tse[out$isKeep, ]
        tse@rowTree <- list(phylo = rowTree)
        tse@rowLinks <- out$link

    } else {
        tse@rowLinks <- NULL
    }


    # the columns:
    if (isCol) {
        colData(tse) <- colData(sce)
        out <- .linkFun(tree = colTree, sce = sce,
                        nodeLab = colNodeLab, onRow = FALSE)
        tse <- tse[, out$isKeep]
        tse@colTree <- list(phylo = colTree)
        tse@colLinks <- out$link

    } else {
        tse@colLinks <- NULL
    }


    return(tse)
}

# An internal function to create the link data 
.linkFun <- function(tree, sce, nodeLab, onRow = TRUE) {
    if (onRow) {
        annDat <- rowData(sce)
    } else {
        annDat <- colData(sce)
    }

    kw <- ifelse(onRow, "row", "column")

    # ------------------ labels from the tree ---------------------------------
    # the labels and the alias of the labels
    treeLab <- c(tree$tip.label, tree$node.label)
    nodeA <- unique(as.vector(tree$edge))
    treeLab_alias <- convertNode(tree = tree, node = nodeA,
                               use.alias = TRUE, message = FALSE)

    # ------------------ labels from the assays table -------------------------
    # The order to be used:nodeLab > the row names
    lab <- nodeLab

    # if nodeLab is not available, use the row names
    if (is.null(lab)) {
        lab <- rownames(annDat)
        if (is.null(lab)) {
            stop(kw, "NodeLab should be provided. \n", call. = FALSE)}
        }

    # decide whether treeLab or treeLab_alias should be used
    sw <- startsWith(lab, "alias_")
    sw <- all(sw)

    sw2 <- startsWith(treeLab, "alias_")
    notALL <- !all(sw2)
    if (any(sw2, na.rm = TRUE) & notALL) {
        warning("The tree has some node labels starting with 'alias_'. ",
                call. = FALSE)
    }

    # Match lab with the alias of the node labels on the tree
    # Unless all node labels start with 'alias_'
    if (sw & notALL) {
        isIn <- lab %in% treeLab_alias
    } else {
        isIn <- lab %in% treeLab
    }

    # Those with labels that don't match with the node labels of the tree are
    # excluded
    isOut <- !isIn
    if (sum(isOut) > 0) {
        warning(sum(isOut), " ", kw,
                "(s) couldn't be matched to the tree and are/is removed. \n",
                call. = FALSE)}

    if (onRow) {
        sce <- sce[isIn, ]
        rn <- rownames(sce)
    } else {
        sce <- sce[, isIn]
        rn <- colnames(sce)
    }


    # create the link data
    labN <- lab[isIn]
    nd <- convertNode(tree = tree, node = labN, use.alias = FALSE,
                    message = FALSE)
    fLab <- convertNode(tree = tree, node = nd, use.alias = FALSE,
                      message = FALSE)
    faLab <- convertNode(tree = tree, node = nd, use.alias = TRUE,
                       message = FALSE)
    leaf <- unique(setdiff(tree$edge[, 2], tree$edge[, 1]))

    if (sum(isIn)) {
        ind_tree <- "phylo"
    } else {
        ind_tree <- character(0)
    }
    linkD <- LinkDataFrame(nodeLab = fLab,
                           nodeLab_alias = faLab,
                           nodeNum = nd,
                           isLeaf = nd %in% leaf,
                           whichTree = ind_tree)

    rownames(linkD) <- rn

    out <- list(link = linkD, isKeep = isIn)
    return(out)
}
fionarhuang/TreeSummarizedExperiment documentation built on Aug. 18, 2021, 12:15 p.m.