benchdamic: Benchmark of differential abundance methods on microbiome data

Documented in createColors extractDA extractStatistics get_counts_metadata getDA getPositives getStatistics iterative_ordering

#' @title getStatistics
#'
#' @export
#' @description
#' Extract the list of p-values or/and log fold changes from the output of a
#' differential abundance detection method.
#'
#' @param method Output of a differential abundance detection method.
#' \code{pValMat}, \code{statInfo} matrices, and method's \code{name} must be
#' present (See vignette for detailed information).
#' @param slot The slot name where to extract values
#' (default \code{slot = "pValMat"}).
#' @param colName The column name of the slot where to extract values
#' (default \code{colName = "rawP"}).
#' @param type The value type of the column selected where to extract values.
#' Two values are possible: \code{"pvalue"} or \code{"logfc"}
#' (default \code{type = "pvalue"}).
#' @param direction \code{statInfo}'s column name containing information about
#' the signs of differential abundance (usually log fold changes)
#' (default \code{direction = NULL}).
#' @param verbose Boolean to display the kind of extracted values
#' (default \code{verbose = FALSE}).
#'
#' @return A vector or a \code{data.frame}. If \code{direction = NULL},
#' the \code{colname} column values, transformed according to \code{type} (not
#' tranformed if \code{type = "pvalue"}, \code{-abs(value)} if
#' \code{type = "logfc"}), of the \code{slot} are reported, otherwise the
#' \code{direction} column of the \code{statInfo} matrix is added to the output.
#'
#' @seealso \code{\link{extractStatistics}}
#'
#' @examples
#' data("ps_plaque_16S")
#' # Add scaling factors
#' ps_plaque_16S <- norm_edgeR(object = ps_plaque_16S, method = "TMM")
#' # DA analysis
#' da.limma <- DA_limma(
#'     object = ps_plaque_16S,
#'     design = ~ 1 + HMP_BODY_SUBSITE,
#'     coef = 2,
#'     norm = "TMM"
#' )
#' # get p-values
#' getStatistics(
#'     method = da.limma, slot = "pValMat", colName = "rawP",
#'     type = "pvalue", direction = NULL
#' )
#' # get negative abs(logFC) values
#' getStatistics(
#'     method = da.limma, slot = "statInfo", colName = "logFC",
#'     type = "logfc", direction = NULL
#' )
#' # get p-values and logFC
#' getStatistics(
#'     method = da.limma, slot = "pValMat", colName = "rawP",
#'     type = "pvalue", direction = "logFC"
#' )
getStatistics <- function(method, slot = "pValMat", colName = "rawP",
                          type = "pvalue", direction = NULL, verbose = FALSE) {
    # Info extraction
    method_name <- method[["name"]]
    if (!is.element(slot, names(method))) {
        stop("'", slot, "' slot not found for ", method_name)
    } else {
        info <- method[[slot]]
    }
    if (!is.element(colName, colnames(info))) {
        stop("'", colName, "' column not found in '", slot, "' slot for ",
            method_name)
    } else {
        info_col <- info[, colName]
    }
    names(info_col) <- rownames(info)
    msg <- paste0("\nMethod: ", method_name)
    # Output vector
    if (type == "pvalue") {
        msg <- paste0(msg, "\n * '", colName, "'")
        out <- info_col # [!is.na(info_col) & info_col < 1]
    } else if (type == "logfc") {
        msg <- paste0(msg, "\n * -|", colName, "|")
        out <- -abs(info_col) # [!is.na(info_col)])
    } else {
        stop("Please choose between type: pvalue or logfc.")
    }
    msg <- paste0(
        msg, " column, as ", type, " type, of ", slot,
        " matrix."
    )
    # Check if direction is not null
    if (!is.null(direction)) {
        msg <- paste0(
            msg, "\n * '", direction,
            "' column, as direction, of statInfo matrix."
        )
        # Extract statInfo
        statInfo <- method[["statInfo"]]
        if (!is.element(direction, colnames(statInfo))) {
            stop(direction, " column not found for ", method_name)
        } else {
            out <- data.frame(out, statInfo[, direction])
            colnames(out) <- c(colName, direction)
        }
    }
    if (verbose) {
        message(msg, appendLF = FALSE)
    }
    return(out)
}

#' @title extractStatistics
#'
#' @export
#' @description
#' Extract the list of p-values or/and log fold changes from the outputs of the
#' differential abundance detection methods.
#'
#' @param object Output of differential abundance detection methods.
#' \code{pValMat}, \code{statInfo} matrices, and method's \code{name} must be
#' present (See vignette for detailed information).
#' @param slot A character vector with 1 or number-of-methods-times repeats of
#' the slot names where to extract values for each method
#' (default \code{slot = "pValMat"}).
#' @param colName A character vector with 1 or number-of-methods-times repeats
#' of the column name of the slot where to extract values for each method
#' (default \code{colName = "rawP"}).
#' @param type A character vector with 1 or number-of-methods-times repeats
#' of the value type of the column selected where to extract values for each
#' method. Two values are possible: \code{"pvalue"} or \code{"logfc"}
#' (default \code{type = "pvalue"}).
#' @param direction A character vector with 1 or number-of-methods-times repeats
#' of the \code{statInfo}'s column name containing information about the signs
#' of differential abundance (usually log fold changes) for each method
#' (default \code{direction = NULL}).
#' @param verbose Boolean to display the kind of extracted values
#' (default \code{verbose = FALSE}).
#'
#' @return A vector or a \code{data.frame} for each method. If
#' \code{direction = NULL}, the \code{colname} column values, transformed
#' according to \code{type} (not tranformed if \code{type = "pvalue"},
#' \code{-abs(value)} if \code{type = "logfc"}), of the \code{slot} are reported
#' , otherwise the \code{direction} column of the \code{statInfo} matrix is
#' added to the output.
#'
#' @seealso \code{\link{getStatistics}}
#'
#' @examples
#' data("ps_plaque_16S")
#' # Add scaling factors
#' my_norm <- setNormalizations(fun = c("norm_edgeR", "norm_CSS"),
#'     method = c("TMM", "CSS"))
#' ps_plaque_16S <- runNormalizations(normalization_list = my_norm,
#'     object = ps_plaque_16S)
#' # Perform DA analysis
#' my_methods <- set_limma(design = ~ 1 + HMP_BODY_SUBSITE, coef = 2,
#'     norm = c("TMM", "CSS"))
#' Plaque_16S_DA <- runDA(method_list = my_methods, object = ps_plaque_16S)
#' ### Extract statistics for concordance analysis:
#' # Only p-values
#' extracted_pvalues <- extractStatistics(
#'     object = Plaque_16S_DA, slot =
#'         "pValMat", colName = "rawP", type = "pvalue"
#' )
#' # Only transformed log fold changes -abs(logFC)
#' extracted_abslfc <- extractStatistics(
#'     object = Plaque_16S_DA, slot =
#'         "statInfo", colName = "logFC", type = "logfc"
#' )
#' # Only transformed log fold changes for a method and p-values for the other
#' extracted_abslfc_pvalues <- extractStatistics(
#'     object = Plaque_16S_DA,
#'     slot = c("statInfo", "pValMat"), colName = c("logFC", "rawP"), type =
#'         c("logfc", "pvalue")
#' )
#' ### Extract statistics for enrichment analysis:
#' # p-values and log fold changes
#' extracted_pvalues_and_lfc <- extractStatistics(
#'     object = Plaque_16S_DA,
#'     slot = "pValMat", colName = "rawP", type = "pvalue", direction = "logFC"
#' )
#' # transformed log fold changes and untouched log fold changes
#' extracted_abslfc_and_lfc <- extractStatistics(
#'     object = Plaque_16S_DA,
#'     slot = "statInfo", colName = "logFC", type = "logfc", direction =
#'         "logFC"
#' )
#' # Only transformed log fold changes for a method and p-values for the other
#' extracted_mix <- extractStatistics(
#'     object = Plaque_16S_DA,
#'     slot = c("statInfo", "pValMat"), colName = c("logFC", "rawP"), type =
#'         c("logfc", "pvalue"), direction = "logFC"
#' )
extractStatistics <- function(object, slot = "pValMat", colName = "rawP",
    type = "pvalue", direction = NULL, verbose = FALSE) {
    n_methods <- length(object)
    # Check the dimension of slot, colName, and type.
    if (length(slot) == 1) {
        slot <- rep(slot, n_methods)
    }
    if (length(colName) == 1) {
        colName <- rep(colName, n_methods)
    }
    if (length(type) == 1) {
        type <- rep(type, n_methods)
    }
    # Error if they have unequal lengths
    if (length(slot) != n_methods | length(colName) != n_methods |
        length(type) != n_methods) {
        stop("Unequal lengths for slot, colName, or type arguments.")
    }
    # Check if direction is defined
    if (!is.null(direction)) {
        if (length(direction) == 1) {
            direction <- rep(direction, n_methods)
        }
        if (length(direction) != n_methods) {
            stop("Wrong length for direction argument.")
        }
    }
    # Rename method names
    names(object) <- unlist(lapply(object, function(method) method[["name"]]))
    if (is.null(direction)) {
        out <- mapply(getStatistics,
            method = object, slot = slot,
            colName = colName, type = type, MoreArgs = list(
                direction = NULL,
                verbose = verbose
            ), SIMPLIFY = FALSE
        )
    } else {
        out <- mapply(getStatistics,
            method = object, slot = slot,
            colName = colName, type = type, direction = direction, MoreArgs =
                list(verbose = verbose), SIMPLIFY = FALSE
        )
    }
    return(out)
}

#' @title getDA
#'
#' @export
#' @description
#' Inspect the list of p-values or/and log fold changes from the output of a
#' differential abundance detection method.
#'
#' @inheritParams getStatistics
#' @param threshold_pvalue Threshold value for p-values. If present, features
#' with p-values lower than \code{threshold_pvalue} are considered
#' differentially abundant. Set \code{threshold_pvalue = 1} to not filter by
#' p-values.
#' @param threshold_logfc Threshold value for log fold changes. If present,
#' features with log fold change absolute values higher than
#' \code{threshold_logfc} are considered differentially abundant. Set
#' \code{threshold_logfc = 0} to not filter by log fold change values.
#' @param top If not null, the \code{top} number of features, ordered by
#' p-values or log fold change values, are considered as differentially
#' abundant (default \code{top = NULL}).
#' @param verbose Boolean to display the kind of extracted values
#' (default \code{verbose = FALSE}).
#'
#' @return A \code{data.frame} with several columns:
#' \itemize{
#'     \item{\code{stat}}{ which contains the p-values or the absolute log fold
#'     change values;}
#'     \item{\code{direction}}{ which is present if \code{method} was a
#'     \code{data.frame} object, it contains the information about
#'     directionality of differential abundance (usually log fold changes);}
#'     \item{\code{DA}}{ which can contain several values according to
#'     thresholds and inputs. \code{"DA"} or \code{"non-DA"} if \code{method}
#'     object was a vector, \code{"UP Abundant"}, \code{"DOWN Abundant"}, or
#'     \code{"non-DA"} if \code{method} was a \code{data.frame}.}}
#'
#' @seealso \code{\link{getStatistics}}, \code{\link{extractDA}}
#'
#' @examples
#' data("ps_plaque_16S")
#' # Add scaling factors
#' ps_plaque_16S <- norm_edgeR(object = ps_plaque_16S, method = "TMM")
#' # DA analysis
#' da.limma <- DA_limma(
#'     object = ps_plaque_16S,
#'     design = ~ 1 + HMP_BODY_SUBSITE,
#'     coef = 2,
#'     norm = "TMM"
#' )
#' # features with p-value < 0.1 as DA
#' getDA(
#'     method = da.limma, slot = "pValMat", colName = "rawP", type = "pvalue",
#'     direction = NULL, threshold_pvalue = 0.1, threshold_logfc = 0,
#'     top = NULL
#' )
#' # top 10 feature with highest logFC are DA
#' getDA(
#'     method = da.limma, slot = "pValMat", colName = "rawP", type = "pvalue",
#'     direction = "logFC", threshold_pvalue = 1, threshold_logfc = 0, top = 10
#' )
#' # features with p-value < 0.1 and |logFC| > 1 are DA
#' getDA(
#'     method = da.limma, slot = "pValMat", colName = "rawP", type = "pvalue",
#'     direction = "logFC", threshold_pvalue = 0.1, threshold_logfc = 1, top =
#'         NULL
#' )
#' # top 10 features with |logFC| > 1 are DA
#' getDA(
#'     method = da.limma, slot = "pValMat", colName = "rawP", type = "pvalue",
#'     direction = "logFC", threshold_pvalue = 1, threshold_logfc = 1, top = 10
#' )
getDA <- function(method, slot = "pValMat", colName = "rawP", type = "pvalue",
    direction = NULL, threshold_pvalue = 1, threshold_logfc = 0, top = NULL,
    verbose = FALSE) {
    out <- data.frame(getStatistics(
        method = method, slot = slot, colName =
            colName, type = type, direction = direction, verbose = verbose
    ))
    out[, "DA"] <- "non-DA"
    msg <- "\n * DA: "
    if (is.null(top)) {
        top <- nrow(out)
    } else {
        msg <- paste0(msg, "top ", top, " features")
    }
    # Only one between the p-values or -|lfc| were supplied
    if (is.null(direction)) {
        colnames(out) <- c("stat", "DA")
        if (type == "pvalue") { # p-value <= threshold_pvalue are DA
            msg <- paste0(msg, ", ", colName, "<=", threshold_pvalue, "\n")
            index_DA <- which(out[, "stat"] <= threshold_pvalue)
            DA_taxa <- rownames(out)[index_DA]
            out_DA <- out[DA_taxa, ]
            ordered_out <- out_DA[order(out_DA[, "stat"]), ]
        } else if (type == "logfc") { # -|lfc| <= -threshold_logfc are DA
            msg <- paste0(msg, ", |", colName, "|>=", threshold_logfc)
            index_DA <- which(out[, "stat"] <= -threshold_logfc, "\n")
            DA_taxa <- rownames(out)[index_DA]
            out_DA <- out[DA_taxa, ]
            ordered_out <- out_DA[order(-abs(out_DA[, "stat"])), ]
        } else {
            stop("type should be one between 'pvalue' or 'logfc'")
        }
        DA_taxa <- rownames(ordered_out)[seq_len(min(length(DA_taxa), top))]
        if (length(index_DA) != 0) { # Check if all non-DA
            out[DA_taxa, "DA"] <- "DA"
        }
        # p-values and lfc, or -|lfc| and lfc were supplied
    } else {
        colnames(out) <- c("stat", "direction", "DA")
        if (type == "pvalue") { # Check if the first column is a p-value
            msg <- paste0(
                msg, ", ", colName, "<=", threshold_pvalue,
                ", |", direction, "|>=", threshold_logfc, "\n"
            )
            index_DA <- intersect(
                which(out[, "stat"] <= threshold_pvalue),
                which(abs(out[, "direction"]) >= threshold_logfc)
            )
            DA_taxa <- rownames(out)[index_DA]
            out_DA <- out[DA_taxa, ]
            ordered_out <- out_DA[order(out_DA[, "stat"]), ]
        } else if (type == "logfc") { # If the first column is a -|lfc|
            msg <- paste0(msg, ", |", colName, "|>=", threshold_logfc, "\n")
            index_DA <- which(out[, "stat"] <= -threshold_logfc)
            DA_taxa <- rownames(out)[index_DA]
            out_DA <- out[DA_taxa, ]
            ordered_out <- out_DA[order(-abs(out_DA[, "stat"])), ]
        } else {
            stop("type should be one between 'pvalue' or 'logfc'")
        }
        DA_taxa <- rownames(ordered_out)[seq_len(min(length(DA_taxa), top))]
        if (length(index_DA) != 0) { # Check if all non-DA
            out[DA_taxa, "DA"] <- ifelse(test = sign(out[DA_taxa, "direction"])
            == 1, yes = "UP Abundant", no = "DOWN Abundant")
        }
    }
    if (verbose) {
        message(msg, appendLF = FALSE)
    }
    return(out)
}

#' @title extractDA
#'
#' @export
#' @description
#' Inspect the list of p-values or/and log fold changes from the output of
#' differential abundance detection methods.
#'
#' @inheritParams extractStatistics
#' @param threshold_pvalue A single or a numeric vector of thresholds for
#' p-values. If present, features with p-values lower than
#' \code{threshold_pvalue} are considered differentially abundant. Set
#' \code{threshold_pvalue = 1} to not filter by p-values.
#' @param threshold_logfc A single or a numeric vector of thresholds for log
#' fold changes. If present, features with log fold change absolute values
#' higher than \code{threshold_logfc} are considered differentially abundant.
#' Set \code{threshold_logfc = 0} to not filter by log fold change values.
#' @param top If not null, the \code{top} number of features, ordered by
#' p-values or log fold change values, are considered as differentially
#' abundant (default \code{top = NULL}).
#' @param verbose Boolean to display the kind of extracted values
#' (default \code{verbose = FALSE}).
#'
#' @return A \code{data.frame} with several columns for each method:
#' \itemize{
#'     \item{\code{stat}}{ which contains the p-values or the absolute log fold
#'     change values;}
#'     \item{\code{direction}}{ which is present if \code{direction} was
#'     supplied, it contains the information about directionality of
#'     differential abundance (usually log fold changes);}
#'     \item{\code{DA}}{ which can contain several values according to
#'     thresholds and inputs. \code{"DA"} or \code{"non-DA"} if
#'     \code{direction = NULL}, \code{"UP Abundant"}, \code{"DOWN Abundant"}, or
#'     \code{"non-DA"} otherwise.}}
#'
#' @seealso \code{\link{getDA}}, \code{\link{extractStatistics}}
#'
#' @examples
#' data("ps_plaque_16S")
#' # Add scaling factors
#' my_norm <- setNormalizations(fun = c("norm_edgeR", "norm_CSS"),
#'     method = c("TMM", "CSS"))
#' ps_plaque_16S <- runNormalizations(normalization_list = my_norm,
#'     object = ps_plaque_16S)
#' # Perform DA analysis
#' my_methods <- set_limma(design = ~ 1 + HMP_BODY_SUBSITE, coef = 2,
#'     norm = c("TMM", "CSS"))
#' Plaque_16S_DA <- runDA(method_list = my_methods, object = ps_plaque_16S)
#' # Top 10 features (ordered by 'direction') are DA
#' DA_1 <- extractDA(
#'     object = Plaque_16S_DA, slot = "pValMat", colName = "adjP",
#'     type = "pvalue", direction = "logFC", threshold_pvalue = 1,
#'     threshold_logfc = 0, top = 10
#' )
#' # Features with p-value < 0.05 and |logFC| > 1 are DA
#' DA_2 <- extractDA(
#'     object = Plaque_16S_DA, slot = "pValMat", colName = "adjP",
#'     type = "pvalue", direction = "logFC", threshold_pvalue = 0.05,
#'     threshold_logfc = 1, top = NULL
#' )
extractDA <- function(object, slot = "pValMat", colName = "adjP", 
    type = "pvalue", direction = NULL, threshold_pvalue = 1, 
    threshold_logfc = 0, top = NULL, verbose = FALSE) {
    if (is.null(direction)) {
        if (is.null(top)) {
            out <- mapply(getDA,
                method = object, slot = slot,
                colName = colName, type = type,
                threshold_pvalue = threshold_pvalue,
                threshold_logfc = threshold_logfc, MoreArgs = list(
                    direction =
                        direction, top = top, verbose = verbose
                ), SIMPLIFY = FALSE
            )
        } else {
            out <- mapply(getDA,
                method = object, slot = slot,
                colName = colName, type = type,
                threshold_pvalue = threshold_pvalue,
                threshold_logfc = threshold_logfc, top = top, MoreArgs = list(
                    direction = direction, verbose = verbose
                ), SIMPLIFY = FALSE
            )
        }
    } else {
        if (is.null(top)) {
            out <- mapply(getDA,
                method = object, slot = slot,
                colName = colName, type = type, direction = direction,
                threshold_pvalue = threshold_pvalue,
                threshold_logfc = threshold_logfc, MoreArgs =
                    list(top = top, verbose = verbose), SIMPLIFY = FALSE
            )
        } else {
            out <- mapply(getDA,
                method = object, slot = slot,
                colName = colName, type = type, direction = direction,
                threshold_pvalue = threshold_pvalue,
                threshold_logfc = threshold_logfc, top = top,
                MoreArgs = list(verbose = verbose), SIMPLIFY = FALSE
            )
        }
    }
    names(out) <- unlist(lapply(object, function(method) {
        method[["name"]]
    }))
    return(out)
}

#' @title getPositives
#'
#' @export
#' @description
#' Inspect the list of p-values or/and log fold changes from the output of a
#' differential abundance detection method and count the True Positives (TP) and
#' the False Positives (FP).
#'
#' @param method Output of differential abundance detection method in which
#' DA information is extracted by the \code{getDA} function, information
#' related to enrichment is appropriately added through the \code{addKnowledge}
#' function and the Fisher exact tests is performed for the contingency tables
#' by the \code{enrichmentTests} function.
#' @inheritParams addKnowledge
#' @param TP A list of length-2 vectors. The entries in the vector are the
#' direction ("UP Abundant", "DOWN Abundant", or "non-DA") in the first
#' position, and the level of the enrichment variable (\code{enrichmentCol})
#' which is expected in that direction, in the second position.
#' @param FP A list of length-2 vectors. The entries in the vector are the
#' direction ("UP Abundant", "DOWN Abundant", or "non-DA") in the first
#' position, and the level of the enrichment variable (\code{enrichmentCol})
#' which is not expected in that direction, in the second position.
#'
#' @return A named vector containing the number of TPs and FPs.
#'
#' @seealso \code{\link{createPositives}}.
#'
#' @examples
#' data("ps_plaque_16S")
#' data("microbial_metabolism")
#' # Extract genera from the phyloseq tax_table slot
#' genera <- phyloseq::tax_table(ps_plaque_16S)[, "GENUS"]
#' # Genera as rownames of microbial_metabolism data.frame
#' rownames(microbial_metabolism) <- microbial_metabolism$Genus
#' # Match OTUs to their metabolism
#' priorInfo <- data.frame(genera,
#'     "Type" = microbial_metabolism[genera, "Type"]
#' )
#' # Unmatched genera becomes "Unknown"
#' unknown_metabolism <- is.na(priorInfo$Type)
#' priorInfo[unknown_metabolism, "Type"] <- "Unknown"
#' priorInfo$Type <- factor(priorInfo$Type)
#' # Add a more informative names column
#' priorInfo[, "newNames"] <- paste0(rownames(priorInfo), priorInfo[, "GENUS"])
#'
#' # DA Analysis
#' # Add scaling factors
#' ps_plaque_16S <- norm_edgeR(object = ps_plaque_16S, method = "TMM")
#' # DA analysis
#' da.limma <- DA_limma(
#'     object = ps_plaque_16S,
#'     design = ~ 1 + HMP_BODY_SUBSITE,
#'     coef = 2,
#'     norm = "TMM"
#' )
#'
#' DA <- getDA(
#'     method = da.limma, slot = "pValMat", colName = "adjP",
#'     type = "pvalue", direction = "logFC", threshold_pvalue = 0.05,
#'     threshold_logfc = 1, top = NULL
#' )
#' # Add a priori information
#' DA_info <- addKnowledge(
#'     method = DA, priorKnowledge = priorInfo,
#'     enrichmentCol = "Type", namesCol = "newNames"
#' )
#' # Create contingency tables and compute F tests
#' DA_info_enriched <- enrichmentTest(
#'     method = DA_info, enrichmentCol = "Type",
#'     alternative = "greater"
#' )
#' # Count True and False Positives
#' DA_TP_FP <- getPositives(
#'     method = DA_info_enriched, enrichmentCol = "Type",
#'     TP = list(c("UP Abundant", "Aerobic"), c("DOWN Abundant", "Anaerobic")),
#'     FP = list(c("UP Abundant", "Anaerobic"), c("DOWN Abundant", "Aerobic"))
#' )
getPositives <- function(method, enrichmentCol, TP, FP) {
    data <- method[["data"]][, c("DA", enrichmentCol)]
    # contingency table
    tab <- table(data)
    # True Positives
    TPs <- unlist(lapply(X = TP, FUN = function(x) {
        if (is.element(el = x[1], set = rownames(tab)) &
            is.element(el = x[2], set = colnames(tab))) {
            return(tab[x[1], x[2]])
        } else {
            return(0)
        }
    }))
    # False Positives
    FPs <- unlist(lapply(X = FP, FUN = function(x) {
        if (is.element(el = x[1], set = rownames(tab)) &
            is.element(el = x[2], set = colnames(tab))) {
            return(tab[x[1], x[2]])
        } else {
            return(0)
        }
    }))
    TP_tot <- sum(TPs)
    FP_tot <- sum(FPs)
    return(c("TP" = TP_tot, "FP" = FP_tot))
}

#' @title createColors
#'
#' @export
#' @importFrom RColorBrewer brewer.pal.info brewer.pal
#' @description
#' Produce a qualitative set of colors.
#'
#' @param variable character vector or factor variable.
#'
#' @return A named vector containing the color codes.
#'
#' @examples
#' # Given qualitative variable
#' cond <- factor(c("A", "A", "B", "B", "C", "D"),
#'     levels = c("A", "B", "C", "D"))
#'
#' # Associate a color to each level (or unique value, if not a factor)
#' cond_colors <- createColors(cond)
createColors <- function(variable) {
    if (is.factor(variable)) {
        levels <- levels(variable)
    } else {
        levels <- unique(variable)
    }
    n_levels <- length(levels)
    if (n_levels > 74) {
          stop("To many levels to color them differently.")
      }
    pal.info <- RColorBrewer::brewer.pal.info
    qual_col_pals <- pal.info[pal.info$category == "qual", ]
    col_vector <- unlist(mapply(
        RColorBrewer::brewer.pal,
        qual_col_pals$maxcolors, rownames(qual_col_pals)
    ))
    cols <- col_vector[seq_len(n_levels)]
    names(cols) <- levels
    return(cols)
} # END - function: createColors

#' @title iterativeOrdering
#'
#' @export
#' @importFrom plyr ddply
#' @description Turn the chosen columns (factor) of the input \code{data.frame}
#' into ordered factors. For each factor, the order is given by the number of
#' elements in each level of that factor.
#'
#' @param df a \code{data.frame} object.
#' @param var_names character vector containing the names of one or more columns
#' of \code{df}.
#' @param i iteration index (default \code{i = 1}).
#' @param decreasing logical value or a vector of them. Each value should be
#' associated to a \code{var_name} vector's element. Should the sort order be
#' increasing or decreasing?
#'
#' @return the input \code{data.frame} with the \code{var_names} variables as
#' ordered factors.
#'
#' @seealso \code{\link{plotMutualFindings}}
#'
#' @examples
#' # From a dataset with some factor columns
#' mpg <- data.frame(ggplot2::mpg)
#' # Order the levels of the desired factors based on the cardinality of each
#' # level.
#' ordered_mpg <- iterative_ordering(df = mpg,
#'    var_names = c("manufacturer", "model"),
#'    decreasing = c(TRUE, TRUE))
#' # Now the levels of the factors are ordered in a decreasing way
#' levels(ordered_mpg$manufacturer)
#' levels(ordered_mpg$model)

iterative_ordering <- function(df, var_names, i = 1, decreasing = TRUE) {
    if (length(decreasing) == 1) {
        decreasing <- rep(decreasing, length(var_names))
    } else {
        if (length(decreasing) != length(var_names)) {
              stop("Wrong length of 'decreasing' vector\n")
          }
    }
    # Count elements
    df_var <- plyr::ddply(df, .variables = var_names[i], nrow)
    # Order levels
    ord <- df_var[
        order(df_var[, "V1"], decreasing = decreasing[i]),
        var_names[i]
    ]
    # Rebuild data.frame with the ordered factor
    df[, var_names[i]] <- factor(
        x = df[, var_names[i]], levels = ord,
        ordered = TRUE
    )
    if (i < length(var_names)) { # Iterative
          iterative_ordering(
              df = df, var_names = var_names, i = i + 1,
              decreasing = decreasing
          )
      } else {
        return(df)
    }
}

#' @title get_counts_metadata
#'
#' @export
#' @importFrom phyloseq otu_table sample_data
#' @importFrom SummarizedExperiment assay colData
#' @importFrom TreeSummarizedExperiment TreeSummarizedExperiment
#' @description Check whether the input object is a phyloseq or a 
#' TreeSummarizedExperiment, then extract the requested data slots.
#'
#' @param object a phyloseq or TreeSummarizedExperiment object.
#' @param assay_name the name of the assay to extract from the 
#' TreeSummarizedExperiment object (default \code{assayName = "counts"}). Not 
#' used if the input object is a phyloseq.
#' @param min_counts Parameter to filter taxa. Set this number to keep features 
#' with more than \code{min_counts} counts in more than \code{min_samples} 
#' samples (default \code{min_counts = 0}).
#' @param min_samples Parameter to filter taxa. Set this number to keep 
#' features with a \code{min_counts} counts in more than \code{min_samples} 
#' samples (default \code{min_samples = 0}).
#'
#' @return a \code{list} of results:
#' \itemize{
#'     \item{\code{counts}}{the \code{otu_table} slot or \code{assayName} assay 
#'     of the phyloseq or TreeSummarizedExperiment object;}
#'     \item{\code{metadata}}{the \code{sample_data} or \code{colData} slot of
#'     the phyloseq or TreeSummarizedExperiment object;}
#'     \item{\code{is_phyloseq}}{a boolean equal to \code{TRUE} if the input is 
#'     a phyloseq object.}}
#'
#' @examples
#' set.seed(1)
#' # Create a very simple phyloseq object
#' counts <- matrix(rnbinom(n = 60, size = 3, prob = 0.5), nrow = 10, ncol = 6)
#' metadata <- data.frame("Sample" = c("S1", "S2", "S3", "S4", "S5", "S6"),
#'                        "group" = as.factor(c("A", "A", "A", "B", "B", "B")))
#' ps <- phyloseq::phyloseq(phyloseq::otu_table(counts, taxa_are_rows = TRUE),
#'                          phyloseq::sample_data(metadata))
#' get_counts_metadata(ps)
#' 
#' # Or with a TreeSummarizedExperiment
#' tse <- TreeSummarizedExperiment::TreeSummarizedExperiment(
#'     assays = list("counts" = counts), colData = metadata)
#' get_counts_metadata(tse)

get_counts_metadata <- function(object, assay_name = "counts", min_counts = 0,
    min_samples = 0){
    is_phyloseq <- FALSE
    # In case of phyloseq
    if(is(object, "phyloseq")){
        is_phyloseq <- TRUE
        counts <- as(phyloseq::otu_table(object), "matrix")
        metadata <- data.frame(phyloseq::sample_data(object))
        if (!phyloseq::taxa_are_rows(object)){
            counts <- t(counts)
        }
        if(assay_name != "counts"){
            warning("The 'assay_name' = ", assay_name, " but the 'object' is", 
                " a phyloseq. Please supply a TreeSummarizedExperiment", 
                " object to use that assay.")
        }
    # In case of TreeSummarizedExperiment
    } else if (is(object, "TreeSummarizedExperiment")){
        if(!is.element(assay_name, SummarizedExperiment::assayNames(object))){
            stop(assay_name, " is not an assay of the",
                " TreeSummarizedExperiment object.")
        } else {
            counts <- SummarizedExperiment::assay(object, assay_name)
            metadata <- data.frame(SummarizedExperiment::colData(object))
        }
    } else stop("Please, supply a phyloseq or TreeSummarizedExperiment object.")
    if(min_counts != 0 | min_samples != 0){
        warning("Keeping taxa with more than ", min_counts, 
            " counts in more than ", min_samples, " samples.")
    }
    taxa_to_keep <- apply(counts, 1, function(x) 
        sum(x > min_counts) > min_samples)
    counts <- counts[taxa_to_keep, ]
    return(list("counts" = counts, "metadata" = metadata, 
        "is_phyloseq" = is_phyloseq))
}
mcalgaro93/benchdamic documentation built on Nov. 28, 2024, 2:16 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
mcalgaro93/benchdamic
Benchmark of differential abundance methods on microbiome data

R/utilities.R
In mcalgaro93/benchdamic: Benchmark of differential abundance methods on microbiome data

Defines functions get_counts_metadata iterative_ordering createColors getPositives extractDA getDA extractStatistics getStatistics

Documented in createColors extractDA extractStatistics get_counts_metadata getDA getPositives getStatistics iterative_ordering

R Package Documentation

Browse R Packages

We want your feedback!

mcalgaro93/benchdamic Benchmark of differential abundance methods on microbiome data

R/utilities.R In mcalgaro93/benchdamic: Benchmark of differential abundance methods on microbiome data

Defines functions get_counts_metadata iterative_ordering createColors getPositives extractDA getDA extractStatistics getStatistics

Documented in createColors extractDA extractStatistics get_counts_metadata getDA getPositives getStatistics iterative_ordering

R Package Documentation

Browse R Packages

We want your feedback!

mcalgaro93/benchdamic
Benchmark of differential abundance methods on microbiome data

R/utilities.R
In mcalgaro93/benchdamic: Benchmark of differential abundance methods on microbiome data