ADImpute: Adaptive Dropout Imputer (ADImpute)

Documented in EvaluateMethods Impute

# ADImpute predicts unmeasured gene expression values from single cell
# RNA-sequencing data (dropout imputation). This R-package combines multiple
# dropout imputation methods, including a novel gene regulatory
# network-based method.  Copyright (C) 2020 Ana Carolina Leote This program
# is free software: you can redistribute it and/or modify it under the terms
# of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.  This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General
# Public License for more details.  You should have received a copy of the
# GNU General Public License along with this program.  If not, see
# <https://www.gnu.org/licenses/>.


#' @title Imputation method evaluation on training set
#'
#' @usage EvaluateMethods(data, sce = NULL, do = c('Baseline', 'DrImpute',
#' 'Network'), write = FALSE, train.ratio = .7, train.only = TRUE,
#' mask.ratio = .1, outdir = getwd(), scale = 1, pseudo.count = 1,
#' labels = NULL, cell.clusters = 2, drop_thre = NULL, type = 'count',
#' cores = BiocParallel::bpworkers(BPPARAM),
#' BPPARAM = BiocParallel::SnowParam(type = "SOCK"),
#' net.coef = ADImpute::network.coefficients, net.implementation = 'iteration',
#' tr.length = ADImpute::transcript_length, bulk = NULL, ...)
#'
#' @description \code{EvaluateMethods} returns the best-performing imputation
#' method for each gene in the dataset
#'
#' @param data matrix; normalized counts, not logged (genes as rows and samples
#' as columns)
#' @param sce SingleCellExperiment; normalized counts and associated metadata.
#' @param do character; choice of methods to be used for imputation. Currently
#' supported methods are \code{'Baseline'}, \code{'DrImpute'} and
#' \code{'Network'}. Not case-sensitive. Can include one or more methods. Non-
#' supported methods will be ignored.
#' @param write logical; write intermediary and imputed objects to files?
#' @param train.ratio numeric; ratio of samples to be used for training
#' @param train.only logical; if TRUE define only a training dataset, if
#' FALSE writes and returns both training and validation sets (defaults to TRUE)
#' @param mask.ratio numeric; ratio of samples to be masked per gene
#' @param outdir character; path to directory where output files are written.
#' Defaults to working directory
#' @param scale integer; scaling factor to divide all expression levels by
#' (defaults to 1)
#' @param pseudo.count integer; pseudo-count to be added to expression levels
#' to avoid log(0) (defaults to 1)
#' @param labels character; vector specifying the cell type of each column of
#' \code{data}
#' @param cell.clusters integer; number of cell subpopulations
#' @param drop_thre numeric; between 0 and 1 specifying the threshold to
#' determine dropout values
#' @param type A character specifying the type of values in the expression
#' matrix. Can be 'count' or 'TPM'
#' @param cores integer; number of cores used for paralell computation
#' @param BPPARAM parallel back-end to be used during parallel computation.
#' See \code{\link[BiocParallel]{BiocParallelParam-class}}.
#' @param net.coef matrix; network coefficients. Please provide if you don't
#' want to use ADImpute's network model. Must contain one first column 'O'
#' acconting for the intercept of the model and otherwise be an adjacency matrix
#' with hgnc_symbols in rows and columns. Doesn't have to be squared. See
#' \code{ADImpute::demo_net} for a small example.
#' @param net.implementation character; either 'iteration', for an iterative
#' solution, or 'pseudoinv', to use Moore-Penrose pseudo-inversion as a
#' solution. 'pseudoinv' is not advised for big data.
#' @param tr.length matrix with at least 2 columns: 'hgnc_symbol' and
#' 'transcript_length'
#' @param bulk vector of reference bulk RNA-seq, if available (average across
#' samples)
#' @param ... additional parameters to pass to network-based imputation
#'
#' @details For each gene, a fraction (\code{mask.ratio}) of the quantified
#' expression values are set to zero and imputed according to 3 different
#' methods: scImpute, baseline (average gene expression across all cells) or a
#' network-based method. The imputation error is computed for each of the
#' values in the original dataset that was set to 0, for each method. The
#' method resulting in a lowest imputation error for each gene is chosen.
#'
#' @return
#'  \itemize{
#'     \item if \code{sce} is provided: returns a SingleCellExperiment with the
#'     best performing method per gene stored as row-features. Access via
#'     \code{SingleCellExperiment::int_elementMetadata(sce)$ADImpute$methods}.
#'     \item if \code{sce} is not provided: returns a character with the best
#'     performing method in the training set for each gene
#' }
#'
#' @examples
#' # Normalize demo data
#' norm_data <- NormalizeRPM(ADImpute::demo_data)
#' method_choice <- EvaluateMethods(norm_data, do = c('Baseline','DrImpute'),
#' cores = 2)
#'
#' @seealso \code{\link{ImputeBaseline}},
#' \code{\link{ImputeDrImpute}},
#' \code{\link{ImputeNetwork}}
#'
#' @export
#'
EvaluateMethods <- function(data, sce = NULL, do = c("Baseline", "DrImpute",
        "Network"), write = FALSE, train.ratio = 0.7, train.only = TRUE,
    mask.ratio = 0.1, outdir = getwd(), scale = 1, pseudo.count = 1,
    labels = NULL, cell.clusters = 2, drop_thre = NULL, type = "count",
    cores = BiocParallel::bpworkers(BPPARAM),
    BPPARAM = BiocParallel::SnowParam(type = "SOCK"),
    net.coef = ADImpute::network.coefficients, net.implementation = "iteration",
    tr.length = ADImpute::transcript_length, bulk = NULL, ...) {

    # Check arguments
    if(!is.null(sce)){
        DataCheck_SingleCellExperiment(sce)
        data <- SingleCellExperiment::normcounts(sce)
    }
    Check <- CreateArgCheck(missing = list(data = missing(data)),
        match = list(type = type), acceptable = list(type = c("TPM", "count")),
        null = list(net.coef = is.null(net.coef), data = is.null(data)))
    checkmate::reportAssertions(Check)
    data <- DataCheck_Matrix(data)
    tr.length <- DataCheck_TrLength(tr.length)

    if (sum(tolower(do) %in% c("baseline", "drimpute", "network")) < 2)
        warning(paste0("You are using less than 2 of the default supported ",
            "methods. Make sure you pass at least 2 viable methods ",
            "for performance comparison.\n"))

    if (write) {
        savedir <- getwd()
        dir.create(paste0(outdir, "/training"))
        setwd(paste0(outdir, "/training"))
        on.exit(setwd(savedir))
    }

    train_data <- CreateTrainData(data, train.ratio = train.ratio,
        train.only = train.only, mask = mask.ratio, write = write)

    train_imputed <- Impute(data = train_data$mask, do = do[do != "Ensemble"],
        write = write, outdir = getwd(), scale = scale,
        pseudo.count = pseudo.count, labels = labels,
        cell.clusters = cell.clusters, drop_thre = drop_thre, type = type,
        tr.length = tr.length, bulk = bulk, cores = cores, BPPARAM = BPPARAM,
        net.coef = net.coef, net.implementation = net.implementation, ...)

    # Run optimum choice
    choice <- ChooseMethod(real = round(train_data$train, 2),
            masked = round(train_data$mask, 2),
        imputed = train_imputed, write.to.file = write)

    return(ReturnChoice(sce, choice))
}


#' @title Dropout imputation using different methods
#'
#' @usage Impute(data, sce = NULL, do = 'Ensemble', write = FALSE,
#' outdir = getwd(), method.choice = NULL, scale = 1, pseudo.count = 1,
#' labels = NULL, cell.clusters = 2, drop_thre = NULL, type = 'count',
#' tr.length = ADImpute::transcript_length,
#' cores = BiocParallel::bpworkers(BPPARAM),
#' BPPARAM = BiocParallel::SnowParam(type = "SOCK"),
#' net.coef = ADImpute::network.coefficients, net.implementation = 'iteration',
#' bulk = NULL, true.zero.thr = NULL, prob.mat = NULL, ...)
#'
#' @description \code{Impute} performs dropout imputation on normalized data,
#' based on the choice of imputation methods.
#'
#' @param data matrix; raw counts (genes as rows and samples as columns)
#' @param sce SingleCellExperiment; normalized counts and associated metadata.
#' @param do character; choice of methods to be used for imputation. Currently
#' supported methods are \code{'Baseline'}, \code{'DrImpute'},
#' \code{'Network'}, and \code{'Ensemble'}. Defaults to \code{'Ensemble'}.
#' Not case-sensitive. Can include one or more methods. Non-supported methods
#' will be ignored.
#' @param write logical; write intermediary and imputed objects to files?
#' @param outdir character; path to directory where output files are written.
#' Defaults to working directory
#' @param method.choice character; best performing method in training data for
#' each gene
#' @param scale integer; scaling factor to divide all expression levels by
#' (defaults to 1)
#' @param pseudo.count integer; pseudo-count to be added to expression levels
#' to avoid log(0) (defaults to 1)
#' @param labels character; vector specifying the cell type of each column of
#' \code{data}
#' @param cell.clusters integer; number of cell subpopulations
#' @param drop_thre numeric; between 0 and 1 specifying the threshold to
#' determine dropout values
#' @param type A character specifying the type of values in the expression
#' matrix. Can be 'count' or 'TPM'
#' @param tr.length matrix with at least 2 columns: 'hgnc_symbol' and
#' 'transcript_length'
#' @param cores integer; number of cores used for paralell computation
#' @param BPPARAM parallel back-end to be used during parallel computation.
#' See \code{\link[BiocParallel]{BiocParallelParam-class}}.
#' @param net.coef matrix; network coefficients. Please provide if you don't
#' want to use ADImpute's network model. Must contain one first column 'O'
#' acconting for the intercept of the model and otherwise be an adjacency matrix
#' with hgnc_symbols in rows and columns. Doesn't have to be squared. See
#' \code{ADImpute::demo_net} for a small example.
#' @param net.implementation character; either 'iteration', for an iterative
#' solution, or 'pseudoinv', to use Moore-Penrose pseudo-inversion as a
#' solution. 'pseudoinv' is not advised for big data.
#' @param bulk vector of reference bulk RNA-seq, if available (average across
#' samples)
#' @param true.zero.thr if set to NULL (default), no true zero estimation is
#' performed. Set to numeric value between 0 and 1 for estimation. Value
#' corresponds to the threshold used to determine true zeros: if the probability
#' of dropout is lower than \code{true.zero.thr}, the imputed entries are set
#' to zero.
#' @param prob.mat matrix of the same size as data, filled with the dropout
#' probabilities for each gene in each cell
#' @param ... additional parameters to pass to network-based imputation
#'
#' @return
#' \itemize{
#'     \item if \code{sce} is not set: returns a list of imputation results
#'     (normalized, log-transformed) for all selected methods in \code{do}. If
#'     \code{true.zero.thr} is defined, returns a list of 3 elements: 1) a list,
#'     \code{imputations}, containing the direct imputation results from each
#'     method; 2) a list, \code{zerofiltered}, containing the results of
#'     imputation in \code{imputations} after setting biological zeros back to
#'     zero; 3) a matrix, \code{dropoutprobabilities}, containing the dropout
#'     probability matrix used to set biological zeros.
#'     \item if \code{sce} is set: returns a SingleCellExperiment with new
#'     assays, each corresponding to one of the imputation methods applied. If
#'     \code{true.zero.thr} is defined, the assays will contain the results
#'     after imputation and setting biological zeros back to zero.
#'}
#'
#' @details Values that are 0 in \code{data} are imputed according to the
#' best-performing methods indicated in \code{method.choice}. Currently
#' supported methods are:
#' \itemize{
#'     \item \code{Baseline}: imputation with average expression across all
#' cells in the dataset. See \code{\link{ImputeBaseline}}.
#'     \item Previously published approaches: \code{DrImpute} and \code{SAVER}.
#'     \item \code{Network}: leverages information from a gene regulatory
#' network to predicted expression of genes that are not quantified based on
#' quantified interacting genes, in the same cell. See
#' \code{\link{ImputeNetwork}}.
#'     \item \code{Ensemble}: is based on results on a training subset of the
#' data at hand, indicating which method best predicts the expression of
#' each gene. These results are supplied via \code{method.choice}. Applies
#' the imputation results of the best performing method to the zero entries
#' of each gene.
#' }
#' If \code{'Ensemble'} is included in \code{do}, \code{method.choice} has to
#' be provided (use output from \code{EvaluateMethods()}).
#' \code{Impute} can create a directory \code{imputation} containing the
#' imputation results of all methods in \code{do}.
#' If \code{true.zero.thr} is set, dropout probabilities are computed using
#' scImpute's framework. Expression values with dropout probabilities below
#' \code{true.zero.thr} will be set back to 0 if imputed, as they likely
#' correspond to true biological zeros (genes not expressed in cell) rather than
#' technical dropouts (genes expressed but not captured).
#' If \code{sce} is set, imputed values by the different methods are added as
#' new assays to \code{sce}. Each assay corresponds to one imputation method. If
#' \code{true.zero.thr} is set, only the values after filtering for biological
#' zeros will be added. This is different from the output if \code{sce} is not
#' set, where the original values before filtering and the dropout probability
#' matrix are returned.
#'
#' @examples
#' # Normalize demo data
#' norm_data <- NormalizeRPM(demo_data)
#' # Impute with particular method(s)
#' imputed_data <- Impute(do = 'Network', data = norm_data[,1:10],
#' net.coef = ADImpute::demo_net)
#' imputed_data <- Impute(do = 'Network', data = norm_data[,1:10],
#' net.implementation = 'pseudoinv', net.coef = ADImpute::demo_net)
#'
#' @seealso \code{\link{EvaluateMethods}},
#' \code{\link{ImputeBaseline}},
#' \code{\link{ImputeDrImpute}},
#' \code{\link{ImputeNetwork}},
#' \code{\link{ImputeSAVER}}
#'
#' @export
#'
Impute <- function(data, sce = NULL, do = "Ensemble", write = FALSE,
    outdir = getwd(), method.choice = NULL, scale = 1, pseudo.count = 1,
    labels = NULL, cell.clusters = 2, drop_thre = NULL, type = "count",
    tr.length = ADImpute::transcript_length, cores = BiocParallel::bpworkers(
        BPPARAM), BPPARAM = BiocParallel::SnowParam(type = "SOCK"), net.coef =
        ADImpute::network.coefficients, net.implementation = "iteration",
    bulk = NULL, true.zero.thr = NULL, prob.mat = NULL, ...) {

    # Check arguments
    if(!is.null(sce)){ DataCheck_SingleCellExperiment(sce)
        data <- SingleCellExperiment::normcounts(sce)
        method.choice <- stats::na.omit(
            SingleCellExperiment::int_elementMetadata(sce)$ADImpute$method) }
    check <- CreateArgCheck(missing = list(data = missing(data)),
        match = list(type = type), acceptable = list(type = c("TPM", "count")),
        null = list(net.coef = is.null(net.coef), data = is.null(data)))
    checkmate::reportAssertions(check)
    CheckArguments_Impute(data, method.choice, do, tr.length, labels,
        cell.clusters, true.zero.thr, drop_thre)
    data <- DataCheck_Matrix(data); tr.length <- DataCheck_TrLength(tr.length)
    if (write) { savedir <- getwd(); dir.create(paste0(outdir, "/imputation"))
        setwd(paste0(outdir, "/imputation")); on.exit(setwd(savedir)) }

    if ("ensemble" %in% tolower(do))
        do <- c(do, "Network", unique(method.choice))

    imputed <- list()
    if ("saver" %in% tolower(do)) {
        imputed$SAVER <- ImputeSAVER(data, cores, write = write)
        imputed$SAVER <- log2((imputed$SAVER/scale) + pseudo.count) }
    # Insert call to scImpute and SCRABBLE <-----
    log_masked_norm <- log2((data/scale) + pseudo.count)
    if ("baseline" %in% tolower(do))
        imputed$Baseline <- ImputeBaseline(log_masked_norm, write = write)
    if ("network" %in% tolower(do))
        imputed$Network <- ImputeNetwork(log_masked_norm, net.coef,
            type = net.implementation, cores, BPPARAM, write = write, ...)
    if ("drimpute" %in% tolower(do))
        imputed$DrImpute <- ImputeDrImpute(log_masked_norm, write = write)
    if ("ensemble" %in% tolower(do))
        imputed$Ensemble <- Combine(log_masked_norm, imputed, method.choice,
            write)

    # Estimate true zeros
    if (!is.null(true.zero.thr)) { results <- c(list(imputations = imputed),
            HandleBiologicalZeros(data, imputed, true.zero.thr, cell.clusters,
            labels, type, cores, BPPARAM, tr.length, prob.mat))
        return(ReturnOut(results, sce))
    } else { return(ReturnOut(imputed, sce)) }
}