R/batchQC.R

Defines functions setShinyInputSVAr getShinyInputSVAr setShinyInputSVAf getShinyInputSVAf setShinyInputSVA getShinyInputSVA setShinyInputCombat getShinyInputCombat setShinyInputOrig getShinyInputOrig setShinyInput getShinyInput Beta.NA bprior aprior combatPlot batchQC batchQC_analyze

Documented in batchQC batchQC_analyze combatPlot getShinyInput getShinyInputCombat getShinyInputOrig getShinyInputSVA getShinyInputSVAf getShinyInputSVAr setShinyInput setShinyInputCombat setShinyInputOrig setShinyInputSVA setShinyInputSVAf setShinyInputSVAr

require(sva)
require(limma)
require(pander)
require(stats)
require(graphics)

#' Batch and Condition indicator for signature data captured when 
#' activating different growth pathway genes in human mammary epithelial cells. 
#'
#' This data consists of three batches and ten different conditions 
#' corresponding to control and nine different pathways
#'
#' @name example_batchqc_data
#' @format A data frame with 89 rows and 2 variables:
#' \describe{
#'     \item{V1}{Batch Indicator}
#'     \item{V2}{Condition (Pathway) Indicator}
#' }
#' @source GEO accession: GSE73628
#' @return Batch indicator object
"batch_indicator"

#' Signature data captured when activating different growth pathway genes in 
#' human mammary epithelial cells 
#'
#' This data consists of three batches and ten different conditions 
#' corresponding to control and nine different pathways
#'
#' @name example_batchqc_data
#' @format A data frame with 18052 rows and 89 variables:
#' \describe{
#'     \item{Columns1-89}{Control and Pathway activated samples}
#'     \item{rows1-18052}{Genes 1-18052}
#' }
#' @source GEO accession: GSE73628
#' @return Signature data 
"signature_data"

#' Batch and Condition indicator for protein expression data
#'
#' This data consists of two batches and two conditions 
#' corresponding to case and control for the protein expression data
#'
#' @name protein_example_data
#' @format A data frame with 24 rows and 4 variables:
#' \describe{
#'     \item{Arrayname}{Array Name}
#'     \item{samplename}{Sample Name}
#'     \item{Batch}{Batch Indicator}
#'     \item{category}{Condition (Case vs Control) Indicator}
#' }
#' @return Protein data sample info
"protein_sample_info"

#' Protein data with 39 protein expression levels
#'
#' This data consists of two batches and two conditions 
#' corresponding to case and control
#'
#' @name protein_example_data
#' @format A data frame with 39 rows and 24 variables:
#' \describe{
#'     \item{Columns1-24}{Control and Case samples}
#'     \item{rows1-39}{Proteins 1-39}
#' }
#' @return Protein data 
"protein_data"

#' @importFrom methods setOldClass
setOldClass("prcomp")
################################################################################
#' The BatchQC output class to output BatchQC results
#'
#' Contains all currently-supported BatchQC output data classes: 
#' 
#' slots:
#' \describe{
#'     \item{batchqc_ev}{a single object of class list}
#'     \item{pca}{a single object of S3 class prcomp}
#' }
#' 
#' @name BatchQCout-class
#' @rdname BatchQCout-class
#' @exportClass BatchQCout
setClass(Class="BatchQCout", 
    representation=representation(
        batchqc_ev="list",
        pca="prcomp"
    ),
    prototype=prototype(batchqc_ev=NULL, pca=NULL)
)
###############################################################################

#' Checks for presence of batch effect and reports whether the batch 
#' needs to be adjusted
#' 
#' @param data.matrix Given data or simulated data from rnaseq_sim()
#' @param batch Batch covariate 
#' @param mod Model matrix for outcome of interest and other covariates 
#' besides batch
#' @return pca Principal Components Analysis object of the data
#' @export
#' @examples
#' nbatch <- 3
#' ncond <- 2
#' npercond <- 10
#' data.matrix <- rnaseq_sim(ngenes=50, nbatch=nbatch, ncond=ncond, npercond=
#'     npercond, basemean=10000, ggstep=50, bbstep=2000, ccstep=800, 
#'     basedisp=100, bdispstep=-10, swvar=1000, seed=1234)
#' batch <- rep(1:nbatch, each=ncond*npercond)
#' condition <- rep(rep(1:ncond, each=npercond), nbatch)
#' pdata <- data.frame(batch, condition)
#' modmatrix = model.matrix(~as.factor(condition), data=pdata)
#' batchQC_analyze(data.matrix, batch, mod=modmatrix)
batchQC_analyze <- function(data.matrix, batch, mod = NULL) {
    batchqc_heatmap(data.matrix, batch = batch, mod = mod)
    pca <- batchqc_pca(data.matrix, batch = batch, mod = mod)
    batchtest(pca, batch = batch, mod = mod)
    return(pca)
}

#' Checks for presence of batch effect and creates a html report 
#' with information including whether the batch needs to be adjusted
#' 
#' @param dat Given data or simulated data from rnaseq_sim()
#' @param batch Batch covariate 
#' @param condition Covariates or conditions of interest besides batch
#' @param report_file Output report file name 
#' @param report_dir Output report directory path 
#' @param report_option_binary 9 bits Binary String representing 
#' the plots to display and hide in the report 
#' @param view_report when TRUE, opens the report in a browser 
#' @param interactive when TRUE, opens the interactive shinyApp 
#' @param batchqc_output when TRUE, creates BatchQCout object in 
#' batchqc_output.rda R object file 
#' @param log2cpm_transform when TRUE, transforms the data using log2CPM -
#' log2 Counts Per Million transformation function 
#' @return outputfile Report file generated by batchQC
#' @import rmarkdown knitr pander ggvis heatmaply reshape2 limma
#' @importFrom grDevices colorRampPalette rainbow
#' @importFrom graphics abline axis hist image layout legend lines 
#' mtext par plot plot.new points rect segments text title
#' @importFrom stats IQR as.dendrogram as.dist cor density 
#' dist dnorm hclust ks.test lm median model.matrix 
#' order.dendrogram pf prcomp qqline qqnorm qqplot quantile
#' reorder rgamma rnbinom rnorm sd var
#' @importFrom utils browseURL
#' @importFrom shiny runApp
#' @importFrom methods new
#' @importFrom Matrix nearPD
#' @export
#' @examples
#' nbatch <- 3
#' ncond <- 2
#' npercond <- 10
#' data.matrix <- rnaseq_sim(ngenes=50, nbatch=nbatch, ncond=ncond, npercond=
#'     npercond, basemean=10000, ggstep=50, bbstep=2000, ccstep=800, 
#'     basedisp=100, bdispstep=-10, swvar=1000, seed=1234)
#' batch <- rep(1:nbatch, each=ncond*npercond)
#' condition <- rep(rep(1:ncond, each=npercond), nbatch)
#' batchQC(data.matrix, batch=batch, condition=condition, view_report=FALSE, 
#'     interactive=FALSE)
batchQC <- function(dat, batch, condition = NULL, 
    report_file = "batchqc_report.html", report_dir = ".", 
    report_option_binary = "111111111", view_report = FALSE, 
    interactive = TRUE, batchqc_output=FALSE, log2cpm_transform=FALSE) {
    if (is.null(condition))  condition <- rep(1,ncol(dat))
    if (is.null(batch))  batch <- rep(1,ncol(dat))
    pdata <- data.frame(batch, condition)
    ncond <- nlevels(as.factor(condition))
    if (ncond <= 1)  {
        mod = matrix(rep(1, ncol(dat)), ncol = 1)
    } else  {
        mod = model.matrix(~as.factor(condition), data = pdata)
    }
    if (report_dir == ".") {
        report_dir = getwd()
    }
    dat <- as.matrix(dat)
    if (is.null(colnames(dat)))  {
        colnames(dat) <- 1:ncol(dat)
    }
    dat <- batchQC_filter_genes(dat, batch, condition)
    report_option_vector <- unlist(strsplit(as.character(report_option_binary), 
        ""))
    if (log2cpm_transform)  {
        lcounts <- log2CPM(dat)$y
    } else  {
        lcounts <- dat
    }
    shinyInput <- list(data = dat, batch = batch, condition = condition, 
        report_dir = report_dir, report_option_vector = report_option_vector,
        lcounts = lcounts, log2cpm_transform=log2cpm_transform)
    setShinyInput(shinyInput)
    batchqc_ev <- NULL
    pca <- NULL
    rmdfile <- system.file("reports/batchqc_report.Rmd", package = "BatchQC")
    batchqc_html <- system.file("reports/batchQC.html", package = "BatchQC")
    # rmarkdown::draft('batchqc_report.Rmd', template =
    # 'batchqc', package = 'BatchQC')
    static_lib_dir <- system.file("reports/libs", package = "BatchQC")
    file.copy(static_lib_dir, report_dir, recursive = TRUE)
    file.copy(rmdfile, report_dir, overwrite = TRUE)
    file.copy(batchqc_html, report_dir, overwrite = TRUE)
    rmdfile_copy <- file.path(report_dir, "batchqc_report.Rmd")
    outputfile <- rmarkdown::render(rmdfile_copy, output_file = report_file, 
        output_dir=report_dir, knit_root_dir=report_dir, clean=FALSE)
    shinyInput <- getShinyInput()
    setShinyInputOrig(shinyInput)
    setShinyInputCombat(NULL)
    setShinyInputSVAf(NULL)
    setShinyInputSVAr(NULL)
    setShinyInputSVA(NULL)
    if (batchqc_output)  {
        if (is.null(batchqc_ev)) batchqc_ev <- list()
        if (is.null(pca)) pca <- prcomp(0)
        batchQCout <- new ("BatchQCout", batchqc_ev=batchqc_ev, pca=pca)
        outfile <- file.path(report_dir, "batchqc_output.rda")
        save(batchQCout, file=outfile)
    }
    if (view_report) {
        browseURL(outputfile)
    }
    if (interactive) {
        appDir <- system.file("shiny", "BatchQC", package = "BatchQC")
        if (appDir == "") {
            stop("Could not find shiny directory. Try re-installing BatchQC.", 
                call. = FALSE)
        }
        shiny::runApp(appDir, display.mode = "normal")
    }
    return(outputfile)
}

#' Adjust for batch effects using an empirical Bayes framework
#' ComBat allows users to adjust for batch effects in datasets where the 
#' batch covariate is known, using methodology described in Johnson et al. 2007.
#' It uses either parametric or non-parametric empirical Bayes frameworks for 
#' adjusting data for batch effects. Users are returned an expression matrix 
#' that has been corrected for batch effects. The input
#' data are assumed to be cleaned and normalized before batch effect removal. 
#' 
#' @param dat Genomic measure matrix (dimensions probe x sample) - for example,
#' expression matrix
#' @param batch {Batch covariate (only one batch allowed)}
#' @param mod Model matrix for outcome of interest and other covariates 
#' besides batch
#' @param par.prior (Optional) TRUE indicates parametric adjustments will be 
#' used, FALSE indicates non-parametric adjustments will be used
#' @param prior.plots (Optional)TRUE give prior plots with black as a kernel 
#' estimate of the empirical batch effect density and red as the parametric
#' @return data A probe x sample genomic measure matrix, adjusted for 
#' batch effects.
#' @export
#' @examples
#' nbatch <- 3
#' ncond <- 2
#' npercond <- 10
#' data.matrix <- rnaseq_sim(ngenes=50, nbatch=nbatch, ncond=ncond, npercond=
#'     npercond, basemean=10000, ggstep=50, bbstep=2000, ccstep=800, 
#'     basedisp=100, bdispstep=-10, swvar=1000, seed=1234)
#' batch <- rep(1:nbatch, each=ncond*npercond)
#' condition <- rep(rep(1:ncond, each=npercond), nbatch)
#' pdata <- data.frame(batch, condition)
#' mod = model.matrix(~as.factor(condition), data = pdata)
#' combatPlot(data.matrix, batch, mod=mod)
combatPlot <- function(dat, batch, mod = NULL, par.prior = TRUE, 
    prior.plots = TRUE) {
    # make batch a factor and make a set of indicators for batch
    if (length(dim(batch)) > 1) 
        {
            warning("This version of ComBat only allows one batch variable")
            return(NULL)
        }  ## to be updated soon!
    batch <- as.factor(batch)
    if (nlevels(batch) <= 1) {
        warning("There is no batch")
        return(NULL)
    }
    batchmod <- model.matrix(~-1 + batch)
    cat("Found", nlevels(batch), "batches\n")
    
    # A few other characteristics on the batches
    n.batch <- nlevels(batch)
    batches <- list()
    for (i in 1:n.batch) {
        batches[[i]] <- which(batch == levels(batch)[i])
    }  # list of samples in each batch  
    n.batches <- sapply(batches, length)
    n.array <- sum(n.batches)
    
    # combine batch variable and covariates
    design <- cbind(batchmod, mod)
    
    # check for intercept in covariates, and drop if present
    check <- apply(design, 2, function(x) all(x == 1))
    design <- as.matrix(design[, !check])
    
    # Number of covariates or covariate levels
    cat("Adjusting for", ncol(design) - ncol(batchmod), 
        "covariate(s) or covariate level(s)\n")
    
    # Check if the design is confounded
    if (qr(design)$rank < ncol(design)) {
        # if(ncol(design)<=(n.batch)){stop('Batch variables are
        # redundant!  Remove one or more of the batch variables so
        # they are no longer confounded')}
        if (ncol(design) == (n.batch + 1)) {
            stop(paste("The covariate is confounded with batch! Remove the ", 
                "covariate and rerun ComBat", sep = ""))
        }
        if (ncol(design) > (n.batch + 1)) {
            if ((qr(design[, -c(1:n.batch)])$rank < ncol(design[, 
                -c(1:n.batch)]))) {
                stop(paste("The covariates are confounded! Please remove one ", 
                    "or more of the covariates so the design is not confounded",
                    sep = ""))
            } else {
                stop(paste("At least one covariate is confounded with batch! ", 
                    "Please remove confounded covariates and rerun ComBat", 
                    sep = ""))
            }
        }
    }
    
    ## Check for missing values
    NAs = any(is.na(dat))
    if (NAs) {
        cat(c("Found", sum(is.na(dat)), "Missing Data Values\n"), 
            sep = " ")
    }
    # print(dat[1:2,]) Standardize Data across genes
    cat("Standardizing Data across genes\n")
    if (!NAs) {
        B.hat <- solve(t(design) %*% design) %*% t(design) %*% 
            t(as.matrix(dat))
    } else {
        B.hat = apply(dat, 1, Beta.NA, design)
    }  #Standarization Model
    grand.mean <- t(n.batches/n.array) %*% B.hat[1:n.batch, ]
    if (!NAs) {
        var.pooled <- ((dat - t(design %*% B.hat))^2) %*% rep(1/n.array, 
            n.array)
    } else {
        var.pooled <- apply(dat - t(design %*% B.hat), 1, var, 
            na.rm = TRUE)
    }
    
    stand.mean <- t(grand.mean) %*% t(rep(1, n.array))
    if (!is.null(design)) {
        tmp <- design
        tmp[, c(1:n.batch)] <- 0
        stand.mean <- stand.mean + t(tmp %*% B.hat)
    }
    s.data <- (dat - stand.mean)/(sqrt(var.pooled) %*% t(rep(1, 
        n.array)))
    
    ## Get regression batch effect parameters
    cat("Fitting L/S model and finding priors\n")
    batch.design <- design[, 1:n.batch]
    if (!NAs) {
        gamma.hat <- solve(t(batch.design) %*% batch.design) %*% 
            t(batch.design) %*% t(as.matrix(s.data))
    } else {
        gamma.hat <- apply(s.data, 1, Beta.NA, batch.design)
    }
    
    shinyInput <- getShinyInput()
    if (is.null(shinyInput)) {
        shinyInput <- list(data = dat, batch = batch)
    }
    shinyInput <- c(shinyInput, list(gamma.hat = gamma.hat))
    delta.hat <- NULL
    for (i in batches) {
        delta.hat <- rbind(delta.hat, apply(s.data[, i], 1, var, 
            na.rm = TRUE))
    }
    shinyInput <- c(shinyInput, list(delta.hat = delta.hat))
    
    ## Find Priors
    gamma.bar <- apply(gamma.hat, 1, mean)
    shinyInput <- c(shinyInput, list(gamma.bar = gamma.bar))
    t2 <- apply(gamma.hat, 1, var)
    shinyInput <- c(shinyInput, list(t2 = t2))
    a.prior <- apply(delta.hat, 1, aprior)
    shinyInput <- c(shinyInput, list(a.prior = a.prior))
    b.prior <- apply(delta.hat, 1, bprior)
    shinyInput <- c(shinyInput, list(b.prior = b.prior))
    setShinyInput(shinyInput)
    
    
    ## Plot empirical and parametric priors
    
    if (prior.plots & par.prior) {
        # par(mfrow=c(2,2))
        tmp <- density(gamma.hat[1, ])
        xx <- seq(min(tmp$x), max(tmp$x), length = 100)
        tmp1 <- dnorm(xx, gamma.bar[1], sqrt(t2[1]))
        plot(tmp, type = "l", main = "Density Plot", 
            ylim = c(0, max(tmp$y, tmp1)))
        lines(xx, tmp1, col = 2)
        qqnorm(gamma.hat[1, ])
        qqline(gamma.hat[1, ], col = 2)
        
        tmp <- density(delta.hat[1, ])
        invgam <- 1/rgamma(ncol(delta.hat), a.prior[1], b.prior[1])
        tmp1 <- density(invgam)
        plot(tmp, type = "l", main = "Density Plot", ylim = c(0, 
            max(tmp$y, tmp1$y)))
        lines(tmp1, col = 2)
        qqplot(delta.hat[1, ], invgam, xlab = "Sample Quantiles", 
            ylab = "Theoretical Quantiles")
        lines(c(0, max(invgam)), c(0, max(invgam)), col = 2)
        title("Q-Q Plot")
    }
    
    kstest <- ks.test(gamma.hat[1, ], "pnorm", gamma.bar[1], 
        sqrt(t2[1]))
    # two-sided, exact
    
    return(kstest)
    
}

# Following four find empirical hyper-prior values
aprior <- function(gamma.hat) {
    m = mean(gamma.hat)
    s2 = var(gamma.hat)
    (2 * s2 + m^2)/s2
}
bprior <- function(gamma.hat) {
    m = mean(gamma.hat)
    s2 = var(gamma.hat)
    (m * s2 + m^3)/s2
}
Beta.NA = function(y, X) {
    des = X[!is.na(y), ]
    y1 = y[!is.na(y)]
    B <- solve(t(des) %*% des) %*% t(des) %*% y1
    B
}


#' Getter function to get the shinyInput option
#' @return shinyInput option
#' @export
#' @examples
#' getShinyInput()
getShinyInput <- function() {
    shinyInput <- getOption("batchqc.shinyInput")
    return(shinyInput)
}
#' Setter function to set the shinyInput option
#' @param x shinyInput option
#' @return shinyInput option
#' @export
#' @examples
#' setShinyInput(NULL)
setShinyInput <- function(x) {
    options(batchqc.shinyInput = x)
}

#' Getter function to get the shinyInputOrig option
#' @return shinyInputOrig option
#' @export
#' @examples
#' getShinyInputOrig()
getShinyInputOrig <- function() {
    shinyInputOrig <- getOption("batchqc.shinyInputOrig")
    return(shinyInputOrig)
}
#' Setter function to set the shinyInputOrig option
#' @param x shinyInputOrig option
#' @return shinyInputOrig option
#' @export
#' @examples
#' setShinyInputOrig(NULL)
setShinyInputOrig <- function(x) {
    options(batchqc.shinyInputOrig = x)
}

#' Getter function to get the shinyInputCombat option
#' @return shinyInputCombat option
#' @export
#' @examples
#' getShinyInputCombat()
getShinyInputCombat <- function() {
    shinyInputCombat <- getOption("batchqc.shinyInputCombat")
    return(shinyInputCombat)
}
#' Setter function to set the shinyInputCombat option
#' @param x shinyInputCombat option
#' @return shinyInputCombat option
#' @export
#' @examples
#' setShinyInputCombat(NULL)
setShinyInputCombat <- function(x) {
    options(batchqc.shinyInputCombat = x)
}
#' Getter function to get the shinyInputSVA option
#' @return shinyInputSVA option
#' @export
#' @examples
#' getShinyInputSVA()
getShinyInputSVA <- function() {
    shinyInputSVA <- getOption("batchqc.shinyInputSVA")
    return(shinyInputSVA)
}
#' Setter function to set the shinyInputSVA option
#' @param x shinyInputSVA option
#' @return shinyInputSVA option
#' @export
#' @examples
#' setShinyInputSVA(NULL)
setShinyInputSVA <- function(x) {
    options(batchqc.shinyInputSVA = x)
}
#' Getter function to get the shinyInputSVAf option
#' @return shinyInputSVAf option
#' @export
#' @examples
#' getShinyInputSVAf()
getShinyInputSVAf <- function() {
    shinyInputSVAf <- getOption("batchqc.shinyInputSVAf")
    return(shinyInputSVAf)
}
#' Setter function to set the shinyInputSVAf option
#' @param x shinyInputSVAf option
#' @return shinyInputSVAf option
#' @export
#' @examples
#' setShinyInputSVAf(NULL)
setShinyInputSVAf <- function(x) {
    options(batchqc.shinyInputSVAf = x)
}
#' Getter function to get the shinyInputSVAr option
#' @return shinyInputSVAr option
#' @export
#' @examples
#' getShinyInputSVAr()
getShinyInputSVAr <- function() {
    shinyInputSVAr <- getOption("batchqc.shinyInputSVAr")
    return(shinyInputSVAr)
}
#' Setter function to set the shinyInputSVAr option
#' @param x shinyInputSVAr option
#' @return shinyInputSVAr option
#' @export
#' @examples
#' setShinyInputSVAr(NULL)
setShinyInputSVAr <- function(x) {
    options(batchqc.shinyInputSVAr = x)
} 

Try the BatchQC package in your browser

Any scripts or data that you put into this service are public.

BatchQC documentation built on Nov. 8, 2020, 8:30 p.m.