Nothing
#' Removing Batch Effect from Expression Sets
#'
#' @description
#' Removes batch effects from expression datasets using sva::ComBat (for
#' microarray/TPM data) or sva::ComBat_seq (for RNA-seq count data). Generates
#' PCA plots to visualize data before and after correction.
#'
#' @param eset1 First expression set (matrix or data frame with genes as rows).
#' @param eset2 Second expression set.
#' @param eset3 Optional third expression set. Use `NULL` if not available.
#' @param id_type Type of gene ID in expression sets (e.g., `"ensembl"`,
#' `"symbol"`). Required for count data normalization.
#' @param data_type Type of data: `"array"`, `"count"`, or `"tpm"`.
#' Default is `"array"`.
#' @param cols Color scale for PCA plot. Default is `"normal"`.
#' @param palette Color palette for PCA plot. Default is `"jama"`.
#' @param log2 Whether to perform log2 transformation. Default is `TRUE`.
#' Ignored for count data.
#' @param check_eset Whether to check expression sets for errors.
#' Default is `TRUE`.
#' @param adjust_eset Whether to adjust expression sets by removing problematic
#' features. Default is `TRUE`.
#' @param repel Whether to add repelling labels to PCA plot. Default is `FALSE`.
#' @param path Directory where results should be saved. Default is `NULL`
#' (display only).
#'
#' @return Expression matrix after batch correction.
#'
#' @export
#' @author Dongqiang Zeng
#'
#' @references
#' Zhang Y, et al. ComBat-seq: batch effect adjustment for RNA-seq count data.
#' NAR Genomics and Bioinformatics. 2020;2(3):lqaa078.
#' doi:10.1093/nargab/lqaa078
#'
#' Leek JT, et al. The sva package for removing batch effects and other
#' unwanted variation in high-throughput experiments. Bioinformatics.
#' 2012;28(6):882-883.
#' @examples
#' # Simulate data
#' set.seed(123)
#' sim_eset1 <- matrix(rnorm(100 * 5, mean = 10, sd = 2), 100, 5)
#' sim_eset2 <- matrix(rnorm(100 * 5, mean = 12, sd = 2), 100, 5)
#' rownames(sim_eset1) <- rownames(sim_eset2) <- paste0("Gene", 1:100)
#' colnames(sim_eset1) <- paste0("S1_", 1:5)
#' colnames(sim_eset2) <- paste0("S2_", 1:5)
#'
#' # Run batch correction
#' if (requireNamespace("sva", quietly = TRUE) && requireNamespace("BiocParallel", quietly = TRUE)) {
#' eset_corrected <- remove_batcheffect(sim_eset1, sim_eset2, data_type = "tpm")
#' if (!is.null(eset_corrected)) head(eset_corrected)
#' }
remove_batcheffect <- function(eset1,
eset2,
eset3 = NULL,
id_type = "ensembl",
data_type = c("array", "count", "tpm"),
cols = "normal",
palette = "jama",
log2 = TRUE,
check_eset = TRUE,
adjust_eset = TRUE,
repel = FALSE,
path = NULL) {
if (is.null(eset1) || is.null(eset2)) return(NULL)
data_type <- rlang::arg_match(data_type)
# linetype : "batch"
# Validate data_type
data_type <- rlang::arg_match(data_type)
# Create output folder if path provided
if (!is.null(path)) {
path <- creat_folder(path)
}
# Log2 transformation for non-count data
if (data_type != "count" && log2) {
eset1 <- log2eset(eset1)
eset2 <- log2eset(eset2)
if (!is.null(eset3)) {
eset3 <- log2eset(eset3)
}
}
# Check expression sets
if (check_eset) {
check_eset(eset1)
check_eset(eset2)
if (!is.null(eset3)) check_eset(eset3)
}
# Adjust expression sets
if (adjust_eset) {
eset1 <- eset1[rownames(eset1) %in%
feature_manipulation(eset1, is_matrix = TRUE), ]
eset2 <- eset2[rownames(eset2) %in%
feature_manipulation(eset2, is_matrix = TRUE), ]
if (!is.null(eset3)) {
eset3 <- eset3[rownames(eset3) %in%
feature_manipulation(eset3, is_matrix = TRUE), ]
}
}
# Find common genes
if (is.null(eset3)) {
comgene <- intersect(rownames(eset1), rownames(eset2))
comgene <- comgene[nzchar(comgene) & !is.na(comgene)]
cli::cli_alert_info(
"The two expression matrices share {length(comgene)} features"
)
combined.expr <- cbind(
eset1[comgene, , drop = FALSE],
eset2[comgene, , drop = FALSE]
)
batch <- data.frame(
ID = colnames(combined.expr),
batch = rep(c("eset1", "eset2"), times = c(ncol(eset1), ncol(eset2)))
)
} else {
comgene <- intersect(
intersect(rownames(eset1), rownames(eset2)),
rownames(eset3)
)
comgene <- comgene[nzchar(comgene) & !is.na(comgene)]
cli::cli_alert_info(
"The three expression matrices share {length(comgene)} features"
)
combined.expr <- cbind(
eset1[comgene, , drop = FALSE],
eset2[comgene, , drop = FALSE],
eset3[comgene, , drop = FALSE]
)
batch <- data.frame(
ID = colnames(combined.expr),
batch = rep(c("eset1", "eset2", "eset3"),
times = c(ncol(eset1), ncol(eset2), ncol(eset3))
)
)
}
# Batch correction
rlang::check_installed("sva")
if (data_type != "count") {
cli::cli_alert_info("Processing method: sva::ComBat")
modcombat <- stats::model.matrix(~1, data = batch)
combined.expr.combat <- sva::ComBat(
dat = as.matrix(combined.expr),
batch = batch$batch,
mod = modcombat
)
rlang::check_installed("preprocessCore")
combined.expr.combat <- preprocessCore::normalize.quantiles(
as.matrix(combined.expr.combat),
keep.names = TRUE
)
prefix <- data_type
} else {
cli::cli_alert_info("Processing method: sva::ComBat_seq")
combined.expr.combat <- sva::ComBat_seq(
as.matrix(combined.expr),
batch = batch$batch
)
# Convert corrected counts to TPM
eset2_tpm <- count2tpm(
countMat = combined.expr.combat,
idType = id_type,
source = "local"
)
eset2_tpm <- log2eset(eset2_tpm)
cli::cli_alert_info("Count data processed with ComBat_seq")
prefix <- "count"
}
# Generate PCA plots if FactoMineR is available
if (requireNamespace("FactoMineR", quietly = TRUE) &&
requireNamespace("factoextra", quietly = TRUE)) {
p1 <- iobr_pca(
data = combined.expr,
is.matrix = TRUE,
scale = TRUE,
is.log = TRUE,
pdata = batch,
id_pdata = "ID",
group = "batch",
cols = cols,
palette = palette,
repel = repel,
ncp = 3,
axes = c(1, 2),
addEllipses = TRUE
) + ggplot2::ggtitle(paste("Before correction:", prefix))
p2 <- iobr_pca(
data = combined.expr.combat,
is.matrix = TRUE,
scale = TRUE,
is.log = TRUE,
pdata = batch,
id_pdata = "ID",
group = "batch",
cols = cols,
palette = palette,
repel = repel,
ncp = 3,
axes = c(1, 2),
addEllipses = TRUE
) + ggplot2::ggtitle(paste("After correction:", prefix))
if (data_type == "count") {
p3 <- iobr_pca(
data = eset2_tpm,
is.matrix = TRUE,
scale = TRUE,
is.log = TRUE,
pdata = batch,
id_pdata = "ID",
group = "batch",
cols = cols,
palette = palette,
repel = repel,
ncp = 3,
axes = c(1, 2),
addEllipses = TRUE
) + ggplot2::ggtitle("After correction: count2TPM")
p <- patchwork::wrap_plots(p1, p2, p3, nrow = 1)
} else {
p <- patchwork::wrap_plots(p1, p2, nrow = 1)
}
if (interactive()) print(p)
# Save plot if path provided
if (!is.null(path)) {
num_plots <- if (data_type == "count") 3 else 2
width <- num_plots * 5
ggplot2::ggsave(
filename = paste0("0-PCA-of-", num_plots, "-eset.pdf"),
plot = p,
width = width,
height = 5,
path = path$folder_name
)
}
}
invisible(combined.expr.combat)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.