#' pcRegression
#'
#' @description pcRegression does a linear model fit of principal components
#' and a batch (categorical) variable
#' @param pca.data a list as created by 'prcomp', pcRegression needs
#' \itemize{
#' \item \code{x} -
#' the principal components (PCs, correctly: the rotated data) and
#' \item \code{sdev} - the standard deviations of the PCs)
#' }
#' @param batch vector with the batch covariate (for each cell)
#' @param n_top the number of PCs to consider at maximum
#' @param tol truncation threshold for significance level, default: 1e-16
#' @return List summarising principal component regression
#' \itemize{
#' \item \code{maxVar} - the variance explained by principal component(s)
#' that correlate(s) most with the batch effect
#' \item \code{PmaxVar} - p-value (returned by linear model) for the
#' respective principal components (related to \code{maxVar})
#' \item \code{pcNfrac} - fraction of significant PCs among the \code{n_top} PCs
#' \item \code{pcRegscale} - 'scaled PC regression', i.e. total variance of PCs which correlate significantly with batch covariate (FDR<0.05) scaled by the total variance of \code{n_top} PCs
#' \item \code{maxCorr} - maximal correlation of \code{n_top} PCs with batch covariate
#' \item \code{maxR2} - maximal coefficient of determination of \code{n_top} PCs with batch covariate
#' \item \code{msigPC} - scaled index of the smallest PC that correlates significantly with batch covariate (FDR<0.05), i.e. \code{msigPC=1} if PC_1 is significantly correlated with the batch covariate and \code{msigPC=0} if none of the \code{n_top} PCs is significantly correlated
#' \item \code{maxsigPC} - similar to \code{msigPC}, scaled index of the PC with maximal correlation of \code{n_top} PCs with batch covariate
#' \item \code{R2Var} - sum over Var(PC_i)*r2(PC_i and batch) for all i
#' \item \code{ExplainedVar} - explained variance for each PC
#' \item \code{r2} - detailed results of correlation (R-Square) analysis
#' }
#' @examples
#' testdata <- create_testset_multibatch(n.genes=1000, n.batch=3, plattform='any')
#' pca.data <- prcomp(testdata$data, center=TRUE)
#' pc.reg.result <- pcRegression(pca.data, testdata$batch)
#' @importFrom stats t.test lm aov p.adjust
#' @export
pcRegression <- function(pca.data, batch,n_top=20, tol=1e-16){
batch.levels <- unique(batch)
#make sure you do not try to assess more PCs than actually computed
pca_rank = ncol(pca.data$x)
max_comps <- min(pca_rank, n_top)
if (length(pca.data$sdev) > pca_rank) {
pca.data$sdev = pca.data$sdev[1:pca_rank]
}
if (length(batch.levels) == 2) {
#r2.batch.raw <- r2.batch
# for-loop replaced by correlate.fun and apply
r2.batch <- apply(pca.data$x, 2, correlate.fun_two, batch, batch.levels)
r2.batch <- t(r2.batch)
colnames(r2.batch) <- c('R.squared', 'p.value.lm', 'p.value.t.test')
r2.batch[r2.batch[,2] < tol, 2] <- tol
r2.batch[r2.batch[,3] < tol, 3] <- tol
} else {
#r2.batch.raw <- r2.batch
r2.batch <- apply(pca.data$x, 2, correlate.fun_gen, batch)
r2.batch <- t(r2.batch)
colnames(r2.batch) <- c('R.squared', 'p.value.lm', 'p.value.F.test')
# for-loop replaced by correlate.fun and apply
#for (k in 1:dim(r2.batch)[1]){
# a <- lm(pca.data$x[,k] ~ batch)
# r2.batch[k,1] <- summary(a)$r.squared #coefficient of determination
# r2.batch[k,2] <- summary(a)$coefficients['batch',4] #p-value (significance level)
#}
r2.batch[r2.batch[,2] < tol, 2] <- tol
r2.batch[r2.batch[,3] < tol, 3] <- tol
}
argmin <- which(r2.batch[, 2] == min(r2.batch[, 2]))
normal <- sum(pca.data$sdev^2)
var <- round((pca.data$sdev)^2 / normal * 100,1)
batch.var <- sum(r2.batch[,1]*var)/100
setsignif <- p.adjust(r2.batch[1:max_comps,2], method = 'BH') < 0.05
pcCorr <- sqrt(r2.batch[1:max_comps, 1])
result <- list()
result$maxVar <- var[argmin]
result$PmaxVar <- r2.batch[argmin,2]
result$pcNfrac <- mean(setsignif)
result$pcRegscale <- sum(var[1:max_comps][setsignif])/sum(var[1:max_comps])
result$maxCorr <- max(pcCorr)
result$maxR2 <- max(r2.batch[1:max_comps, 1])
result$msigPC <- 1 - (min(c(which(setsignif), max_comps + 1)) - 1) / max_comps
result$maxsigPC <- 1 - (min(c(which(pcCorr == max(pcCorr[setsignif])), max_comps + 1)) - 1) / max_comps
result$R2Var <- batch.var
result$ExplainedVar <- var
result$r2 <- r2.batch
result
}
#significance test for pcRegression (two levels)
correlate.fun_two <- function(rot.data, batch, batch.levels) {
#rot.data: some vector (numeric entries)
#batch: some vector (categoric entries)
a <- lm(rot.data ~ batch)
result <- numeric(2)
result[1] <- summary(a)$r.squared #coefficient of determination
result[2] <- summary(a)$coefficients[2,4] #p-value (significance level)
t.test.result <- t.test(rot.data[batch == batch.levels[1]],
rot.data[batch == batch.levels[2]], paired = FALSE)
result[3] <- t.test.result$p.value
result
}
#significance test for pcRegression (more than two levels)
correlate.fun_gen <- function(rot.data, batch){
#rot.data: some vector (numeric covariate)
#batch: some vector (categoric covariate)
a <- lm(rot.data ~ batch)
result <- numeric(2)
result[1] <- summary(a)$r.squared #coefficient of determination
F.test.result <- aov(rot.data ~ batch)
F.test.summary <- summary(F.test.result)
result[2] <- summary(a)$coefficients[2,4] #p-value (significance level)
result[3] <- F.test.summary[[1]]$'Pr(>F)'[1] #p-value of the one-way anova test
result
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.