#' sanity check for datasets
#'
#' Will stop and return error message if rows have constant variance,
#' or if they have differing numbers of rows, or contain NA values.
#'
#' @param ctrl matrix of control data
#' @param expm matrix of experiment data
#'
#' @return error message if error, otherwise silent
check_datasets <- function(ctrl, expm){
if (dim(ctrl)[1] != dim(expm)[1]){
stop('Control expression data must have equal number of genes as experiment expression data!')
}
if (any(is.na(ctrl)) || any(is.na(expm))){
stop('Control expression data and experiment expression data have to be real numbers. NA was found!')
}
constantThreshold <- 1e-5
ctrlConstantGenes <- diag(var(t(ctrl))) < constantThreshold
expmConstantGenes <- diag(var(t(expm))) < constantThreshold
if (any(ctrlConstantGenes)){
errMes <- sprintf('%s row(s) in control expression data are constant. Consider Removing the row(s).',
paste(as.character(which(ctrlConstantGenes)), collapse=','))
stop(errMes, call. = FALSE)
} else if(any(expmConstantGenes)){
errMes <- sprintf('%s row(s) in experiment expression data are constant. Consider Removing the row(s).',
paste(as.character(which(expmConstantGenes)), collapse=','))
stop(errMes, call. = FALSE)
}
}
#' Characteristic Direction
#'
#' Calculates the characteristic direction for a phenotypic dataset
#'
#' @param ctrl matrix of control data
#' @param expm matrix of experiment (perturbed) data
#' @param samples vector of drug names
#' @param r numeric between 0 and 1. Regularised term, a parameter that smooths
#' the covariance matrix and reduces potential noise in the dataset. The
#' default value is 0, no regularisation.
#'
#' @return vector of n-components representing the characteristic direction of
#' the dataset. \code{n} equals the number of features in the dataset.
#' b is also a matrix object, and is sorted by its components' absolute
#' values in descending order.
#' @export
chdir <- function(ctrl, expm, samples, r = 1){
# check the datasets for constant variance, differing numbers of rows
# or presence of NA values
check_datasets(ctrl, expm)
# place control gene expression data and experiment gene expression data into
# one matrix
combinedData <- cbind(ctrl, expm)
# get the number of samples, namely, the total number of replicates in control
# and experiment.
samplesCount <- dim(combinedData)[2]
# the number of output components desired from PCA. We only want to calculate
# the chdir in a subspace that capture most variance in order to save computation
# workload. The number is set 20 because considering the number of genes usually
# present in an expression matrix 20 components would capture most of the variance.
componentsCount <- min(c(samplesCount - 1, 20))
# use the nipals PCA algorithm to calculate R, V, and pcvars. nipals algorithm
# has better performance than the algorithm used by R's builtin PCA function.
# R are scores and V are coefficients or loadings. pcvars are the variances
# captured by each component
pcaRes <- nipals(t(combinedData), componentsCount, 1e5)
R <- pcaRes$T # PCA scores
V <- pcaRes$P # PCA loadings
pcvars <- pcaRes$pcvar
# we only want components that capture 95% of the total variance or a little above.
# cutIdx is the index of the component, within which the variance is just equal
# to or a little greater than 95% of the total.
cutIdx <- which(cumsum(pcvars) > 0.95)
if (length(cutIdx) == 0){
cutIdx <- componentsCount
} else {
cutIdx <- cutIdx[1]
}
# slice R and V to only that number of components.
R <- R[, 1:cutIdx]
V <- V[, 1:cutIdx]
# the difference between experiment mean and control mean.
meanvec <- rowMeans(expm) - rowMeans(ctrl)
# shruken covariance matrix as denominator for LDA
shrunkMats <- shrink_mat(R, samplesCount, r)
# The LDA formula.
# V%*%solve(shrunkMats)%*%t(V) transforms the covariance matrix from the subspace to full space.
b <- V %*% solve(shrunkMats) %*% t(V) %*% meanvec
# normlize b to unit vector
b <- b * as.vector(sqrt(1 / t(b) %*% b))
# sort b to by its components' absolute value in decreasing order and get the
# sort index
sortRes <- sort(abs(b), decreasing = TRUE, index.return = TRUE)
# sort b by the sort index
bSorted <- as.matrix(b[sortRes$ix])
# sort genes by the sort index
samplesSorted <- samples[sortRes$ix]
# assign genesSorted as the row names of bSorted
rownames(bSorted) <- samplesSorted
return(bSorted)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.