#trim bulk and single-cell data to contain the same genes
trimData<-function(Signature,bulkData){
Genes<-intersect(rownames(Signature),names(bulkData))
B<-bulkData[Genes]
S<-Signature[Genes,,drop=FALSE]
return(list("sig"=S,"bulk"=B))
}
#' Solver using OLS, constrained such that cell type numbers>0
#'
#' @param S Signature matrix
#' @param B One column of the bulk data matrix, so one sample
#' @param verbose Whether to produce an output on the console.
#'
#' @return A vector with the cell type proportions for the sample
#'
solveOLS <- function(S, B, verbose = FALSE) {
solution <- solveOLSInternal(S, B, verbose)
return(solution / sum(solution))
}
#' Solver using OLS, constrained such that cell type numbers>0
#'
#' Returns absolute numbers, not proportions
#'
#' @param S Signature matrix
#' @param B One column of the bulk data matrix, so one sample
#' @param verbose Whether to produce an output on the console.
#'
#' @return A vector with the cell type numbers for the sample
#'
solveOLSInternal <- function(S, B, verbose = FALSE) {
D <- t(S) %*% S
d <- t(S) %*% B
A <- cbind(diag(dim(S)[2]))
bzero <- c(rep(0, dim(S)[2]))
sc <- norm(D, "2")
solution <- tryCatch(
{
quadprog::solve.QP(D / sc, d / sc, A, bzero)$solution
},
error = function(cond) {
message(
"Error solving the quadratic programming problem. Your dataset might be too small. Run ",
"with verbose=TRUE to get the full error message."
)
if (verbose) {
stop(cond)
} else {
stop()
}
},
warning = function(cond) {
warning(cond)
}
)
names(solution) <- colnames(S)
return(solution)
}
#' Solver using WLS with weights dampened by a certain dampening constant
#'
#' @param S Signature matrix
#' @param B One column of the bulk data matrix, so one sample
#' @param verbose Whether to produce an output on the console.
#'
#' @return A vector with the cell type numbers for the sample
#'
solveDampenedWLS <- function(S, B, verbose = FALSE) {
# first solve OLS, use this solution to find a starting point for the weights
solution <- solveOLSInternal(S, B, verbose)
# now use dampened WLS, iterate weights until convergence
iterations <- 0
changes <- c()
# find dampening constant for weights using cross-validation
j <- findDampeningConstant(S, B, solution, verbose)
change <- 1
while (change > .01 & iterations < 1000) {
newsolution <- solveDampenedWLSj(S, B, solution, j, verbose)
# decrease step size for convergence
solutionAverage <-
rowMeans(cbind(newsolution, matrix(
solution,
nrow = length(solution), ncol = 4
)))
change <- norm(as.matrix(solutionAverage - solution))
solution <- solutionAverage
iterations <- iterations + 1
changes <- c(changes, change)
}
if (verbose) {
print(round(solution / sum(solution), 5))
}
return(solution / sum(solution))
}
#' Solve WLS given a dampening constant
#'
#' @param S Signature matrix
#' @param B One column of the bulk data matrix, so one sample
#' @param goldStandard The average of all the solutions so far
#' @param j The dampening constant
#' @param verbose Whether to produce an output on the console
#'
#' @return A vector with the cell type numbers for the sample
#'
solveDampenedWLSj <-
function(S, B, goldStandard, j, verbose = FALSE) {
multiplier <- 1 * 2^(j - 1)
sol <- goldStandard
ws <- as.vector((1 / (S %*% sol))^2)
wsScaled <- ws / min(ws)
wsDampened <- wsScaled
wsDampened[which(wsScaled > multiplier)] <- multiplier
W <- diag(wsDampened)
D <- t(S) %*% W %*% S
d <- t(S) %*% W %*% B
A <- cbind(diag(dim(S)[2]))
bzero <- c(rep(0, dim(S)[2]))
sc <- norm(D, "2")
solution <- tryCatch(
{
quadprog::solve.QP(D / sc, d / sc, A, bzero)$solution
},
error = function(cond) {
message(
"Error solving the quadratic programming problem. Your dataset might be too small. Run ",
"with verbose=TRUE to get the full error message."
)
if (verbose) {
stop(cond)
} else {
stop()
}
},
warning = function(cond) {
warning(cond)
}
)
names(solution) <- colnames(S)
return(solution)
}
#' Finding a dampening constant for the weights using cross-validation
#'
#' @param S Signature matrix
#' @param B One column of the bulk data matrix, so one sample
#' @param goldStandard The solution found with OLS
#' @param verbose Whether to produce an output on the console
#'
#' @return The dampening constant (integer)
findDampeningConstant <- function(S, B, goldStandard, verbose = FALSE) {
solutionsSd <- NULL
# goldStandard is used to define the weights
sol <- goldStandard
ws <- as.vector((1 / (S %*% sol))^2)
wsScaled <- ws / min(ws)
wsScaledMinusInf <- wsScaled
# ignore infinite weights
if (max(wsScaled) == "Inf") {
wsScaledMinusInf <- wsScaled[-which(wsScaled == "Inf")]
}
# try multiple values of the dampening constant (multiplier)
# for each, calculate the variance of the dampened weighted solution for a subset of genes
for (j in 1:ceiling(log2(max(wsScaledMinusInf)))) {
multiplier <- 1 * 2^(j - 1)
wsDampened <- wsScaled
wsDampened[which(wsScaled > multiplier)] <- multiplier
solutions <- NULL
seeds <- c(1:100)
for (i in 1:100) {
set.seed(seeds[i]) # make nondeterministic
subset <-
sample(length(ws), size = length(ws) * 0.5) # randomly select half of gene set
# solve dampened weighted least squares for subset
fit <- stats::lm(B[subset] ~ -1 + S[subset, , drop = FALSE], weights = wsDampened[subset])
sol <- fit$coef * sum(goldStandard) / sum(fit$coef)
solutions <- cbind(solutions, sol)
}
solutionsSd <-
cbind(solutionsSd, apply(solutions, 1, stats::sd))
}
# choose dampening constant that results in least cross-validation variance
j <- which.min(colMeans(solutionsSd^2))
return(j)
}
#' Solver using SVR
#'
#' @param S Signature matrix
#' @param B One column of the bulk data matrix, so one sample
#' @param verbose Whether to produce an output on the console
#'
#' @return A vector with the cell type proportions for the sample
#'
solveSVR <- function(S, B, verbose = FALSE) {
# scaling
ub <- max(c(as.vector(S), B)) # upper bound
lb <- min(c(as.vector(S), B)) # lower bound
Bs <- ((B - lb) / ub) * 2 - 1
Ss <- ((S - lb) / ub) * 2 - 1
# perform SVR
model <-
e1071::svm(
Ss,
Bs,
nu = 0.5,
scale = TRUE,
type = "nu-regression",
kernel = "linear",
cost = 1
)
coef <- t(model$coefs) %*% model$SV
coef[which(coef < 0)] <- 0
coef <- as.vector(coef)
names(coef) <- colnames(S)
if (verbose) {
print(round(coef / sum(coef), 5))
}
return(coef / sum(coef))
}
#' Performing DE analysis using Seurat
#'
#' When path = NULL, the generated files in the processes will not be saved and output
#'
#' @param scdata The single cell data matrix
#' @param id A Vector of the cell type annotations
#' @param path OPTIONAL path for saving generated files
#' @param verbose Whether to produce an output on the console
#'
#' @return List with the differentially expressed genes for each cell type
#'
DEAnalysis <- function(scdata, id, path = NULL, verbose = FALSE) {
uniqueIds <- unique(id)
list.DE.group <- as.list(rep(0, length(uniqueIds)))
expr_obj <-
Seurat::CreateSeuratObject(raw.data = as.data.frame(scdata), project = "DE")
expr_obj2 <- Seurat::SetIdent(expr_obj, ident.use = as.vector(id))
if (verbose) {
print("Calculating differentially expressed genes:")
}
for (i in uniqueIds) {
de_group <-
Seurat::FindMarkers(
object = expr_obj2,
ident.1 = i,
ident.2 = NULL,
only.pos = TRUE,
test.use = "bimod"
)
index <- which(uniqueIds == i)
list.DE.group[[index]] <- de_group
if (!is.null(path)) {
save(de_group, file = paste(path, "/de_", i, ".RData", sep = ""))
}
}
return(list.DE.group)
}
#' Building the signature matrix using Seurat
#'
#' When path = NULL, the generated files in the processes will not be saved and output.
#'
#' @param scdata The single cell data matrix
#' @param id A Vector of the cell type annotations
#' @param path OPTIONAL path for saving generated files
#' @param verbose Whether to produce an output on the console
#' @param diff.cutoff The FC cutoff
#' @param pval.cutoff The pValue cutoff
#'
#' @return The computed signature matrix
#'
buildSignatureMatrixUsingSeurat <- function(scdata,
id,
path = NULL,
verbose = FALSE,
diff.cutoff = 0.5,
pval.cutoff = 0.01) {
# perform differential expression analysis
list.DE.groups <- DEAnalysis(scdata, id, path, verbose)
numberofGenes <- c()
uniqueIds <- unique(id)
for (i in uniqueIds) {
if (file.exists(paste(path, "/de_", i, ".RData", sep = ""))) {
load(file = paste(path, "/de_", i, ".RData", sep = ""))
} else {
index <- which(uniqueIds == i)
de_group <- list.DE.groups[[index]]
}
DEGenes <-
rownames(de_group)[intersect(
which(de_group$p_val_adj < pval.cutoff),
which(de_group$avg_logFC > diff.cutoff)
)]
nonMir <- grep("MIR|Mir", DEGenes, invert = TRUE)
assign(
paste("cluster_lrTest.table.", i, sep = ""),
de_group[which(rownames(de_group) %in% DEGenes[nonMir]), ]
)
numberofGenes <- c(numberofGenes, length(DEGenes[nonMir]))
}
# need to reduce number of genes
# for each subset, order significant genes by decreasing fold change,
# choose between 50 and 200 genes
# choose matrix with lowest condition number
conditionNumbers <- c()
for (g in 50:200) {
Genes <- c()
j <- 1
for (i in uniqueIds) {
if (numberofGenes[j] > 0) {
temp <- paste("cluster_lrTest.table.", i, sep = "")
temp <- get(temp)
temp <- temp[order(temp$p_val_adj, decreasing = TRUE), ]
Genes <-
c(Genes, (rownames(temp)[1:min(g, numberofGenes[j])]))
}
j <- j + 1
}
Genes <- unique(Genes)
# make signature matrix
ExprSubset <- scdata[Genes, , drop = FALSE]
Sig <- NULL
for (i in uniqueIds) {
Sig <-
cbind(Sig, (apply(ExprSubset, 1, function(y) {
mean(y[which(id == i)])
})))
}
colnames(Sig) <- uniqueIds
conditionNumbers <- c(conditionNumbers, kappa(Sig))
}
# g is optimal gene number
g <- which.min(conditionNumbers) + min(49, numberofGenes - 1)
Genes <- c()
j <- 1
for (i in uniqueIds) {
if (numberofGenes[j] > 0) {
temp <- paste("cluster_lrTest.table.", i, sep = "")
temp <- get(temp)
temp <- temp[order(temp$p_val_adj, decreasing = TRUE), ]
Genes <-
c(Genes, (rownames(temp)[1:min(g, numberofGenes[j])]))
}
j <- j + 1
}
Genes <- unique(Genes)
ExprSubset <- scdata[Genes, , drop = FALSE]
Sig <- NULL
for (i in uniqueIds) {
Sig <-
cbind(Sig, (apply(ExprSubset, 1, function(y) {
mean(y[which(id == i)])
})))
}
colnames(Sig) <- uniqueIds
rownames(Sig) <- Genes
if (!is.null(path)) {
save(Sig, file = paste(path, "/Sig.RData", sep = ""))
}
return(Sig)
}
# Functions needed for DE
Mean.in.log2space <- function(x, pseudo.count) {
return(log2(mean(2^(x) - pseudo.count) + pseudo.count))
}
stat.log2 <- function(data.m, group.v, pseudo.count) {
# data.m=data.used.log2
log2.mean.r <-
stats::aggregate(t(data.m), list(as.character(group.v)), function(x) {
Mean.in.log2space(x, pseudo.count)
})
log2.mean.r <- t(log2.mean.r)
colnames(log2.mean.r) <-
paste("mean.group", log2.mean.r[1, ], sep = "")
log2.mean.r <- log2.mean.r[-1, ]
log2.mean.r <- as.data.frame(log2.mean.r)
log2.mean.r <- varhandle::unfactor(log2.mean.r) # from varhandle
log2.mean.r[, 1] <- as.numeric(log2.mean.r[, 1])
log2.mean.r[, 2] <- as.numeric(log2.mean.r[, 2])
log2_foldchange <- log2.mean.r$mean.group1 - log2.mean.r$mean.group0
results <- data.frame(cbind(
log2.mean.r$mean.group0,
log2.mean.r$mean.group1,
log2_foldchange
))
colnames(results) <- c("log2.mean.group0", "log2.mean.group1", "log2_fc")
rownames(results) <- rownames(log2.mean.r)
return(results)
}
v.auc <- function(data.v, group.v) {
prediction.use <- ROCR::prediction(data.v, group.v, 0:1)
perf.use <- ROCR::performance(prediction.use, "auc")
auc.use <- round(perf.use@y.values[[1]], 3)
return(auc.use)
}
m.auc <- function(data.m, group.v) {
AUC <- apply(data.m, 1, function(x) {
v.auc(x, group.v)
})
AUC[is.na(AUC)] <- 0.5
return(AUC)
}
#' Performing DE analysis using mast
#'
#' When path = NULL, the generated files in the processes will not be saved and output.
#'
#' @param scdata The single cell data matrix
#' @param id A Vector of the cell type annotations
#' @param path OPTIONAL path for saving generated files
#' @param verbose Whether to produce an output on the console.
#'
#' @return A list with the cell types and their differentially expressed genes
#'
DEAnalysisMAST <- function(scdata, id, path, verbose = FALSE) {
uniqueIds <- unique(id)
list_lrTest.table <- as.list(rep(0, length(uniqueIds)))
pseudo.count <- 0.1
data.used.log2 <- log2(scdata + pseudo.count)
colnames(data.used.log2) <- make.unique(colnames(data.used.log2))
diff.cutoff <- 0.5
for (i in uniqueIds) {
cells.symbol.list2 <- colnames(data.used.log2)[which(id == i)]
cells.coord.list2 <- match(cells.symbol.list2, colnames(data.used.log2))
cells.symbol.list1 <- colnames(data.used.log2)[which(id != i)]
cells.coord.list1 <- match(cells.symbol.list1, colnames(data.used.log2))
data.used.log2.ordered <-
cbind(data.used.log2[, cells.coord.list1], data.used.log2[, cells.coord.list2])
group.v <-
c(rep(0, length(cells.coord.list1)), rep(1, length(cells.coord.list2)))
# ouput
log2.stat.result <-
stat.log2(data.used.log2.ordered, group.v, pseudo.count)
Auc <- m.auc(data.used.log2.ordered, group.v)
bigtable <- data.frame(cbind(log2.stat.result, Auc))
de <- bigtable[bigtable$log2_fc > diff.cutoff, ]
if (verbose) {
dim(de)
}
if (dim(de)[1] > 1) {
data.1 <- data.used.log2[, cells.coord.list1, drop = FALSE]
data.2 <- data.used.log2[, cells.coord.list2, drop = FALSE]
genes.list <- rownames(de)
log2fold_change <- cbind(genes.list, de$log2_fc)
colnames(log2fold_change) <- c("gene.name", "log2fold_change")
counts <- as.data.frame(cbind(data.1[genes.list, ], data.2[genes.list, ]))
groups <- c(
rep("Cluster_Other", length(cells.coord.list1)),
rep(i, length(cells.coord.list2))
)
groups <- as.character(groups)
data_for_MIST <-
verbose_wrapper(verbose)(as.data.frame(cbind(
rep(rownames(counts), dim(counts)[2]),
reshape::melt(counts),
rep(groups, each = dim(counts)[1]),
rep(1, dim(counts)[1] * dim(counts)[2])
)))
colnames(data_for_MIST) <- c(
"gene",
"Subject.ID",
"Et",
"Population",
"Number.of.Cells"
)
vbeta <- data_for_MIST
vbeta.fa <-
verbose_wrapper(verbose)(
MAST::FromFlatDF(
vbeta,
idvars = c("Subject.ID"),
primerid = "gene",
measurement = "Et",
ncells = "Number.of.Cells",
geneid = "gene",
cellvars = c("Number.of.Cells", "Population"),
phenovars = c("Population"),
id = "vbeta all"
)
)
vbeta.1 <- subset(vbeta.fa, Number.of.Cells == 1)
# .3 mast
# utils::head(SummarizedExperiment::colData(vbeta.1)) ??
zlm.output <-
verbose_wrapper(verbose)(MAST::zlm(
~Population,
vbeta.1,
method = "bayesglm",
ebayes = TRUE
))
if (verbose) {
methods::show(zlm.output)
coefAndCI <- summary(zlm.output, logFC = TRUE)
print(coefAndCI)
}
zlm.lr <-
verbose_wrapper(verbose)(MAST::lrTest(zlm.output, "Population"))
zlm.lr_pvalue <- reshape::melt(zlm.lr[, , "Pr(>Chisq)"])
zlm.lr_pvalue <-
zlm.lr_pvalue[which(zlm.lr_pvalue$test.type == "hurdle"), ]
lrTest.table <-
merge(zlm.lr_pvalue, de, by.x = "primerid", by.y = "row.names")
colnames(lrTest.table) <-
c(
"gene",
"test.type",
"p_value",
paste("log2.mean.", "Cluster_Other", sep = ""),
paste("log2.mean.", i, sep = ""),
"log2fold_change",
"Auc"
)
cluster_lrTest.table <-
lrTest.table[rev(order(lrTest.table$Auc)), ]
# . 4 save results
index <- which(uniqueIds == i)
list_lrTest.table[[index]] <- cluster_lrTest.table
if (!is.null(path)) {
utils::write.csv(cluster_lrTest.table,
file = paste(path, "/", i, "_lr_test.csv", sep = "")
)
save(cluster_lrTest.table,
file = paste(path, "/", i, "_mist.RData", sep = "")
)
}
}
}
return(list_lrTest.table)
}
#' #' Building the signature matrix using mast
#'
#' When path = NULL, the generated files in the processes will not be saved and output.
#'
#' @param scdata The single cell data matrix
#' @param id A Vector of the cell type annotations
#' @param path OPTIONAL path for saving generated files
#' @param verbose Whether to produce an output on the console.
#' @param diff.cutoff The FC cutoff
#' @param pval.cutoff The pValue cutoff
#'
#' @return The computed signature matrix
#'
buildSignatureMatrixMAST <- function(scdata,
id,
path = NULL,
verbose = FALSE,
diff.cutoff = 0.5,
pval.cutoff = 0.01) {
# compute differentially expressed genes for each cell type
list.cluster.table <-
DEAnalysisMAST(scdata, id, path, verbose = verbose)
# for each cell type, choose genes in which FDR adjusted p-value is less than 0.01 and the
# estimated fold-change is greater than 0.5
numberofGenes <- c()
uniqueIds <- unique(id)
for (i in uniqueIds) {
if (file.exists(paste(path, "/", i, "_MIST.RData", sep = ""))) {
load(file = paste(path, "/", i, "_MIST.RData", sep = ""))
} else {
index <- which(uniqueIds == i)
cluster_lrTest.table <- list.cluster.table[[index]]
}
pvalue_adjusted <-
stats::p.adjust(
cluster_lrTest.table[, 3],
method = "fdr",
n = length(cluster_lrTest.table[, 3])
)
cluster_lrTest.table <-
cbind(cluster_lrTest.table, pvalue_adjusted)
DEGenes <-
cluster_lrTest.table$gene[intersect(
which(pvalue_adjusted < pval.cutoff),
which(cluster_lrTest.table$log2fold_change > diff.cutoff)
)]
# because Mir gene is usually not accurate
nonMir <- grep("MIR|Mir", DEGenes, invert = TRUE)
assign(
paste("cluster_lrTest.table.", i, sep = ""),
cluster_lrTest.table[which(cluster_lrTest.table$gene %in% DEGenes[nonMir]), ]
)
numberofGenes <- c(numberofGenes, length(DEGenes[nonMir]))
}
# need to reduce number of genes
# for each subset, order significant genes by decreasing fold change,
# choose between 50 and 200 genes
# for each, iterate and choose matrix with lowest condition number
conditionNumbers <- c()
for (g in 50:200) {
Genes <- c()
j <- 1
for (i in uniqueIds) {
if (numberofGenes[j] > 0) {
temp <- paste("cluster_lrTest.table.", i, sep = "")
temp <- get(temp)
temp <-
temp[order(temp$log2fold_change, decreasing = TRUE), ]
Genes <-
c(Genes, varhandle::unfactor(temp$gene[1:min(g, numberofGenes[j])]))
}
j <- j + 1
}
Genes <- unique(Genes)
# make signature matrix
ExprSubset <- scdata[Genes, , drop = FALSE]
Sig <- NULL
for (i in uniqueIds) {
Sig <-
cbind(Sig, (apply(ExprSubset, 1, function(y) {
mean(y[which(id == i)])
})))
}
colnames(Sig) <- uniqueIds
conditionNumbers <- c(conditionNumbers, kappa(Sig))
}
# g is optimal gene number
g <- which.min(conditionNumbers) + min(49, numberofGenes - 1)
Genes <- c()
j <- 1
for (i in uniqueIds) {
if (numberofGenes[j] > 0) {
temp <- paste("cluster_lrTest.table.", i, sep = "")
temp <- get(temp)
temp <- temp[order(temp$log2fold_change, decreasing = TRUE), ]
Genes <-
c(Genes, varhandle::unfactor(temp$gene[1:min(g, numberofGenes[j])]))
}
j <- j + 1
}
Genes <- unique(Genes)
ExprSubset <- scdata[Genes, , drop = FALSE]
Sig <- NULL
for (i in uniqueIds) {
Sig <-
cbind(Sig, (apply(ExprSubset, 1, function(y) {
mean(y[which(id == i)])
})))
}
colnames(Sig) <- uniqueIds
rownames(Sig) <- Genes
if (!is.null(path)) {
save(Sig, file = paste(path, "/Sig.RData", sep = ""))
}
return(Sig)
}
#' A wrapper function whether to suppress messages
#'
#' @param verbose Whether to produce an output on the console.
#'
#' @return A function which will suppress messages or not, depending on the verbose parameter
#'
verbose_wrapper <- function(verbose) {
return(function(method) {
if (!verbose) {
suppressMessages(method)
} else {
method
}
})
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.