#' This function analyzes ST data based on clustering generated by cluster_counts_OL.
#' It calculates a list of genes specific to a certain cluster and identifies differentually expressed genes
#' across clusters
#'
#' @param counts non-negative numeric matrix containing gene counts,
#' rows correspond to spots, columns correspond to genes
#' @param ids data frame or matrix assigning spatial coordinates to the spots (see process_input() for details)
#' @param clustering list containing clustering information as returned by cluster_counts_OL (numeric vector assigning spots to clusters)
#' @param sig.level significance level for the t-test between clusters; genes with p-values above this threshold
#' will be removed from output; default 0.05
#' @param lasso.data output of build_lassos function
#' @param deg.weight numeric, weight of the ranking of genes by differential gene expression in the
#' overall ranking; default 2
#' @param lls.weight numeric, weight of the ranking of genes by lls fit in the
#' overall ranking; default 3
#' @param entropy.weightnumeric, weight of the ranking of genes by entropy in the
#' overall ranking; default 1
#' @param build.lasso logical, should lasso models be calculated? only used if lasso.data is NULL, default FALSE
#' @param gamma numeric, gamma value passed to build_lassos. Determines ration of sparsity and smoothness in the model, default 3
#' @param ncores integer, number of cores used for lasso calculation; default 4
#' @param reduce logical, default TRUE. Return only genes passing certain thresholds in each ranking metric? Large runtime improvement in case build.lasso is TRUE.
#' @param normalize logical, should counts be normalized with sctransform prior to differential expression testing? default FALSE
#' @param verbose logical, default TRUE
#' @return list with 2 entries:\cr
#' 1) differential genes - data frame containing information (name, ckuster, p-value, up-/downregulation) for differentially expressed genes. Ordered by gene rank.\cr
#' 2) lasso_data
#' @export
analyze_clustering <- function(counts, ids, clustering, sig.level = 0.001, lasso.data = NULL, deg.weight = 2, lls.weight = 3, entropy.weight = 1, build.lasso = FALSE, gamma = 3, ncores = 4, reduce = TRUE, normalize = FALSE, verbose = TRUE){
# parameter checks
if(verbose) cat("calculating gene-wise entropy...\t")
# find cluster-specific genes by calculating total RNA per cluster and gene
# use the lasso model if available
if(is.null(lasso.data)){
cluster.libs <- matrix(0, ncol=ncol(counts), nrow = length(unique(clustering)))
colnames(cluster.libs) <- colnames(counts)
rownames(cluster.libs) <- unique(clustering)
for(cl in rownames(cluster.libs)){
cluster.libs[cl,] <- colSums(counts[which(clustering == as.numeric(cl)),,drop=F])
}
}else{
cluster.libs <- matrix(0, ncol=ncol(lasso.data$counts), nrow = length(unique(clustering)))
colnames(cluster.libs) <- colnames(lasso.data$counts)
rownames(cluster.libs) <- unique(clustering)
# clustering is by default in the same order as columns in counts
# lasso order is probably different
for(cl in rownames(cluster.libs)){
names(clustering) <- rownames(counts)
cluster.libs[cl,] <- colSums(lasso.data$counts[which(clustering[rownames(lasso.data$counts)] == as.numeric(cl)),,drop=F])
}
}
# cannot handle negative expression for entropy
# shift expressions if necessary
if(any(cluster.libs < 0)){
cluster.libs <- cluster.libs - min(cluster.libs)
}
# reduce to genes that are not 0 overall
cluster.libs <- cluster.libs[, which(colSums(cluster.libs) > 0)]
# scale to 1
cluster.libs <- sweep(cluster.libs, 2, colSums(cluster.libs), "/")
# create entropy vector (entropy for each spot)
specific <- sapply(colnames(cluster.libs), function(x){
if(! sum(cluster.libs[,x]) > 0) return(NA)
# if there are 0 entries for a gene, these need to be excluded
if(any(cluster.libs[,x] == 0)){
entropy <- -sum(cluster.libs[-which(cluster.libs[,x] == 0), x] * log2(cluster.libs[-which(cluster.libs[,x] == 0), x]))
}else{
entropy <- -sum(cluster.libs[, x] * log2(cluster.libs[, x]))
}
return(entropy)
})
if(verbose) cat("done\n")
names(specific) <- colnames(cluster.libs)
# remove NA genes
if(any(is.na(specific))){
specific <- specific[-which(is.na(specific))]
}
pdf("entropies.pdf")
hist(specific, breaks = 100)
dev.off()
#if(reduce){
# # keep only genes with entropy smaller or equal to the mean value
# specific <- specific[which(specific <= summary(specific)[4])]
# if(verbose) cat(length(specific), " genes passed entropy threshold\n", sep = "")
#}else{
# if(verbose) cat(length(which(specific <= summary(specific)[3])), " genes below mean entropy\n", sep = "")
#}
if(verbose) cat("testing for differential gene expression...\t")
# implement testing with multtest for differentially expressed genes
# test each cluster against all other clusters at the same time
# normalize counts if required
if(normalize && !any(counts < 0)){
deg.counts <- normalize_counts(counts)
}else{
deg.counts <- counts
}
test.results <- list()
p.vals <- sapply(colnames(deg.counts), function(g, count.mat = deg.counts, labs = as.factor(clustering)){
summary(aov(count.mat[, g] ~ labs))[[1]][5][1,1]
})
# if test not possible. set the genes p-value to 1 (max)
if(any(is.na(p.vals))){
p.vals[is.na(p.vals)] <- 1
}
if(any(is.nan(p.vals))){
p.vals[is.nan(p.vals)] <- 1
}
# sort and store
p.vals <- sort(p.vals)
if(verbose) cat("done\n")
# right now all.genes is actually all genes, but if a significance cutoff
# were used above, this would be needed
#all.genes <- as.vector(sapply(test.results, function(x){names(x)}))
all.genes <- names(p.vals)
if(any(is.na(all.genes)))
all.genes <- all.genes[!is.na(all.genes)]
all.genes <- unique(all.genes)
if(verbose) {
if(reduce) cat("Assign genes to clusters and remove insignificant genes")
else cat("Assign genes to clusters")
}
# sort the clustering vector to have consistent order
clusters <- sort(unique(clustering))
if(reduce){
p.vals <- p.vals[p.vals <= sig.level]
}
# reduced information to minimum p-value, corresponding cluster, gene name and up/downregulation info
gene.info <- c()
for(i in 1:length(p.vals)){
g <- names(p.vals)[i]
p <- p.vals[i]
# assign genes to downregulated clusters only if that gene's mean expression in the
# remaining clusters is closer to the expression in the upregulated than in the
# downregulated cluster
regulations <- sapply(clusters, function(cl){mean(counts[which(clustering == cl),g]) - mean(counts[which(clustering != cl), g])})
cl <- clusters[which.max(regulations)]
reg <- 1
gene.info <- rbind(gene.info,
c(g, cl, p, reg)
)
}
# coerce to data frame, name columns and adjust variable types
gene.info <- as.data.frame(gene.info)
colnames(gene.info) <- c("gene", "cluster", "pVal", "regulation")
gene.info$pVal <- as.numeric(as.character(gene.info$pVal))
gene.info$regulation <- as.numeric(as.character(gene.info$regulation))
rownames(gene.info) <- as.character(gene.info$gene)
if(verbose){
cat("done\n")
if(reduce) cat(nrow(gene.info), " genes passed the significance threshold\n", sep = "")
else cat(sum(gene.info$pVal < sig.level), " genes below significance threshold\n", sep = "")
}
gene.info <- filter_genes(
gene.info,
specific,
lasso.data,
deg.weight = deg.weight,
lls.weight = lls.weight,
entropy.weight = entropy.weight,
build.lasso = build.lasso,
ncores = ncores,
gamma = gamma,
counts = counts,
ids = ids,
reduce = reduce,
verbose = verbose
)
if(is.null(gene.info)) return(NULL)
return(list(
differential_genes = gene.info$diff.genes,
lasso_data = gene.info$lasso
))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.