#' Find Marker Genes from cell clusters.
#'
#' Try to identify marker genes across clusters performing Mann-Whitney U test.
#' DE genes are identified the expression in each cluster versus the all the other.
#'
#' @param data list; GFICF object
#' @param nt integer; Number of thread to use (default 2).
#' @param hvg boolean; Use only High Variable Genes (default is TRUE).
#' @param verbose boolean; Icrease verbosity.
#' @return The updated gficf object.
#' @import Matrix
#' @importFrom RcppParallel setThreadOptions
#'
#' @export
findClusterMarkers = function(data,nt=2,hvg=T,verbose=T)
{
if (is.null(data$community)) {stop("Please identify cluster first! Run clustcells function.")}
if (is.null(data$rawCounts)) {stop("No raw/normalized counts stored. You should have run gficf normalization with storeRaw = T")}
RcppParallel::setThreadOptions(numThreads = nt)
u = unique(data$embedded$cluster)
# Normalize if were not
cpms = normCounts(data$rawCounts,doc_proportion_max = 2,doc_proportion_min = 0,normalizeCounts = !data$param$normalized,verbose=verbose)
if (hvg)
{
tsmessage("... Detecting HVGs")
df = findVarGenes(cpms,fitMethod = "locfit",verbose = verbose)
df = df[order(df$FDR),]
ix = df$FDR<.1
tsmessage(paste("... Detected",sum(ix),"significant HVGs with an FDR < 10%"))
if(sum(ix)<=1000) {ix = 1:min(1000,nrow(df))}
cpms = cpms[df$gene[ix],]
rm(df)
}
cpms = as.matrix(cpms)
tsmessage("... Start identify marker genes")
res = NULL
#progress_for(n=0,tot = length(u),display = T)
for (i in 1:length(u)) {
cells.1 = which(data$embedded$cluster%in%u[i])
cells.2 = which(!data$embedded$cluster%in%u[i])
tmp = rcpp_parallel_WMU_test(matX = cpms[,cells.1],matY = cpms[,cells.2],printOutput = F)
tmp = data.frame(ens=rownames(cpms),log2FC=tmp[,2],p.value=tmp[,1],fdr=p.adjust(tmp[,1],method = "fdr"),stringsAsFactors = F)
tmp = subset(tmp,fdr<.05 & log2FC>0)
tmp = tmp[order(tmp$fdr,decreasing = F),]
tmp$cluster = u[i]
res = rbind(res,tmp)
#progress_for(n=i,tot = length(u),display = T)
}
res = res[order(res$log2FC,decreasing = T),]
rownames(res) = NULL
data$de.genes = res
return(data)
}
#' Find high variable genes following the approach
#' proposed by Chen et al. in BMC Genomics (2016)
#' Code adapted from https://github.com/hillas/scVEGs
#' @param data list; GFICF object
#' @param fitMethod charachter; Method to use to fit variance and mean expression relationship (loess or locfit).
#' @param verbose boolean; Increase verbosity.
#' @import Matrix
#' @importFrom locfit locfit locfit.robust lp
#' @importFrom MASS fitdistr
#'
findVarGenes = function(data,fitMethod="locfit",verbose=T)
{
m <- dim(data)[1]
std <- apply(data, 1, stats::sd)
avg <- Matrix::rowMeans(data)
cv <- std / avg
# over dispersion sigma (var = u(1 + u * sigma^2))
xdata <- (avg)
ydata <- log10(cv)
xdata <- xdata[is.na(ydata) != "TRUE"]
ydata <- ydata[is.na(ydata) != "TRUE"]
if (fitMethod=="loess"){
fitLoc <- stats::loess(formula = ydata ~ log10(x = xdata),data = data.frame("ydata"=ydata,"xdata"=xdata,stringsAsFactors = F),span = 0.8)
} else {
fitLoc <- locfit::locfit.robust(ydata ~ locfit::lp(log10(xdata), nn = .2))
}
xSeq <- seq(min(log10(xdata)), max(log10(xdata)), 0.005)
gapNum <- matrix(0, length(xSeq), 1)
for(i in 1:length(xSeq)) {
cdx <- which((log10(xdata) >= xSeq[i] - 0.05) & (log10(xdata) < xSeq[i] + 0.05))
gapNum[i,1] <- length(cdx)
}
cdx <- which(gapNum > m*0.005)
xSeq <- 10 ^ xSeq
ySeq <- predict(fitLoc,log10(xSeq))
yDiff <- diff(ySeq)
ix <- which(yDiff > 0 & log10(xSeq[-1]) > 0)
if(length(ix) == 0)
ix <- length(ySeq) - 1
xSeq_all <- 10^seq(min(log10(xdata)), max(log10(xdata)), 0.001)
xSeq <- xSeq[cdx[1]:ix[1] + 1]
ySeq <- ySeq[cdx[1]:ix[1] + 1]
b <- 1
a <- 0
df <- data.frame(x=xSeq, y = ySeq)
fit = stats::nls(y ~ 0.5 * log10(b / x + a), data = df, start=list(b = b,a = a), stats::nls.control(maxiter = 500), na.action = 'na.exclude')
newdf <- data.frame(x = xSeq_all)
ydataFit <- stats::predict(fit,newdata = newdf)
# Calculate CV difference
logX <- log10(xdata)
logXseq <- log10(xSeq_all)
cvDist <- matrix(0,length(xdata),1)
#progress_for(n=0,tot = length(logX),display = verbose)
for (i in 1:length(logX))
{
cx <- which(logXseq >= logX[i] - 0.2 & logXseq < logX[i] + 0.2)
tmp <- sqrt((logXseq[cx] - logX[i])^2 + (ydataFit[cx] - ydata[i])^2)
tx <- which.min(tmp)
if(logXseq[cx[tx]] > logX[i]) {
if(ydataFit[cx[tx]] > ydata[i]) {
cvDist[i] <- -1*tmp[tx]
} else {
cvDist[i] <- tmp[tx]
}
cvDist[i] <- -1*tmp[tx]
} else if (logXseq[cx[tx]] <= logX[i]) {
if(ydataFit[cx[tx]] < ydata[i]) {
cvDist[i] <- tmp[tx]
} else {
cvDist[i] <- -1*tmp[tx]
}
}
#progress_for(n=i,tot = length(logX),display = verbose)
}
cvDist <- log2(10^cvDist)
# use kernel density estimate to find the peak
dor <- stats::density(cvDist, kernel = "gaussian")
distMid <-dor$x[which.max(dor$y)]
dist2 <- cvDist - distMid
tmpDist <- c(dist2[dist2 <= 0], abs(dist2[dist2 < 0])) + distMid
distFit <- MASS::fitdistr(tmpDist, "normal")
res = data.frame(gene=rownames(data),
mean=avg,
"cv"=cv,
P=pnorm(cvDist, mean = distFit$estimate[1], sd = distFit$estimate[2], lower.tail = FALSE)
)
res$FDR <- stats::p.adjust(res$P, 'fdr')
return(res)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.