#' @import stats grDevices graphics
NULL
#' @title Filter a counts matrix
#' @description Filter a counts matrix based on gene (row) and cell (column)
#' requirements.
#' @param counts A sparse read count matrix. The rows correspond to genes,
#' columns correspond to individual cells
#' @param min.lib.size Minimum number of genes detected in a cell. Cells with
#' fewer genes will be removed (default: 300)
#' @param max.lib.size Maximum number of genes detected in a cell. Cells with
#' more genes will be removed (default: 8000)
#' @param min.reads Minimum number of reads per gene. Genes with fewer reads
#' will be removed (default: 1)
#' @param min.detected Minimum number of cells a gene must be seen in. Genes
#' not seen in a sufficient number of cells will be removed (default: 1)
#' @param verbose Verbosity (default: TRUE)
#' @return a filtered read count matrix
#' @examples
#' data(pbmcA)
#' dim(pbmcA)
#' mat <- cleanCounts(pbmcA)
#' dim(mat)
#' @export
#' @importFrom Matrix Matrix colSums rowSums
cleanCounts <- function(
counts,
min.lib.size = 300,
max.lib.size = 8000,
min.reads = 1,
min.detected = 1,
verbose = TRUE
) {
if (!class(counts) %in% c("dgCMatrix", "dgTMatrix")) {
if(verbose) {
message('Converting to sparse matrix ...')
}
counts <- Matrix::Matrix(counts, sparse = TRUE)
}
if (verbose) {
message('Filtering matrix with ', ncol(counts), ' cells and ', nrow(counts), ' genes ...')
}
## filter out low-gene cells or potential doublets
ix_col <- Matrix::colSums(counts)
ix_col <- ix_col > min.lib.size & ix_col < max.lib.size
counts <- counts[, ix_col]
## remove genes that don't have many reads
counts <- counts[Matrix::rowSums(counts) > min.reads, ]
## remove genes that are not seen in a sufficient number of cells
counts <- counts[Matrix::rowSums(counts > 0) > min.detected, ]
if (verbose) {
message('Resulting matrix has ', ncol(counts), ' cells and ', nrow(counts), ' genes')
}
return(counts)
}
#' Normalizes counts to CPM
#'
#' Normalizes raw counts to log10 counts per million with pseudocount
#'
#' @param counts Read count matrix. The rows correspond to genes, columns
#' correspond to individual cells
#' @param depthScale Depth scaling. Using a million for CPM (default: 1e6)
#' @param verbose Verbosity (default: TRUE)
#'
#' @return a normalized matrix
#'
#' @examples {
#' data(pbmcA)
#' cd <- pbmcA[, 1:500]
#' mat <- normalizeCounts(cd)
#' }
#'
#' @export
#' @importFrom Matrix Matrix colSums t
normalizeCounts <- function(counts, depthScale=1e6, verbose=TRUE) {
if (!class(counts) %in% c("dgCMatrix", "dgTMatrix")) {
if(verbose) {
message('Converting to sparse matrix ...')
}
counts <- Matrix::Matrix(counts, sparse = TRUE)
}
if (verbose) {
message('Normalizing matrix with ', ncol(counts), ' cells and ', nrow(counts), ' genes')
}
counts <- Matrix::t(Matrix::t(counts) / Matrix::colSums(counts))
counts <- counts * depthScale
return(counts)
}
#' Normalize gene expression variance relative to transcriptome-wide expectations
#' (Modified from SCDE/PAGODA2 code)
#'
#' Normalizes gene expression magnitudes to with respect to its ratio to the
#' transcriptome-wide expectation as determined by local regression on expression magnitude
#'
#' @param counts Read count matrix. The rows correspond to genes, columns correspond to individual cells
#' @param gam.k Generalized additive model parameter; the dimension of the basis used to represent the smooth term (default: 5)
#' @param alpha Significance threshold (default: 0.05)
#' @param plot Whether to plot the results (default: FALSE)
#' @param use.unadjusted.pvals If true, will apply BH correction (default: FALSE)
#' @param do.par Whether to adjust par for plotting if plotting (default: TRUE)
#' @param max.adjusted.variance Ceiling on maximum variance after normalization to prevent infinites (default: 1e3)
#' @param min.adjusted.variance Floor on minimum variance after normalization (default: 1e-3)
#' @param verbose Verbosity (default: TRUE)
#' @param details If true, will return data frame of normalization parameters. Else will return normalized matrix.(default: FALSE)
#'
#' @return If details is true, will return data frame of normalization parameters. Else will return normalized matrix.
#'
#' @examples {
#' data(pbmcA)
#' cd <- pbmcA[, 1:500]
#' mat <- cleanCounts(cd)
#' mat <- normalizeVariance(mat)
#' }
#'
#' @importFrom mgcv s
#'
#' @export
#'
normalizeVariance <- function(counts, gam.k=5, alpha=0.05, plot=FALSE, use.unadjusted.pvals=FALSE, do.par=TRUE, max.adjusted.variance=1e3, min.adjusted.variance=1e-3, verbose=TRUE, details=FALSE) {
if (!class(counts) %in% c("dgCMatrix", "dgTMatrix")) {
if(verbose) {
message('Converting to sparse matrix ...')
}
counts <- Matrix::Matrix(counts, sparse = TRUE)
}
mat <- Matrix::t(counts) ## make rows as cells, cols as genes
if(verbose) {
print("Calculating variance fit ...")
}
dfm <- log(Matrix::colMeans(mat))
dfv <- log(apply(mat, 2, var))
names(dfm) <- names(dfv) <- colnames(mat)
df <- data.frame(m=dfm, v=dfv)
vi <- which(is.finite(dfv))
if(length(vi)<gam.k*1.5) { gam.k=1 } ## too few genes
if(gam.k<2) {
if(verbose) {
print("Using lm ...")
}
m <- lm(v ~ m, data = df[vi,])
} else {
if(verbose) {
print(paste0("Using gam with k=", gam.k, "..."))
}
fm <- as.formula(sprintf("v ~ s(m, k = %s)", gam.k))
m <- mgcv::gam(fm, data = df[vi,])
}
df$res <- -Inf; df$res[vi] <- resid(m,type='response')
n.cells <- ncol(mat)
n.obs <- nrow(mat)
df$lp <- as.numeric(pf(exp(df$res),n.obs,n.obs,lower.tail=F,log.p=T))
df$lpa <- bh.adjust(df$lp,log=TRUE)
df$qv <- as.numeric(qchisq(df$lp, n.cells-1, lower.tail = FALSE,log.p=TRUE)/n.cells)
if(use.unadjusted.pvals) {
ods <- which(df$lp<log(alpha))
} else {
ods <- which(df$lpa<log(alpha))
}
if(verbose) {
print(paste0(length(ods), ' overdispersed genes ... ' ))
}
df$gsf <- geneScaleFactors <- sqrt(pmax(min.adjusted.variance,pmin(max.adjusted.variance,df$qv))/exp(df$v));
df$gsf[!is.finite(df$gsf)] <- 0;
if(plot) {
if(do.par) {
par(mfrow=c(1,2), mar = c(3.5,3.5,2.0,0.5), mgp = c(2,0.65,0), cex = 1.0);
}
smoothScatter(df$m,df$v,main='',xlab='log10[ magnitude ]',ylab='log10[ variance ]')
grid <- seq(min(df$m[vi]),max(df$m[vi]),length.out=1000)
lines(grid,predict(m,newdata=data.frame(m=grid)),col="blue")
if(length(ods)>0) {
points(df$m[ods],df$v[ods],pch='.',col=2,cex=1)
}
smoothScatter(df$m[vi],df$qv[vi],xlab='log10[ magnitude ]',ylab='',main='adjusted')
abline(h=1,lty=2,col=8)
if(is.finite(max.adjusted.variance)) { abline(h=max.adjusted.variance,lty=2,col=1) }
points(df$m[ods],df$qv[ods],col=2,pch='.')
}
## variance normalize
norm.mat <- counts*df$gsf
if(!details) {
return(norm.mat)
} else {
## return normalization factor
return(list(mat=norm.mat, ods=ods, df=df))
}
}
## BH P-value adjustment with a log option
bh.adjust <- function(x, log = FALSE) {
nai <- which(!is.na(x))
ox <- x
x <- x[nai]
id <- order(x, decreasing = FALSE)
if(log) {
q <- x[id] + log(length(x)/seq_along(x))
} else {
q <- x[id]*length(x)/seq_along(x)
}
a <- rev(cummin(rev(q)))[order(id)]
ox[nai] <- a
ox
}
#' Dimensionality reduction by PCA
#'
#' Dimensionality reduction using PCA by computing principal components using highly variable genes
#'
#' @param mat Variance normalized gene expression matrix.
#' @param nGenes Number of most variable genes. (default: 1000)
#' @param nPcs Number of principal components. (default: 100)
#' @param verbose Verbosity (default: TRUE)
#' @param ... Additional parameters to pass to irlba
#'
#' @return Matrix with columns as cells and rows as principal component eigenvectors.
#'
#' @examples {
#' data(pbmcA)
#' cd <- pbmcA[, 1:500]
#' mat <- cleanCounts(cd)
#' mat <- normalizeVariance(mat)
#' pcs <- getPcs(mat)
#' }
#'
#' @export
#'
getPcs <- function(mat, nGenes = min(nrow(mat), 1000), nPcs = 100, verbose=TRUE, ...) {
if(class(mat)!='matrix') {
mat <- as.matrix(mat)
}
if(verbose) {
print(paste0('Identifying top ', nPcs, ' PCs on ', nGenes, ' most variable genes ...'))
}
## get variable genes
if(nGenes < nrow(mat)) {
vgenes <- getVariableGenes(mat, nGenes)
mat <- mat[vgenes,]
}
## get PCs
pcs <- fastPca(t(mat), nPcs, center=TRUE, ...)
m <- t(pcs$l)
colnames(m) <- colnames(mat)
rownames(m) <- paste0('PC', seq_len(nPcs))
return(t(m))
}
#' Derive principal components by SVD
#'
#' @param m Matrix X
#' @param nPcs Number of principal components
#' @param tol irlba tol
#' @param scale Scale input
#' @param center Center input
#' @param ... Additional parameters to pass to irlba
#'
#' @export
#'
fastPca <- function(m, nPcs=2, tol=1e-10, scale=FALSE, center=TRUE, ...) {
if(scale||center) {
m <- scale(m, scale=scale, center=center)
}
a <- irlba::irlba(crossprod(m)/(nrow(m)-1), nu=0, nv=nPcs, tol=tol, ...)
a$l <- m %*% a$v
return(a)
}
#' Get variable genes
#'
#' @param mat Gene expression matrix
#' @param nGenes Number of genes
#'
#' @export
#'
getVariableGenes <- function(mat, nGenes) {
vi <- apply(mat, 1, var)
names(vi) <- rownames(mat)
vgenes <- names(sort(vi, decreasing=TRUE)[seq_len(nGenes)])
return(vgenes)
}
#' Get clusters by community detection on approximate nearest neighbors
#'
#' Group cells into clusters based on graph-based community detection on approximate nearest neighbors
#'
#' @param mat Matrix of cells as columns. Features as rows (such as PCs).
#' @param k K-nearest neighbor parameter.
#' @param method Community detection method from igraph. (default: igraph::cluster_walktrap)
#' @param verbose Verbosity (default: TRUE)
#' @param details Whether to return just community annotation or entended details including graph and graph modularity
#'
#' @return Vector of community annotations
#'
#' @examples {
#' data(pbmcA)
#' cd <- pbmcA[, 1:500]
#' mat <- cleanCounts(cd)
#' mat <- normalizeVariance(mat)
#' pcs <- getPcs(mat)
#' com <- getComMembership(pcs, k=30)
#' }
#'
#' @export
#'
getComMembership <- function(mat, k, method=igraph::cluster_walktrap, verbose=TRUE, details=FALSE) {
if(verbose) {
print("finding approximate nearest neighbors ...")
}
knn <- RANN::nn2(mat, k=k)[[1]]
## convert to adjacency matrix
adj <- matrix(0, nrow(mat), nrow(mat))
rownames(adj) <- colnames(adj) <- rownames(mat)
invisible(lapply(seq_len(nrow(mat)), function(i) {
adj[i,rownames(mat)[knn[i,]]] <<- 1
}))
## convert to graph for clustering
if(verbose) {
print("calculating clustering ...")
}
g <- igraph::graph.adjacency(adj, mode="undirected")
g <- igraph::simplify(g)
km <- method(g)
if(verbose) {
mod <- igraph::modularity(km)
if(mod < 0.3) { print('WARNING') }
print(paste0('graph modularity: ', mod))
}
## community membership
com <- km$membership
names(com) <- km$names
com <- factor(com)
if(verbose) {
print("identifying cluster membership ...")
print(table(com))
}
if(details) {
return(list(com=com, mod=mod, g=g))
}
else {
return(com)
}
}
#' Get clusters by community detection on approximate nearest neighbors with subsampling
#'
#' Group cells into clusters based on graph-based community detection on approximate nearest neighbors for random subset of cells
#' For when getComMembership takes too long due to there being too many cells
#'
#' @param mat Matrix of cells as columns. Features as rows (such as PCs).
#' @param k K-nearest neighbor parameter.
#' @param nsubsample Number of cells in subset (default: ncol(mat)*0.5)
#' @param seed Random seed for reproducibility
#' @param vote Use neighbor voting system to annotate rest of cells not in subset. If false, will use machine-learning model. (default: FALSE)
#' @param method Community detection method from igraph. (default: igraph::cluster_walktrap)
#' @param verbose Verbosity (default: TRUE)
#'
#' @return Vector of community annotations
#'
#' @examples \dontrun{
#' data(pbmcA)
#' cd <- pbmcA
#' mat <- cleanCounts(cd)
#' mat <- normalizeVariance(mat)
#' pcs <- getPcs(mat)
#' com <- getApproxComMembership(pcs, k=30, getApproxComMembership=1000)
#' }
#'
#' @export
getApproxComMembership <- function (mat, k, nsubsample = nrow(mat) * 0.5, method = igraph::cluster_walktrap,
seed = 0, vote = FALSE, verbose = TRUE)
{
if (verbose) {
print(paste0("Subsampling from ", nrow(mat), " cells to ",
nsubsample, " ... "))
}
set.seed(seed)
subsample <- sample(rownames(mat), nsubsample)
if (verbose) {
print("Identifying cluster membership for subsample ... ")
}
pcs.sub <- mat[subsample, ]
com.sub <- getComMembership(pcs.sub, k = k, method = method)
if (verbose) {
print("Imputing cluster membership for rest of cells ... ")
}
if (vote) {
data <- mat[subsample, ]
query <- mat[setdiff(rownames(mat), subsample), ]
knn <- RANN::nn2(data, query, k = k)[[1]]
rownames(knn) <- rownames(query)
com.nonsub <- unlist(apply(knn, 1, function(x) {
nn <- rownames(data)[x]
nn.com <- com.sub[nn]
return(names(sort(table(nn.com), decreasing = TRUE)[1]))
}))
com.all <- factor(c(com.sub, com.nonsub)[rownames(mat)])
}
else {
df.sub <- data.frame(celltype = com.sub, pcs.sub)
model <- MASS::lda(celltype ~ ., data = df.sub)
df.all <- data.frame(mat)
model.output <- stats::predict(model, df.all)
com.all <- model.output$class
names(com.all) <- rownames(df.all)
if (verbose) {
print("Model accuracy for subsample ...")
print(table(com.all[names(com.sub)] == com.sub))
}
}
return(com.all)
}
#' Linear discriminant analysis model
#'
#' Identifies components that maximally discriminate among groups using a linear discriminant analysis model
#'
#' @param mat Expression matrix with cells as columns, transferable features such as genes as rows.
#' @param com Community annotations
#' @param verbose Verbosity (default: TRUE)
#' @param nfeatures Number of features (genes) in LDA model (default: all)
#' @param random Wehther those features are random of chosen based on variance (most variable will be chosen by default)
#' @param retest Whether to retest model for accuracy
#'
#' @return LDA model
#'
#' @examples {
#' data(pbmcA)
#' cd <- pbmcA[, 1:500]
#' mat <- cleanCounts(cd)
#' mat <- normalizeVariance(mat)
#' pcs <- getPcs(mat)
#' com <- getComMembership(pcs, k=30)
#' model <- modelLda(mat, com)
#' }
#'
#' @export
#'
modelLda <- function(mat, com, nfeatures=nrow(mat), random=FALSE, verbose=TRUE, retest=TRUE) {
## filter to reduce feature modeling space
if(nfeatures < nrow(mat)) {
if(random) {
## random
set.seed(0)
vi <- sample(rownames(mat), nfeatures)
mat <- mat[vi,]
} else {
## most variable (assumes properly variance normalized matrix...fix)
vi <- getVariableGenes(mat, nfeatures)
mat <- mat[vi,]
}
}
df <- data.frame(celltype=com, as.matrix(t(mat)))
if(verbose) {
print("calculating LDA ...")
}
model <- MASS::lda(celltype ~ ., data=df)
if(retest) {
## predict our data based on model
model.output <- stats::predict(model, df)
if(verbose) {
print("LDA prediction accuracy ...")
print(table(model.output$class==com))
}
}
return(model)
}
#' Differential expression analysis (adapted from PAGODA2)
#'
#' @param cd A read count matrix. The rows correspond to genes, columns correspond to individual cells
#' @param cols Column/cell group annotations. Will perform one vs. all differential expression analysis.
#' @param verbose Verbosity
#'
#' @export
#'
getDifferentialGenes <- function(cd, cols, verbose=TRUE) {
cm <- t(cd)
## match matrix rownames (cells) and group annotations
if(!all(rownames(cm) %in% names(cols))) { warning("Cluster vector doesn't specify groups for all of the cells, dropping missing cells from comparison")}
## determine a subset of cells that's in the cols and cols[cell]!=NA
valid.cells <- rownames(cm) %in% names(cols)[!is.na(cols)];
if(!all(valid.cells)) {
## take a subset of the count matrix
cm <- cm[valid.cells,]
}
## reorder cols
cols <- as.factor(cols[match(rownames(cm),names(cols))]);
cols <- as.factor(cols);
if(verbose) {
print(paste0("Running differential expression with ",length(levels(cols))," clusters ... "))
}
## modified from pagoda2
# run wilcoxon test comparing each group with the rest
# calculate rank per column (per-gene) average rank matrix
xr <- apply(cm, 2, function(foo) {
#foo[foo==0] <- NA
bar <- rank(foo)
#bar[is.na(foo)] <- 0
bar[foo==0] <- 0
bar
}); rownames(xr) <- rownames(cm)
##xr <- sparse_matrix_column_ranks(cm);
# calculate rank sums per group
grs <- do.call(rbind, lapply(levels(cols), function(g) Matrix::colSums(xr[cols==g,])))
rownames(grs) <- levels(cols); colnames(grs) <- colnames(xr)
##grs <- colSumByFac(xr,as.integer(cols))[-1,,drop=F]
# calculate number of non-zero entries per group
gnzz <- do.call(rbind, lapply(levels(cols), function(g) Matrix::colSums(xr[cols==g,]>0)))
rownames(gnzz) <- levels(cols); colnames(gnzz) <- colnames(xr)
#xr@x <- numeric(length(xr@x))+1
#gnzz <- colSumByFac(xr,as.integer(cols))[-1,,drop=F]
#group.size <- as.numeric(tapply(cols,cols,length));
#group.size <- as.numeric(tapply(cols,cols,length))[1:nrow(gnzz)]; group.size[is.na(group.size)]<-0; # trailing empty levels are cut off by colSumByFac
group.size <- as.numeric(table(cols))
# add contribution of zero entries to the grs
gnz <- (group.size-gnzz)
# rank of a 0 entry for each gene
#zero.ranks <- (nrow(xr)-diff(xr@p)+1)/2 # number of total zero entries per gene
zero.ranks <- apply(cm, 2, function(foo) {
bar <- rank(foo)
bar[foo==0][1]
})
ustat <- t((t(gnz)*zero.ranks)) + grs - group.size*(group.size+1)/2
# standardize
n1n2 <- group.size*(nrow(cm)-group.size);
# usigma <- sqrt(n1n2*(nrow(cm)+1)/12) # without tie correction
# correcting for 0 ties, of which there are plenty
#usigma <- sqrt(n1n2*(nrow(cm)+1)/12)
usigma <- sqrt((nrow(cm) +1 - (gnz^3 - gnz)/(nrow(cm)*(nrow(cm)-1)))*n1n2/12)
# standardized U value- z score
x <- t((ustat - n1n2/2)/usigma);
# correct for multiple hypothesis
x <- matrix(qnorm(bh.adjust(pnorm(as.numeric(abs(x)), lower.tail = FALSE,
log.p = TRUE), log = TRUE), lower.tail = FALSE, log.p = TRUE),
ncol = ncol(x)) * sign(x)
rownames(x) <- colnames(cm)
colnames(x) <- levels(cols)[1:ncol(x)]
# add fold change information
log.gene.av <- log2(Matrix::colMeans(cm));
group.gene.av <- do.call(rbind, lapply(levels(cols), function(g) Matrix::colSums(cm[cols==g,]>0))) / (group.size+1);
log2.fold.change <- log2(t(group.gene.av)) - log.gene.av;
# fraction of cells expressing
f.expressing <- t(gnzz / group.size);
max.group <- max.col(log2.fold.change)
if(verbose) {
print("Summarizing results ... ")
}
## summarize
ds <- lapply(1:ncol(x),function(i) {
r <- data.frame(Z=x[,i],M=log2.fold.change[,i],highest=max.group==i,fe=f.expressing[,i])
rownames(r) <- rownames(x)
r
})
names(ds)<-colnames(x)
return(ds)
}
## annotate clusters; old; needs major speed improvement
getClusterGeneInfo <- function(mat, com, verbose=FALSE, ncores=10) {
if(class(com)!='factor') {
com <- factor(com)
}
if(!all(colnames(mat) %in% names(com))) { warning("cluster vector doesn't specify groups for all of the cells, dropping missing cells from comparison")}
## determine a subset of cells that's in the cols and cols[cell]!=NA
valid.cells <- colnames(mat) %in% names(com)[!is.na(com)]
if(!all(valid.cells)) {
## take a subset of the count matrix
mat <- mat[, valid.cells]
}
## reorder cols
cols <- com
cols <- as.factor(cols[match(colnames(mat),names(cols))])
cols <- as.factor(cols)
## run simple wilcoxon test comparing each group with the rest
diffgenes <- lapply(levels(cols), function(g) {
if(verbose) {
print(paste0("Testing group ", g, " ... "))
}
x <- mat[,cols==g]
y <- mat[,cols!=g]
## aucs and pvalues
aucs.pvs <- parallel::mclapply(seq_len(nrow(x)), function(i) {
auc <- markerAuc(x[i,], y[i,])
pv <- wilcox.test(x[i,], y[i,], alternative="greater")$p.value
return(list(auc, pv))
}, mc.cores=ncores)
aucs <- sapply(aucs.pvs, function(x) x[[1]])
pvs <- sapply(aucs.pvs, function(x) x[[2]])
## assess average fold change
xm <- rowMeans(x)
ym <- rowMeans(y)
fc <- log2(xm/ym)
## percent expressing
pe <- rowSums(x>0)/ncol(x)
## z-score
zs <- abs(qnorm(1 - (pvs/2), lower.tail=FALSE))*sign(fc)
## store p value
df <- data.frame(p.value=pvs, marker.auc=aucs, log2.fold.change=fc, percent.expressing=pe, z.score=zs)
rownames(df) <- rownames(mat)
return(df)
})
names(diffgenes) <- levels(cols)
return(diffgenes)
}
## Test how well does a gene discriminates a population
## adapted from seurat package
## too slow; to do
markerAuc <- function(x, y) {
pred <- ROCR::prediction(
predictions = c(x, y),
labels = c(rep(x = 1, length(x)), rep(x = 0, length(y))),
label.ordering = 0:1
)
perf <- ROCR::performance(pred, measure = "auc")
auc <- perf@y.values[[1]]
return(auc)
}
## diffGenes is output of getDifferentialGenes
getMarkerGenes <- function(mat, com, diffGenes = NULL, upregulated.only=TRUE, z.threshold=3, auc.threshold=0.5, fe.threshold=0.5, M.threshold=0.1, verbose=TRUE, ncores=1) {
if(is.null(diffGenes)) {
ds <- getDifferentialGenes(mat, com)
} else {
ds <- diffGenes
}
if(upregulated.only) {
dg <- unique(unlist(lapply(ds, function(x) rownames(x)[x$Z>z.threshold])))
} else {
dg <- unique(unlist(lapply(ds, function(x) rownames(x)[abs(x$Z)>z.threshold])))
}
dg <- intersect(dg, rownames(mat))
mat <- mat[dg,]
## get info
df.info <- df.list <- getClusterGeneInfo(mat, com, verbose, ncores)
## filter
genes <- lapply(df.list, function(df) {
if(upregulated.only) {
df <- df[df$z.score > z.threshold,]
} else {
df <- df[abs(df$z.score) > z.threshold,]
}
df <- df[df$marker.auc > auc.threshold,]
df <- df[df$fe > fe.threshold,]
df <- df[df$M > M.threshold,]
return(rownames(df))
})
names(genes) <- names(df.list)
return(list(
genes=genes,
info=df.info
))
}
#' Iterative merging of clusters along tree until stable
#'
#' @param cd Counts matrix for differential expression analysis. Rows are genes. Columns are cells.
#' @param com Community/group annotations for cells
#' @param matnorm Normalized gene expression matrix for building group relationship tree. Rows are genes. Columns are cells.
#' @param z.threshold Z-score threshold for identifying significantly differentially expressed genes
#' @param hclust.method Hierarchical clustering method used to construct relationship tree
#' @param min.group.size Minimum group size for stable cluster
#' @param min.diff.genes Minimum number of significantly differentially expressed genes that must be identified or else groups will be merged
#' @param max.iter Maximum number of iterations. Will end earlier if convergence reached.
#' @param removeGenes Regex expression matching genes you want to remove such as ribosomal or mitochondrial genes.
#' @param plot Whether to plot intermediate plots (hierarchical clustering dendrograms)
#' @param verbose Verbosity
#'
#' @export
#'
getStableClusters <- function(cd, com, matnorm, z.threshold=3, hclust.method='ward.D', min.group.size=10, min.diff.genes=nrow(cd)*0.005, max.iter=10, removeGenes = "^RP|^MT", verbose=TRUE, plot=FALSE) {
if(min.group.size>1) {
com[com %in% levels(com)[table(com)<min.group.size]] <- NA
}
com <- as.factor(com[colnames(cd)])
## compute differentially expressed genes between two branches of dendrogram
compare <- function(dend) {
g1 <- labels(dend[[1]])
g2 <- labels(dend[[2]])
incom <- names(com)[com %in% g1]
outcom <- names(com)[com %in% g2]
com.sub <- factor(c(
rep(paste0(g1, collapse="."), length(incom)),
rep(paste0(g2, collapse="."), length(outcom))))
names(com.sub) <- c(incom, outcom)
dg <- getDifferentialGenes(cd[, names(com.sub)], com.sub, verbose=verbose)
dg.sig <- lapply(dg, function(x) {
## get only significantly upregulated
x <- x[x$Z>z.threshold,]
x <- na.omit(x)
return(x)
})
if(!is.null(removeGenes)) {
dg.sig <- lapply(dg.sig, function(dg) {
dg <- dg[!grepl(removeGenes, rownames(dg)), ] ## exclude ribosomal and mitochondrial
})
}
return(dg.sig)
}
## recursively trace dendrogram
pv.recur <- function(dend) {
## compares leaves of tree (groups)
if(is.leaf(dend[[1]]) & is.leaf(dend[[2]])) {
g1 <- paste0(labels(dend[[1]]), collapse=":")
g2 <- paste0(labels(dend[[2]]), collapse=":")
if(verbose) {
print(paste0('Comparing ', g1, ' and ', g2))
}
## if insufficient number of marker genes distinguishing two groups, merge
dg.sig <- compare(dend)
if(verbose) {
print('Differential genes found: ')
print(sapply(dg.sig, length))
}
## both leaves need markers
#if(any(sapply(dg.sig, length) < min.diff.genes)) {
if(length(unlist(dg.sig)) < min.diff.genes) {
if(verbose) {
print(paste0('Merging ', g1, ' and ', g2))
}
com.fin[com.fin==g1] <<- paste0(g1,'.', g2)
com.fin[com.fin==g2] <<- paste0(g1,'.', g2)
}
pv.sig.all[[g1]] <<- dg.sig
pv.sig.all[[g2]] <<- dg.sig
} else {
## if only one is leaf, compare to entire other branch
if(is.leaf(dend[[1]])) {
g1 <- paste0(labels(dend[[1]]), collapse=".")
g2 <- paste0(labels(dend[[2]]), collapse=".")
if(verbose) {
print(paste0('Comparing ', g1, ' and ', g2))
}
dg.sig <- compare(dend)
if(verbose) {
print('Differential genes found: ')
print(sapply(dg.sig, nrow))
}
## if insufficient number of marker genes for leaf, set to NA
#if(any(sapply(dg.sig, length) < min.diff.genes)) {
#if(length(unlist(dg.sig)) < min.diff.genes) {
if(nrow(dg.sig[[labels(dend[[1]])]]) < min.diff.genes) {
if(verbose) {
print(paste0('Cannot distinguish ', g1, ' and ', g2))
}
com.fin[com.fin==g1] <<- NA
}
pv.sig.all[[g1]] <<- dg.sig
} else {
pv.recur(dend[[1]])
}
if(is.leaf(dend[[2]])) {
g1 <- paste0(labels(dend[[1]]), collapse=".")
g2 <- paste0(labels(dend[[2]]), collapse=".")
if(verbose) {
print(paste0('Comparing ', g1, ' and ', g2))
}
dg.sig <- compare(dend)
if(verbose) {
print('Differential genes found: ')
print(sapply(dg.sig, nrow))
}
#if(any(sapply(dg.sig, length) < min.diff.genes)) {
#if(length(unlist(dg.sig)) < min.diff.genes) {
if(nrow(dg.sig[[labels(dend[[2]])]]) < min.diff.genes) {
if(verbose) {
print(paste0('Cannot distinguish ', g1, ' and ', g2))
}
com.fin[com.fin==g2] <<- NA
}
pv.sig.all[[g2]] <<- dg.sig
} else {
pv.recur(dend[[2]])
}
}
}
i <- 1 ## counter
repeat {
com <- factor(com)
## average within groups
mat.summary <- do.call(cbind, lapply(levels(com), function(ct) {
cells <- which(com==ct)
if(length(cells) > 1) {
Matrix::rowMeans(matnorm[, cells])
} else {
matnorm[,cells]
}
}))
colnames(mat.summary) <- levels(com)
## cluster groups
hc <- hclust(dist(t(mat.summary)), method=hclust.method)
if(plot) { plot(hc) }
dend <- as.dendrogram(hc)
com.fin <- as.character(com)
names(com.fin) <- names(com)
pv.sig.all <- list()
pv.recur(dend)
## test if converged
if(sum(com==com.fin, na.rm=TRUE)>=min(sum(!is.na(com)), sum(!is.na(com.fin)))) {
## compute one last time
com.fin <- factor(com.fin)
mat.summary <- do.call(cbind, lapply(levels(com.fin), function(ct) {
cells <- which(com.fin==ct)
if(length(cells) > 1) {
Matrix::rowMeans(matnorm[, cells])
} else {
matnorm[,cells]
}
}))
colnames(mat.summary) <- levels(com.fin)
hc <- hclust(dist(t(mat.summary)), method=hclust.method)
if(plot) { plot(hc) }
dend <- as.dendrogram(hc)
## exit
break
}
if(i>max.iter) {
break
}
com <- com.fin
i <- i+1
}
return(list(com=com.fin, pv.sig=pv.sig.all, hc=hc, mat.summary=mat.summary))
}
#' Predict LD embedding for new dataset given old model and gene scale factor
#'
#' @param mat Library-size normalized gene expression matrix
#' @param model LDA model
#' @param gsf Gene scale factor to be applied to mat (so mat must not be variance normalized)
#' @param verbose Verbosity
#'
#' @export
#'
predictLds <- function(mat, model, gsf, verbose=TRUE) {
gsf.have <- intersect(names(gsf), rownames(mat))
if(verbose) {
print(paste0('Percentage of features retained: ', length(gsf.have)/length(gsf)))
}
## fill in with zeros
mat.test <- matrix(0, length(gsf), ncol(mat))
rownames(mat.test) <- names(gsf)
colnames(mat.test) <- colnames(mat)
mat.test[gsf.have,] <- as.matrix(mat[gsf.have,])
pred <- stats::predict(model, data.frame(t(log10(mat.test*gsf+1))))
}
#' Use LDA model posteriors to retain only confident predictions
#'
#' @param posterior Posterior probabilities from LDA
#' @param t Posteriors below this threshold are set to NA
#'
#' @export
#'
getConfidentPreds <- function(posterior, t=0.95) {
class <- apply(posterior, 1, function(x) {
x <- sort(x, decreasing=TRUE)
if(x[1]-sum(x[-1]) >= t) {
return(names(x)[1])
} else {
return(NA)
}
})
class <- factor(class)
return(class)
}
#' Merge a list of group annotations
#'
#' @param pred.com List of group annotations
#' @param min.group.size Minumum group size
#' @param t Percentage increase in entropy that must be achieved to merge annotations
#' @param verbose Verbosity
#'
#' @export
#'
mergePredsList <- function(pred.com, min.group.size=30, t=0.1, verbose=TRUE) {
## need to be named to sort
if(is.null(names(pred.com))) {
names(pred.com) <- seq_len(length(pred.com))
}
## order from greatest number of distinct groups to least
order <- sort(sapply(pred.com, function(x) length(table(x))), decreasing=TRUE)
pred.com <- pred.com[names(order)]
## merge only if it increases entropy (adds more information)
preds.com.init <- pred.com[[1]]
for(i in 2:length(pred.com)) {
com.test1 <- preds.com.init
entropy1 <- entropy::entropy(table(com.test1))
com.test2 <- pred.com[[i]]
bar <- cbind(as.character(com.test1), as.character(com.test2[names(com.test1)]))
rownames(bar) <- names(com.test1)
bar <- na.omit(bar)
foo <- apply(bar, 1, paste, collapse=".")
names(foo) <- rownames(bar)
foo[foo %in% names(which(table(foo)<min.group.size))] <- NA
entropy2 <- entropy::entropy(table(foo))
if(verbose) {
print("Former entropy:")
print(entropy1)
print("New entropy:")
print(entropy2)
}
if((entropy2 - entropy1) > entropy1*t) {
preds.com.init <<- foo
if(verbose) {
print("Merge")
print(table(foo))
}
}
}
com.all <- factor(preds.com.init)
return(com.all)
}
#' Batch correct within identified groups using ComBat
#'
#' @param lds.all Matrix to be batch corrected
#' @param batch Batch factor annotations
#' @param com.final Group annotations
#' @param min.group.size Minimum number of cells in a group in order to batch correct
#' @export
#' @importFrom sva ComBat
clusterBasedBatchCorrect <- function(lds.all, batch, com.final, min.group.size=10) {
com.final <- factor(com.final)
nas <- names(which(is.na(com.final)))
lds.bc <- do.call(rbind, lapply(levels(com.final), function(ct){
cells <- na.omit(names(com.final)[com.final==ct])
batch.cells <- factor(batch[cells])
## correct only if more than N cells per group
if(sum(table(batch.cells)>min.group.size)>1) {
t(sva::ComBat(t(lds.all[cells,]), batch.cells))
} else {
lds.all[cells,]
}
}))
lds.bc <- rbind(lds.bc, lds.all[nas,])
return(lds.bc)
}
#' Run tSNE on LDs from model
#'
#' @param mat Normalized matrix
#' @param model LDA model
#' @param perplexity Perplexity parameter for tSNE
#' @param verbose Verbosity
#' @param plot Whether to plot
#' @param do.par Whether to set plot margins
#' @param ncores Number of cores for paralele tSNE
#' @param details Whether to return details
#' @param ... Additional parameters to pass to plot
#'
#' @export
#'
tsneLda <- function(mat, model, perplexity=30, verbose=TRUE, plot=TRUE, do.par=TRUE, ncores=10, details=FALSE, ...) {
if(verbose) {
print('Running LDA models ...')
}
## compute LDs
matm <- t(as.matrix(mat))
if(class(model)=='lda') {
genes.need <- rownames(model$scaling)
mat.temp <- matrix(0, nrow(matm), length(genes.need))
rownames(mat.temp) <- rownames(matm)
colnames(mat.temp) <- genes.need
genes.have <- intersect(genes.need, colnames(matm))
mat.temp[rownames(matm), genes.have] <- matm[, genes.have]
stats::predict(model, data.frame(mat.temp))$x
}
if(class(model)=='list') {
reduction <- do.call(cbind, lapply(model, function(m) {
genes.need <- rownames(m$scaling)
mat.temp <- matrix(0, nrow(matm), length(genes.need))
rownames(mat.temp) <- rownames(matm)
colnames(mat.temp) <- genes.need
genes.have <- intersect(genes.need, colnames(matm))
mat.temp[rownames(matm), genes.have] <- matm[, genes.have]
stats::predict(m, data.frame(mat.temp))$x
}))
}
if(verbose) {
print(paste0("Running Rtsne with perplexity ", perplexity))
}
## tSNE
emb <- Rtsne::Rtsne(reduction, is_distance=FALSE, perplexity=perplexity, verbose=verbose, num_threads=ncores)$Y
rownames(emb) <- colnames(mat)
if(plot) {
if(do.par) {
par(mar = c(0.5,0.5,2.0,0.5), mgp = c(2,0.65,0), cex = 1.0);
}
plotEmbedding(emb, ...)
}
if(details) {
return(list(
emb=emb,
reduction=reduction
))
} else {
return(emb)
}
}
#' Plot 2D embedding
#'
#' @param emb dataframe with x and y coordinates
#' @param groups factor annotations for rows on emb for visualizing cluster annotations
#' @param colors color or numeric values for rows on emb for visualizing gene expression
#' @param cex point size
#' @param alpha point opacity
#' @param gradientPalette palette for colors if numeric values provided
#' @param zlim range for colors
#' @param s saturation of rainbow for group colors
#' @param v value of rainbow for group colors
#' @param min.group.size minimum size of group in order for group to be colored
#' @param show.legend whether to show legend
#' @param mark.clusters whether to mark clusters with name of cluster
#' @param mark.cluster.cex cluster marker point size
#' @param shuffle.colors whether to shuffle group colors
#' @param legend.x legend position ie. 'topright', 'topleft', 'bottomleft', 'bottomright'
#' @param gradient.range.quantile quantile for mapping colors to gradient palette
#' @param verbose verbosity
#' @param unclassified.cell.color cells not included in groups will be labeled in this color
#' @param group.level.colors set group level colors. Default uses rainbow.
#' @param ... Additional parameters to pass to BASE::plot
#'
#' @export
plotEmbedding <- function(emb, groups=NULL, colors=NULL, cex=0.6, alpha=0.4, gradientPalette=NULL, zlim=NULL, s=1, v=0.8, min.group.size=1, show.legend=FALSE, mark.clusters=FALSE, mark.cluster.cex=2, shuffle.colors=F, legend.x='topright', gradient.range.quantile=0.95, verbose=TRUE, unclassified.cell.color='gray70', group.level.colors=NULL, ...) {
if(!is.null(colors)) {
## use clusters information
if(!all(rownames(emb) %in% names(colors))) { warning("provided cluster vector doesn't list colors for all of the cells; unmatched cells will be shown in gray. ")}
if(all(areColors(colors))) {
if(verbose) cat("using supplied colors as is\n")
cols <- colors[match(rownames(emb),names(colors))]; cols[is.na(cols)] <- unclassified.cell.color;
names(cols) <- rownames(emb)
} else {
if(is.numeric(colors)) { # treat as a gradient
if(verbose) cat("treating colors as a gradient")
if(is.null(gradientPalette)) { # set up default gradients
if(all(sign(colors)>=0)) {
gradientPalette <- colorRampPalette(c('gray80','red'), space = "Lab")(1024)
} else {
gradientPalette <- colorRampPalette(c("blue", "grey70", "red"), space = "Lab")(1024)
}
}
if(is.null(zlim)) { # set up value limits
if(all(sign(colors)>=0)) {
zlim <- as.numeric(quantile(colors,p=c(1-gradient.range.quantile,gradient.range.quantile)))
if(diff(zlim)==0) {
zlim <- as.numeric(range(colors))
}
} else {
zlim <- c(-1,1)*as.numeric(quantile(abs(colors),p=gradient.range.quantile))
if(diff(zlim)==0) {
zlim <- c(-1,1)*as.numeric(max(abs(colors)))
}
}
}
# restrict the values
colors[colors<zlim[1]] <- zlim[1]; colors[colors>zlim[2]] <- zlim[2];
if(verbose) cat(' with zlim:',zlim,'\n')
colors <- (colors-zlim[1])/(zlim[2]-zlim[1])
cols <- gradientPalette[colors[match(rownames(emb),names(colors))]*(length(gradientPalette)-1)+1]
names(cols) <- rownames(emb)
} else {
stop("colors argument must be a cell-named vector of either character colors or numeric values to be mapped to a gradient")
}
}
} else {
if(!is.null(groups)) {
if(min.group.size>1) { groups[groups %in% levels(groups)[unlist(tapply(groups,groups,length))<min.group.size]] <- NA; groups <- droplevels(groups); }
groups <- as.factor(groups)[rownames(emb)]
if(verbose) cat("using provided groups as a factor\n")
factor.mapping=TRUE;
## set up a rainbow color on the factor
factor.colors <- fac2col(groups,s=s,v=v,shuffle=shuffle.colors,min.group.size=min.group.size,unclassified.cell.color=unclassified.cell.color,level.colors=group.level.colors,return.details=T)
cols <- factor.colors$colors;
names(cols) <- rownames(emb)
} else {
cols <- rep(unclassified.cell.color, nrow(emb))
names(cols) <- rownames(emb)
}
}
plot(emb,col=adjustcolor(cols,alpha.f=alpha),cex=cex,pch=19,axes=F, ...); box();
if(mark.clusters) {
if(!is.null(groups)) {
cent.pos <- do.call(rbind,tapply(1:nrow(emb),groups,function(ii) apply(emb[ii,,drop=F],2,median)))
cent.pos <- na.omit(cent.pos);
text(cent.pos[,1],cent.pos[,2],labels=rownames(cent.pos),cex=mark.cluster.cex)
}
}
if(show.legend) {
if(factor.mapping) {
legend(x=legend.x,pch=rep(19,length(levels(groups))),bty='n',col=factor.colors$palette,legend=names(factor.colors$palette))
}
}
}
## a utility function to translate factor into colors
fac2col <- function(x,s=1,v=1,shuffle=FALSE,min.group.size=1,return.details=F,unclassified.cell.color='gray50',level.colors=NULL) {
x <- as.factor(x);
if(min.group.size>1) {
x <- factor(x,exclude=levels(x)[unlist(tapply(rep(1,length(x)),x,length))<min.group.size])
x <- droplevels(x)
}
if(is.null(level.colors)) {
col <- rainbow(length(levels(x)),s=s,v=v);
} else {
col <- level.colors[1:length(levels(x))];
}
names(col) <- levels(x);
if(shuffle) col <- sample(col);
y <- col[as.integer(x)]; names(y) <- names(x);
y[is.na(y)] <- unclassified.cell.color;
if(return.details) {
return(list(colors=y,palette=col))
} else {
return(y);
}
}
## quick utility to check if given character vector is colors
## thanks, Josh O'Brien: http://stackoverflow.com/questions/13289009/check-if-character-string-is-a-valid-color-representation
areColors <- function(x) {
is.character(x) &
sapply(x, function(X) {
tryCatch(is.matrix(col2rgb(X)), error = function(e) FALSE)
})
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.