#' Calculates an approximation of the second derivative of a set of points
#'
#' @description Calculates an approximation of the second derivative of a set of points.
#' The maximum second derivative will be a good choice for the inflection point (the elbow or knee).
#' From https://rdrr.io/cran/gama/src/R/bestk.R
#' https://stackoverflow.com/questions/2018178/finding-the-best-trade-off-point-on-a-curve
#' https://raghavan.usc.edu/papers/kneedle-simplex11.pdf (Finding a "Kneedle" in a Haystack:
#' Detecting Knee Points in System Behavior)
#' @param points vector of perplexity values ordered by the corresponding LDA model K
#'
#' @return the best K (i.e. inflection point of the perplexities)
#'
#' @noRd
where.is.knee <- function(points=NULL) {
lower.limit <- 2
upper.limit <- length(points) -1
second.derivative <- sapply(lower.limit:upper.limit, function(i){
points[i+1] + points[i-1] - (2 * points[i]) })
w.max <- which.max(second.derivative)
best.k <- w.max +1
return(best.k)
}
#' Filter a counts matrix
#'
#' @description Filter a counts matrix based on gene (row) and cell (column)
#' requirements.
#'
#' @param counts A sparse read count matrix. The rows correspond to genes,
#' columns correspond to individual cells
#' @param min.lib.size Minimum number of genes detected in a cell. Cells with
#' fewer genes will be removed (default: 1)
#' @param max.lib.size Maximum number of genes detected in a cell. Cells with
#' more genes will be removed (default: Inf)
#' @param min.reads Minimum number of reads per gene. Genes with fewer reads
#' will be removed (default: 1)
#' @param min.detected Minimum number of cells a gene must be seen in. Genes
#' not seen in a sufficient number of cells will be removed (default: 1)
#' @param verbose Verbosity (default: FALSE)
#' @param plot Whether to plot (default: FALSE)
#'
#' @return a filtered read count matrix
#'
#' @examples
#' data(mOB)
#' counts <- cleanCounts(mOB$counts, min.lib.size = 100)
#'
#' @importFrom graphics hist par
#'
#' @export
#'
cleanCounts <- function(counts,
min.lib.size=1,
max.lib.size=Inf,
min.reads=1,
min.detected=1,
verbose=FALSE,
plot=FALSE) {
if (!any(class(counts) %in% c("dgCMatrix", "dgTMatrix"))) {
if (verbose) {
message("Converting to sparse matrix ...")
}
counts <- Matrix::Matrix(counts, sparse=TRUE)
}
if (verbose) {
message("Filtering matrix with ", ncol(counts), " cells and ",
nrow(counts), " genes ...")
}
ix_col <- Matrix::colSums(counts)
ix_col <- ix_col > min.lib.size & ix_col < max.lib.size
counts <- counts[, ix_col]
counts <- counts[Matrix::rowSums(counts) > min.reads, ]
counts <- counts[Matrix::rowSums(counts > 0) > min.detected, ]
if (verbose) {
message("Resulting matrix has ", ncol(counts), " cells and ", nrow(counts), " genes")
}
if (plot) {
par(mfrow=c(1,2), mar=rep(5,4))
hist(log10(Matrix::colSums(counts)+1), breaks=20, main='Genes Per Dataset')
hist(log10(Matrix::rowSums(counts)+1), breaks=20, main='Datasets Per Gene')
}
return(counts)
}
#' Normalize gene expression variance relative to transcriptome-wide expectations
#' (Modified from SCDE/PAGODA2 code)
#'
#' @description Normalizes gene expression magnitudes to with respect to its ratio to the
#' transcriptome-wide expectation as determined by local regression on expression magnitude
#'
#' @param counts Read count matrix. The rows correspond to genes, columns correspond to individual cells
#' @param gam.k Generalized additive model parameter; the dimension of the basis used to represent the smooth term (default: 5)
#' @param alpha Significance threshold (default: 0.05)
#' @param plot Whether to plot the results (default: FALSE)
#' @param use.unadjusted.pvals If true, will apply BH correction (default: FALSE)
#' @param do.par Whether to adjust par for plotting if plotting (default: TRUE)
#' @param max.adjusted.variance Ceiling on maximum variance after normalization to prevent infinites (default: 1e3)
#' @param min.adjusted.variance Floor on minimum variance after normalization (default: 1e-3)
#' @param verbose Verbosity (default: TRUE)
#' @param details If true, will return data frame of normalization parameters. Else will return list of overdispersed genes. (default: FALSE)
#'
#' @return If details is true, will return data frame of normalization parameters. Else will return list of overdispersed genes.
#'
#' @examples
#' data(mOB)
#' od <- getOverdispersedGenes(counts = mOB$counts, gam.k = 5, alpha = 0.05, details = FALSE)
#' head(od)
#'
#' od <- getOverdispersedGenes(counts = mOB$counts, gam.k = 5, alpha = 0.05, details = TRUE)
#' head(od$mat)
#' head(od$ods)
#' head(od$df)
#'
#' @importFrom mgcv s gam
#' @importFrom stats lm as.formula pf predict qchisq resid var
#' @importFrom graphics par
#'
#' @export
getOverdispersedGenes <- function(counts,
gam.k=5,
alpha=0.05,
plot=FALSE,
use.unadjusted.pvals=FALSE,
do.par=TRUE,
max.adjusted.variance=1e3,
min.adjusted.variance=1e-3,
verbose=TRUE, details=FALSE) {
if (!any(class(counts) %in% c("dgCMatrix", "dgTMatrix"))) {
if(verbose) {
message('Converting to sparse matrix ...')
}
counts <- Matrix::Matrix(counts, sparse = TRUE)
}
mat <- Matrix::t(counts) ## make rows as cells, cols as genes
if(verbose) {
message("Calculating variance fit ...")
}
dfm <- log(Matrix::colMeans(mat))
dfv <- log(apply(mat, 2, var))
names(dfm) <- names(dfv) <- colnames(mat)
df <- data.frame(m=dfm, v=dfv)
vi <- which(is.finite(dfv))
if(length(vi)<gam.k*1.5) { gam.k=1 } ## too few genes
if(gam.k<2) {
if(verbose) {
message("Using lm ...")
}
m <- lm(v ~ m, data = df[vi,])
} else {
if(verbose) {
message(paste0("Using gam with k=", gam.k, "..."))
}
fm <- as.formula(sprintf("v ~ s(m, k = %s)", gam.k))
m <- gam(fm, data = df[vi,])
}
df$res <- -Inf
df$res[vi] <- resid(m,type='response')
n.feat <- ncol(mat) ## gene
n.obs <- nrow(mat) ## cell
df$lp <- as.numeric(pf(exp(df$res),n.obs,n.obs,lower.tail=FALSE,log.p=TRUE)) ## degrees of freedom based on number of cells to focus on cellular differences in expression
df$lpa <- bh.adjust(df$lp,log=TRUE)
df$qv <- as.numeric(qchisq(df$lp, n.feat-1, lower.tail = FALSE,log.p=TRUE)/n.feat) ## we are interested in how variable each feature (gene) is relative to the total number of features
if(use.unadjusted.pvals) {
ods <- which(df$lp<log(alpha))
} else {
ods <- which(df$lpa<log(alpha))
}
if(verbose) {
message(paste0(length(ods), ' overdispersed genes ... ' ))
}
df$gsf <- geneScaleFactors <- sqrt(pmax(min.adjusted.variance,pmin(max.adjusted.variance,df$qv))/exp(df$v))
df$gsf[!is.finite(df$gsf)] <- 0
if(plot) {
if(do.par) {
par(mfrow=c(1,2), mar = c(3.5,3.5,2.0,0.5), mgp = c(2,0.65,0), cex = 1.0)
}
graphics::smoothScatter(df$m,df$v,main='',xlab='log[ magnitude ]',ylab='log[ variance ]')
grid <- seq(min(df$m[vi]),max(df$m[vi]),length.out=1000)
graphics::lines(grid,predict(m,newdata=data.frame(m=grid)),col="blue")
if(length(ods)>0) {
graphics::points(df$m[ods],df$v[ods],pch='.',col=2,cex=1)
}
graphics::smoothScatter(df$m[vi],df$qv[vi],xlab='log[ magnitude ]',ylab='',main='adjusted')
graphics::abline(h=1,lty=2,col=8)
if(is.finite(max.adjusted.variance)) { graphics::abline(h=max.adjusted.variance,lty=2,col=1) }
graphics::points(df$m[ods],df$qv[ods],col=2,pch='.')
}
## variance normalize
norm.mat <- counts*df$gsf
if(!details) {
return(colnames(mat)[ods])
} else {
## return normalization factor
return(list(mat=norm.mat, ods=colnames(mat)[ods], df=df))
}
}
fac2col <- function(x,s=1,v=1,shuffle=FALSE,min.group.size=1,return.details=FALSE,unclassified.cell.color='lightgrey',level.colors=NULL) {
x <- as.factor(x)
if(min.group.size>1) {
x <- factor(x,exclude=levels(x)[unlist(tapply(rep(1,length(x)),x,length))<min.group.size])
x <- droplevels(x)
}
if(is.null(level.colors)) {
col <- grDevices::rainbow(length(levels(x)),s=s,v=v)
} else {
col <- level.colors[1:length(levels(x))]
}
names(col) <- levels(x)
if(shuffle) col <- sample(col)
y <- col[as.integer(x)]
names(y) <- names(x)
y[is.na(y)] <- unclassified.cell.color
if(return.details) {
return(list(colors=y,palette=col))
} else {
return(y)
}
}
#' Benjamini-Hochberg p-value adjustment
#'
#' @description Benjamini-Hochberg p-value adjustment
#'
#' @param x vector of p-values
#' @param log log option (default: FALSE)
#'
#' @noRd
bh.adjust <- function(x, log = FALSE) {
nai <- which(!is.na(x))
ox <- x
x <- x[nai]
id <- order(x, decreasing = FALSE)
if(log) {
q <- x[id] + log(length(x)/seq_along(x))
} else {
q <- x[id]*length(x)/seq_along(x)
}
a <- rev(cummin(rev(q)))[order(id)]
ox[nai] <- a
ox
}
#' Normalizes counts to CPM
#'
#' @description Normalizes raw counts to log10 counts per million with pseudocount
#'
#' @param counts Read count matrix. The rows correspond to genes, columns
#' correspond to individual cells
#' @param normFactor Normalization factor such as cell size. If not provided
#' column sum as proxy for library size will be used
#' @param depthScale Depth scaling. Using a million for CPM (default: 1e6)
#' @param pseudo Pseudocount for log transform (default: 1)
#' @param log Whether to apply log transform
#' @param verbose Verbosity (default: TRUE)
#'
#' @return a normalized matrix
#'
#' @examples
#' data(mOB)
#' counts <- normalizeCounts(mOB$counts)
#'
#' @noRd
#'
#'
normalizeCounts <- function (counts,
normFactor = NULL,
depthScale = 1e+06,
pseudo = 1,
log = TRUE,
verbose = TRUE) {
if (!class(counts) %in% c("dgCMatrix", "dgTMatrix")) {
if (verbose) {
message("Converting to sparse matrix ...")
}
counts <- Matrix::Matrix(counts, sparse = TRUE)
}
if (verbose) {
message("Normalizing matrix with ", ncol(counts), " cells and ",
nrow(counts), " genes.")
}
if (is.null(normFactor)) {
if (verbose) {
message("normFactor not provided. Normalizing by library size.")
}
normFactor <- Matrix::colSums(counts)
}
if (verbose) {
message(paste0("Using depthScale ", depthScale))
}
counts <- Matrix::t(Matrix::t(counts)/normFactor)
counts <- counts * depthScale
if (log) {
if (verbose) {
message("Log10 transforming with pseudocount ",
pseudo, ".")
}
counts <- log10(counts + pseudo)
}
return(counts)
}
#' Function to winsorize the gene counts
#'
#' @description Function to winsorize the gene counts
#'
#' @param x gene count matrix
#' @param qt quantile for winsorization (default: 0.05)
#'
#' @return winsorized gene count matrix
#'
#' @importFrom stats quantile
#'
#' @noRd
winsorize <- function (x, qt=.05) {
if(length(qt) != 1 || qt < 0 ||
qt > 0.5) {
stop("bad value for quantile threshold")
}
lim <- quantile(x, probs=c(qt, 1-qt))
x[ x < lim[1] ] <- lim[1]
x[ x > lim[2] ] <- lim[2]
x
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.