#' Train binary KDE classifier
#'
#' @param x train data
#' @param y train answers
#' @param H the algorithm of bandwidth selection (see 'ks' documentation) (default=ks::Hns)
#' @param prior.probs Prior probabilities or weights of classes (default=c(0.5, 0.5))
#' @return Trained KDE classifier
#' @export
TrainKDE <- function(x, y, H=ks::Hns, prior.probs=c(0.5, 0.5)) {
if (any(prior.probs < 0)){
stop("Prior probabilities must be positive")
}
y <- as.factor(y)
prior.probs <- Normalize(as.matrix(prior.probs), func=sum)[, 1]
if (length(prior.probs) != 2 || length(levels(y)) != 2) {
stop("Only binary classification is implemented")
}
data0 <- x[y == levels(y)[1],]
data1 <- x[y == levels(y)[2],]
clf <- list(data1=data1, data0=data0, h1=H(data1), h0=H(data0), class.labels=levels(y), prior.probs=prior.probs)
class(clf) <- "KdeClassifier"
return(clf)
}
#' Predict data with the KDE classifier
#'
#' @param clf trained classifier
#' @param x data for classification
#' @param bandwidth.mult numeric Multiplier for KDE bandwidth (default=1)
#' @return Class probabilities
#' @export
PredictKDE <- function(clf, x, bandwidth.mult=1) {
if (class(clf) != "KdeClassifier") {
stop("The classifier provided isn't a KdeClassifier")
}
dens1 <- ks::kde(clf$data1, H=clf$h1 * bandwidth.mult, eval.points = x)$estimate
dens0 <- ks::kde(clf$data0, H=clf$h0 * bandwidth.mult, eval.points = x)$estimate
dens1 <- pmax(dens1, 0) # ks sometimes returns negative values close to zero
dens0 <- pmax(dens0, 0)
sum.probs <- dens1 * clf$prior.probs[2] + dens0 * clf$prior.probs[1]
prob1 <- as.vector(dens1 * clf$prior.probs[2]) / sum.probs
outliers <- which(sum.probs < 1e-10)
if (length(outliers) > 0) {
warning(paste0("Found ", length(outliers), " points with 0 density. Either it's outliers or you should increase bandwidth. Setting score for these points to 0.5."))
prob1[outliers] <- 0.5
}
ans <- cbind(1 - prob1, prob1)
colnames(ans) <- clf$class.labels
rownames(ans) <- rownames(x)
return(ans)
}
#' Train classifier with TrainKDE
#'
#' @param tr.data training data for classification
#' @param cells.quality vector of cell barcodes, named with cells.quality ("High", "Low")
#' @param umi.counts vector with the number of UMIs per cell (default=NULL)
#' @param trim.low.quality.rate numeric Value at which to filter low quality values (default=1.5)
#' @param prior.probs Prior probabilities or weights of classes (default=c(0.5, 0.5))
#' @return Trained KDE classifier
#' @export
TrainClassifier <- function(tr.data, cells.quality, umi.counts=NULL, trim.low.quality.rate = 1.5, prior.probs=c(0.5, 0.5)) {
hq.cbs <- names(cells.quality)[cells.quality == 'High']
lq.cbs <- names(cells.quality)[cells.quality == 'Low']
if (!is.null(trim.low.quality.rate) && !is.null(umi.counts) && length(lq.cbs) > length(hq.cbs) * trim.low.quality.rate) {
lq.cbs <- names(sort(umi.counts[lq.cbs], decreasing=TRUE)[1:round(length(hq.cbs) * trim.low.quality.rate)])
}
tr.data <- tr.data[c(hq.cbs, lq.cbs), , drop=FALSE]
tr.answers <- c(rep(1, length(hq.cbs)), rep(0, length(lq.cbs)))
return(TrainKDE(tr.data, tr.answers, prior.probs=prior.probs))
}
#' Score cells with a KDE classifier
#'
#' @param data data for classification
#' @param cells.quality vector of cell barcodes, named with cells.quality ("High", "Low")
#' @return Probability of cell to be high-quality
#'
#' @export
ScoreCells <- function(data, cells.quality) {
## hq.cbs := vector of cell barcodes, which mostly have high quality
hq.cbs <- names(cells.quality)[cells.quality == 'High']
## lq.cbs := vector of cell barcodes, which mostly have low quality
lq.cbs <- names(cells.quality)[cells.quality == 'Low']
tr.data <- data[c(hq.cbs, lq.cbs),]
tr.answers <- c(rep(1, length(hq.cbs)), rep(0, length(lq.cbs)))
return(PredictKDE(TrainKDE(tr.data, tr.answers), data)[, 2])
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.