#'
#' Calculate most variable genes for SingleCellExperiment object
#'
#' @importFrom matrixStats rowVars
#'
#' @return sce with top marker_num most variable genes
#'
#' @keywords internal
variableGenes <- function(sce, marker_num) {
if (dim(sce)[1] > marker_num) {
vars <- rowVars(as.matrix(logcounts(sce)))
genes <- rownames(sce)
df <- data.frame(genes, vars, stringsAsFactors = FALSE)
df <- df[order(df$vars, decreasing = TRUE),][seq_len(marker_num), ]
return(sce[df$genes, ])
}
else {
return(sce)
}
}
#'
#' Splits SingleCellExperiment object into test and training data
#'
#' @importFrom matrixStats rowVars
#'
#' @return a list, including both the training and test splits
#'
#' @keywords internal
sampleSplit <- function(sce, marker_num = 2000) {
if (dim(sce)[1] > marker_num) {
sce <- variableGenes(sce, marker_num)
}
trainingSize <- ceiling(0.5 * length(colnames(sce)))
training <- sce[, sample(colnames(sce), trainingSize)]
testing <- sce[, setdiff(colnames(sce), colnames(training))]
return(list(training, testing))
}
#' calculate roc objects from a casc object, one for each cluster
#'
#' @param truths List of original class labels
#' @param response matrix or data.frame of probabilities of each class
#' @return A list of roc objects
#' @importFrom pROC roc
multROC <- function(truths, response) {
truths <- as.numeric(as.factor(truths))
if (length(levels(as.factor(truths))) > 2) {
roc_l <- lapply(seq_along(levels(as.factor(truths))), function(i){
single_t <- ifelse(truths == i, 1, 0)
pROC::roc(single_t, response[, i])
})
}
else {
roc_l <- list()
roc_l[1] <- list(pROC::roc(truths, response[, 1]))
}
roc_l
}
#'
#' Sample, Train, and Predict logistic regression model
#' using singleCellExperiment and glmnet, one object at a time
#'
#' @return a casc object
#'
#' @importFrom caret trainControl train predict.train upSample
#' @importFrom magrittr "%>%"
#' @keywords internal
single_casc <- function(cluster_name, dataSplits, alpha = 0.5, cv) {
train_y <- colData(dataSplits[[1]])[[cluster_name]] %>%
as.factor()
original_levels <- levels(train_y)
levels(train_y) <- do.call(paste0,
replicate(7, sample(LETTERS, length(levels(train_y)), TRUE),
FALSE)) %>%
sort()
train_dat <- logcounts(dataSplits[[1]])[,!is.na(train_y)]
train_y <- train_y[!is.na(train_y)]
up_samps <- upSample(t(train_dat), train_y)
train_x <- subset(up_samps, select=-c(Class))
trcntrl <- trainControl(
method = "cv",
number = cv,
returnResamp = "all",
classProbs = TRUE,
savePredictions = TRUE
)
fit <- train(
x = train_x,
y = up_samps$Class,
method = "glmnet",
trControl = trcntrl,
metric = "Accuracy",
tuneGrid = expand.grid(alpha = alpha,
lambda = seq(0.001, 0.1, by = 0.001))
)
test_y <- colData(dataSplits[[2]])[[cluster_name]]
test_dat <- logcounts(dataSplits[[2]])
test_dat <- test_dat[,!is.na(test_y)]
test_y <- test_y[!is.na(test_y)]
probs <-
predict.train(fit, newdata = t(test_dat),
type = "prob")
classes <-predict(fit, newdata = t(test_dat),
type = "raw") %>%
as.numeric() %>%
as.factor()
roc_l <- multROC(test_y, probs)
auc <- avgAUC(roc_l)
levels(classes) <- original_levels
colnames(probs) <- original_levels
obj <- list(
predicted_classes = classes,
auc = auc,
response = probs,
truths = as.factor(test_y)
)
class(obj) <- "casc"
obj
}
#' Sample, Train, and Predict logistic regression
#' model using singleCellExperiment and glmnet.
#'
#' @param sce a singleCellExperiment
#' @param clusters A list of clusters, an array or list of
#' integers of same length as number of cells in sce.
#' @param marker_num The top variable genes that
#' will be used to filter the SCE by to reduce runtime.
#' @param alpha A parameter for logistic regression where 0
#' is `ridge regression` and 1 is `lasso regression`.
#' @param cv The number of folds for cross validation
#'
#' @return A list of casc objects with predicted classes,
#' aucs, responses, and truths.
#'
#' @examples
#' library(SingleCellExperiment)
#'
#' counts <- matrix(rnorm(40000, 10, 10), ncol=200, nrow=200)
#' sce <- SingleCellExperiment(assays = list(logcounts = counts))
#' colnames(sce) <- stringi::stri_rand_strings(200, 5)
#' rownames(sce) <- stringi::stri_rand_strings(200, 5)
#'
#' cluster_1 <- rep(c(0, 1, 1, 1, 1, 0, 1, 1, 1, 1), 20)
#' cluster_2 <- rep(c(0, 1, 1, 0, 1, 0, 1, 1, 0, 1), 20)
#'
#' casc_list <- casc(sce, list(cluster_1, cluster_2), marker_num=1500)
#'
#'
#'
#' @import SingleCellExperiment
#'
#' @importFrom magrittr "%>%"
#' @export
casc <- function(sce,
clusters,
marker_num = 2000,
alpha = 0.5,
cv = 5) {
if(is.matrix(sce)){
sce <- SingleCellExperiment(assays = list(logcounts = sce))
}
logcounts(sce) <- as.matrix(logcounts(sce))
l <- lapply(seq_along(clusters), function(c) {
cluster_name <- paste0("casc_", c)
cluster_name
})
for(c in seq_along(l)){
colData(sce)[l[[c]]] <- clusters[[c]]
}
dataSplits <- sampleSplit(sce, marker_num = marker_num)
cascs <- lapply(l, single_casc, dataSplits = dataSplits, alpha = alpha, cv = cv)
names(cascs) <- l
class(cascs) <- "casc_list"
cascs
}
#' Print a \code{casc}
#'
#' @param x An object of class \code{casc}
#' @param ... Additional arguments (unused)
#'
#'
#' @return A string with available options
#' @import glue
#
#' @export
print.casc <- function(x, ...) {
N <- dim(x$response)[2]
cat(
glue::glue(
"A casc object for {N} provided classes
",
"Call x$predicted_classes to access predictions of class
",
"Call x$auc to access area under the curve
",
"Call x$response to access prediction response
",
"Call x$truths to access original classes\n\n"
)
)
}
#' Plot multiple roc objects on the same ggplot
#'
#' @param casc A casc object
#'
#' @return A ggplot object with ROC curves plotted for each cluster
#' @importFrom pROC ggroc
#' @importFrom ggplot2 theme element_blank
#'
#' @examples
#' library(SingleCellExperiment)
#'
#' counts <- matrix(rnorm(40000, 10, 10), ncol=200, nrow=200)
#' sce <- SingleCellExperiment(assays = list(logcounts = counts))
#' colnames(sce) <- stringi::stri_rand_strings(200, 5)
#' rownames(sce) <- stringi::stri_rand_strings(200, 5)
#'
#' cluster_1 <- rep(c(0, 1, 1, 1, 1, 0, 1, 1, 1, 1), 20)
#' cluster_2 <- rep(c(0, 1, 1, 0, 1, 0, 1, 1, 0, 1), 20)
#'
#' casc_list <- casc(sce, list(cluster_1, cluster_2), marker_num=1500)
#' multROCPlot(casc_list$casc_1)
#'
#' @export
multROCPlot <- function(casc) {
roc_l <- multROC(truths = casc$truths, response = casc$response)
pROC::ggroc(roc_l) + theme(legend.title = element_blank())
}
#' Sample, Train, and Predict logistic regression
#' model using singleCellExperiment and glmnet
#'
#' @param casc_list A casc_list object produced by casc
#'
#' @return A ggplot scatterplot with AUCs plotted for each cluster
#' @importFrom ggplot2 ggplot geom_point theme_light
#' @importFrom magrittr "%<>%"
#' @importFrom ggplot2 aes
#'
#' @examples
#' library(SingleCellExperiment)
#'
#' counts <- matrix(rnorm(40000, 10, 10), ncol=200, nrow=200)
#' sce <- SingleCellExperiment(assays = list(logcounts = counts))
#' colnames(sce) <- stringi::stri_rand_strings(200, 5)
#' rownames(sce) <- stringi::stri_rand_strings(200, 5)
#'
#' cluster_1 <- rep(c(0, 1, 1, 1, 1, 0, 1, 1, 1, 1), 20)
#' cluster_2 <- rep(c(0, 1, 1, 0, 1, 0, 1, 1, 0, 1), 20)
#'
#' casc_list <- casc(sce, list(cluster_1, cluster_2), marker_num=1500)
#' aucPlot(casc_list)
#'
#' @export
aucPlot <- function(casc_list) {
df <- t(as.data.frame(lapply(casc_list, function(x) {
x$auc
})))
k <- t(as.data.frame(lapply(casc_list, function(x) {
length(x[[3]])
})))
df <- as.data.frame(cbind(df, k))
colnames(df) <- c("AUC", "K")
df$K <- as.factor(df$K)
ggplot(df, aes(x = K, y = AUC)) + geom_point() + theme_light()
}
#' Finds the average AUC from a list of roc objects
#'
#' @param roc_l A list of casc objects produced by casc
#' @importFrom pROC auc
#' @importFrom magrittr "%>%"
#' @return Average auc for the list, numeric
#'
avgAUC <- function(roc_l) {
auc_l <- lapply(roc_l, pROC::auc) %>%
lapply(as.numeric)
mean(unlist(auc_l))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.