#' create lineplots of deconvolution results and runtime for different gene sets
#'
#' @param results.df data frame as returned by prepare_data
#' @param genesets list of gene sets (character vectors)
#' @param available.features character vector containing names
#' of available features
#' @param celltype.order character vector of cell types
#' specifying the plotting order
#' @param algorithm.order character vector of algorithm names
#' specifying the plotting order
#' @return list containing two elements:\cr
#' 1) runtime.plot\cr
#' 2) list containing lineplot of scores for each celltypes\cr
create_lineplots <- function(
results.df,
genesets = NULL,
available.features = NULL,
celltype.order = NULL,
algorithm.order = NULL
) {
# parameter checks
if (!is.data.frame(results.df)) {
stop("results.df must be a data frame")
}
required_cols <- c(
"algorithm", "score","geneset", "cell_type", "time"
)
if (!all(required_cols %in% colnames(results.df))) {
stop("required columns missing from results.df")
}
if (!is.null(genesets)) {
if (!is.list(genesets)) {
stop("genesets must be a named list of character vectors")
}
}
if (!is.null(available.features)) {
if (!is.character(available.features)) {
stop("available features must be a charcter vector")
}
}
if (!is.null(celltype.order)) {
if (!is.character(celltype.order)) {
stop("celltype.order must be a charcter vector")
}
if (!all(celltype.order %in% unique(results.df$cell_type)) ||
length(celltype.order) != length(unique(results.df$cell_type))) {
stop("celltype.order does not fit the cell_type column of results.df")
}
}
if (!is.null(algorithm.order)) {
if (!is.character(algorithm.order)) {
stop("celltype.order must be a charcter vector")
}
if (!all(algorithm.order %in% unique(results.df$algorithm)) ||
length(algorithm.order) != length(unique(results.df$algorithm))) {
stop("algorithm.order does not fit the algorithm column of results.df")
}
}
overall.df <- results.df[which(results.df$cell_type == "overall"), ]
# order algorithms by performance or given order
if (is.null(algorithm.order)) {
performances <- tapply(overall.df$score, overall.df$algorithm, median)
results.df$algorithm <- factor(
results.df$algorithm,
levels = levels(overall.df$algorithm)[order(performances)]
)
}else{
results.df$algorithm <- factor(
results.df$algorithm,
levels = algorithm.order
)
}
if (!is.null(celltype.order)) {
results.df$cell_type <- factor(
results.df$cell_type,
levels = celltype.order
)
}
if (any(is.na(results.df$geneset))) {
geneset_vec <- as.character(results.df$geneset)
geneset_vec[is.na(geneset_vec)] <- "default"
results.df$geneset <- as.factor(geneset_vec)
}
# create display labels for gene sets and
# sort according to number of genes if possible
if (!is.null(genesets)) {
if (all(unique(results.df$geneset) %in% names(genesets))) {
geneset.labs <- paste(
names(genesets),
"\n(",
as.numeric(
sapply(genesets, function(x) length(which(x %in% available.features)))
),
" genes)",
sep = ""
)
}else{
geneset.labs <- levels(results.df$geneset)
}
geneset.sizes <- sapply(
genesets,
function(x) length(which(x %in% available.features))
)
geneset.labs <- geneset.labs[order(geneset.sizes)]
geneset.limits <- names(sort(geneset.sizes))
}else{
geneset.labs <- levels(results.df$geneset)
geneset.limits <- levels(results.df$geneset)
}
# create plot per cell type (including overall)
cell.type.plots <- list()
for (t in levels(results.df$cell_type)) {
sub.df <- results.df[which(results.df$cell_type == t), ]
# create data frame containing score for each
# algorithm and gene set for this cell type
temp.scores <- tapply(
sub.df$score,
list(sub.df$algorithm, sub.df$geneset),
mean
)
temp.sds <- tapply(
sub.df$score,
list(sub.df$algorithm, sub.df$geneset),
sd
)
temp.times <- tapply(
as.numeric(as.character(sub.df$time)),
list(sub.df$algorithm, sub.df$geneset),
mean
)
temp.df <- data.frame()
for (i in seq_len(ncol(temp.scores))) {
for (j in seq_len(nrow(temp.scores))) {
temp.df <- rbind(
temp.df,
data.frame(
algorithm = rownames(temp.scores)[j],
geneset = colnames(temp.scores)[i],
score = temp.scores[j, i],
time = temp.times[j,i],
sd = temp.sds[j, i]
)
)
}
}
if (nrow(temp.df) == 0)
next
temp.df$geneset <- factor(temp.df$geneset, levels = geneset.limits)
cell.type.plots[[t]] <- ggplot(
temp.df,
aes(x=geneset, y = score, group = algorithm, col = algorithm)
) +
geom_line(size = 2) +
geom_point() +
geom_errorbar(
aes(x = geneset, ymin = score - sd, ymax = score + sd),
width = 0.2
) +
xlab("gene set (increasing size)") +
ylab("correlation") +
ggtitle(paste(
"deconvolution quality using different gene sets (", t, ")", sep = ""
)) +
scale_x_discrete(limits = geneset.limits, labels = geneset.labs) +
guides(linetype = guide_legend(override.aes = list(size = 2))) +
theme(
legend.text = element_text(size = 20),
legend.title = element_text(size = 22),
title = element_text(size = 24),
axis.title.x = element_text(size = 22),
axis.text.x = element_text(size = 20),
axis.title.y = element_text(size = 22),
axis.text.y = element_text(size = 20)
)
cell.type.plots[[t]] <- cell.type.plots[[t]] + ylim(0, 1)
# create only one runtime plot
if (t == "overall") {
runtime.plot <- ggplot(
temp.df,
aes(x = geneset, y = log(time, 10), group = algorithm, col = algorithm)
) +
geom_line(size = 2) +
geom_point() +
xlab("gene set") +
ylab("log time (s)") +
ggtitle("runtime of algorithms using different gene sets") +
theme(
legend.text = element_text(size = 20),
legend.title = element_text(size = 22),
title = element_text(size = 24),
axis.title.x = element_text(size = 22),
axis.text.x = element_text(size = 20),
axis.title.y = element_text(size = 22),
axis.text.y = element_text(size = 20)
) +
scale_x_discrete(limits = geneset.limits, labels = geneset.labs) +
guides(linetype = guide_legend(override.aes = list(size = 2)))
}
}
return(list(
runtime.plot = runtime.plot,
cell.type.plots = cell.type.plots
))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.