#' convert result lists from benchmark() into data frame for plotting
#'
#' @param results.all list of lists, as generated by the 'benchmark()' function
#' @return data frame containing information about every deconvolution result
#' in the dataset
#' @export
prepare_data <- function(results.all) {
# parameter check
# however, if results.all is not a list as written to file by 'benchmark'
# the function will most likely fail
if (!is.list(results.all)) {
stop(
"results.all must be a list of lists that were returned
by deconvolute or one of the simulation functions"
)
}
df <- c()
# cases sample/geneset/bulk simulation
if ("bulk.props" %in% names(results.all)) {
# extract real proportions from lists
real.props <- results.all$bulk.props
results.list <- results.all[-which(names(results.all) == "bulk.props")]
if (length(results.list) == 1) {
results.list <- results.list[[1]]
}
# iterate through the results list
# depth of the list depends on the benchmark
for (i in 1:length(results.list)) {
results <- results.list[[i]]
for(res in results) {
# if the list has three levels the top level represents the geneset
# or amount of samples used
# if res contains lists, it is either geneset or sample benchmark
if (is.list(res[[1]])) {
for(r in res) {
scores <- c()
errors <- c()
name <- r$name
time <- as.numeric(r$times)
# try to determine what information is present in the lists
if (any(is.na(as.numeric(names(results.list)[i])))) {
geneset <- names(results.list)[i]
fraction <- 100
}else{
geneset <- NA
fraction <- as.numeric(names(results.list)[i])
}
# calculate condition number if reference is available
if (!is.null(r$sig.matrix)) {
cond.num <- kappa(r$sig.matrix, exact = TRUE)
}else{
cond.num <- NA
}
# if the deconvolution worked evaluate with correlation
if (!all(is.null(r$est.props)) && !all(is.na(r$est.props))) {
# performance per cell type
for (t in intersect(
rownames(r$est.props),
rownames(real.props))
) {
temp.score <- bootstrap_results(real.props[t, ], r$est.props[t, ])
if (is.null(temp.score)) {
next
}
scores <- c(scores, temp.score$mean_cor)
errors <- c(errors, temp.score$error)
df <- rbind(
df,
c(name, temp.score$mean_cor, temp.score$error, t, geneset,
time, fraction, cond.num)
)
}
# overall performance
# (average over per-cell-type-performance);
# store as cell type "overall"
if (!is.null(scores)) {
if (!all(is.na(scores))) {
avg_score <- mean(scores[!is.na(scores)])
} else {
avg_score <- NA
}
if (!all(is.na(errors))) {
avg_error <- mean(errors[!is.na(errors)])
} else {
avg_error <- NA
}
df <- rbind(
df,
c(name, avg_score, avg_error, "overall", geneset,
time, fraction, cond.num
)
)
}
}else{
df <- rbind(
df,
c(name, NA, NA, "overall", geneset,
time, fraction, cond.num
)
)
}
}
}else{
# this is executed if the list has two levels (bulk benchmark)
# this is actually the same as above,
# only with fixed values for fraction and gene set
scores <- c()
errors <- c()
r <- res
name <- r$name
time <- as.numeric(r$times)
# calculate condition number if reference is available
if (!is.null(r$sig.matrix)) {
cond.num <- kappa(r$sig.matrix, exact = T)
}else{
cond.num <- NA
}
# if the deconvolution worked, evaluate with correlation
if (!all(is.null(r$est.props)) && !all(is.na(r$est.props))) {
# performance per cell type
common_ct <- intersect(
rownames(r$est.props),
rownames(real.props)
)
# include only datasets with at least one cell type
if (length(common_ct) > 0) {
for (t in rownames(real.props)) {
if (t %in% rownames(r$est.props)) {
temp.score <- bootstrap_results(
real.props[t,],
r$est.props[t,]
)
if (is.null(temp.score)) {
next
}
df <- rbind(
df,
c(name, temp.score$mean_cor, temp.score$error, t, NA,
time, 100, cond.num)
)
}else{
temp.score <- list(mean_cor = NA, error = NA)
}
scores <- c(scores, temp.score$mean_cor)
errors <- c(errors, temp.score$error)
}
# overall performance
if (!all(is.na(scores))) {
avg_score <- mean(scores[!is.na(scores)])
} else {
avg_score <- NA
}
if (!all(is.na(errors))) {
avg_error <- mean(errors[!is.na(errors)])
} else {
avg_error <- NA
}
df <- rbind(
df,
c(name, avg_score, avg_error, "overall", NA,
time, 100, cond.num)
)
}
}else{
df <- rbind(
df,
c(name, NA, NA, "overall", NA,
time, 100, cond.num)
)
}
}
}
}
df <- as.data.frame(df)
# if a column is missing, something was wrong with the data
if (ncol(df) != 8) {
return(data.frame())
}
colnames(df) <- c(
"algorithm", "score", "error", "cell_type",
"geneset", "time",
"fraction", "condition_number"
)
df$score <- as.numeric(as.character(df$score))
df$error <- as.numeric(as.character(df$error))
df$time <- as.numeric(as.character(df$time))
df$condition_number <- as.numeric(as.character(df$condition_number))
df$fraction <- as.numeric(as.character(df$fraction))
}else{
if (all(grepl("subtype.frac", names(results.all), fixed = TRUE))) {
# case subtype benchmark
for (subtype.column in names(results.all)) {
results.list <- results.all[[subtype.column]]
avg.cells <- as.numeric(
strsplit(subtype.column, ".", fixed = TRUE)[[1]][3]
)
c.true <- results.list[["c.true"]]
c.true.coarse <- results.list[["c.true.coarsly"]]
for (j in seq_len(length(results.list) - 2)) {
rep_name <- paste0("rep", j)
c.est.list <- results.list[[rep_name]][["c.estimated.list"]]
c.est.list.coarse <- results.list[[rep_name]][[
"c.estimated.coarsly.list"
]]
# fine cell types
for (a in names(c.est.list)) {
a.est <- c.est.list[[a]]
coarse <- FALSE
scores <- c()
errors <- c()
if (!is.null(a.est)) {
for (t in intersect(rownames(a.est), rownames(c.true))) {
temp.score <- bootstrap_results(c.true[t, ], a.est[t, ])
if(is.null(temp.score)) {
next
}
scores <- c(scores, temp.score$mean_cor)
errors <- c(errors, temp.score$error)
df <- rbind(
df,
c(a, temp.score$mean_cor, temp.score$error, t, NA,
NA, NA, NA, coarse, avg.cells)
)
}
if(!is.null(scores)){
if (!all(is.na(scores))) {
avg_score <- mean(scores[!is.na(scores)])
} else {
avg_score <- NA
}
if (!all(is.na(errors))) {
avg_error <- mean(errors[!is.na(errors)])
} else {
avg_error <- NA
}
}
df <- rbind(
df,
c(a, avg_score, avg_error, "overall", NA,
NA, NA, NA, coarse, avg.cells)
)
}else{
df <- rbind(
df,
c(a, NA, NA, "overall", NA,
NA, NA, NA, coarse, avg.cells)
)
}
}
# aggregated cell types (coarse)
for (a in names(c.est.list.coarse)) {
a.est <- c.est.list.coarse[[a]]
coarse <- TRUE
scores <- c()
errors <- c()
if (!is.null(a.est)) {
for (t in intersect(rownames(a.est), rownames(c.true.coarse))) {
temp.score <- bootstrap_results(c.true.coarse[t, ], a.est[t, ])
if (is.null(temp.score)) {
next
}
scores <- c(scores, temp.score$mean_cor)
errors <- c(errors, temp.score$error)
df <- rbind(
df,
c(a, temp.score$mean_cor, temp.score$error, t, NA,
NA, NA, NA, coarse, avg.cells)
)
}
if (!is.null(scores)) {
if (!all(is.na(scores))) {
avg_score <- mean(scores[!is.na(scores)])
} else {
avg_score <- NA
}
if (!all(is.na(errors))) {
avg_error <- mean(errors[!is.na(errors)])
} else {
avg_error <- NA
}
df <- rbind(
df,
c(a, avg_score, avg_error, "overall", NA,
NA, NA, NA, coarse, avg.cells)
)
}
}else{
df <- rbind(
df,
c(a, NA, NA, "overall", NA,
NA, NA, NA, coarse, avg.cells)
)
}
}
}
}
df <- as.data.frame(df)
# if a column is missing, something was wrong with the data
if (ncol(df) != 10) {
return(data.frame())
}
colnames(df) <- c(
"algorithm", "score", "error", "cell_type",
"geneset", "time", "fraction",
"condition_number", "coarse", "cluster_size"
)
df$score <- as.numeric(as.character(df$score))
df$error <- as.numeric(as.character(df$error))
df$time <- as.numeric(as.character(df$time))
df$condition_number <- as.numeric(as.character(df$condition_number))
df$fraction <- as.numeric(as.character(df$fraction))
df$coarse <- as.logical(as.character(df$coarse))
df$cluster_size <- as.numeric(as.character(df$cluster_size))
}
}
saveRDS(df, "prepared_data.rds")
return(df)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.