R/prepare_data.R

Defines functions prepare_data

Documented in prepare_data

#' convert result lists from benchmark() into data frame for plotting
#'
#' @param results.all list of lists, as generated by the 'benchmark()' function
#' @return data frame containing information about every deconvolution result
#' in the dataset
#' @export

prepare_data <- function(results.all) {
    # parameter check
    # however, if results.all is not a list as written to file by 'benchmark'
    # the function will most likely fail
    
    if (!is.list(results.all)) {
        stop(
          "results.all must be a list of lists that were returned
          by deconvolute or one of the simulation functions"
        )
    }

    df <- c()
    # cases sample/geneset/bulk simulation
    if ("bulk.props" %in% names(results.all)) {
    # extract real proportions from lists
      real.props <- results.all$bulk.props
      results.list <- results.all[-which(names(results.all) == "bulk.props")]
      if (length(results.list) == 1) {
        results.list <- results.list[[1]]
      }
      # iterate through the results list
      # depth of the list depends on the benchmark
      for (i in 1:length(results.list)) {
        results <- results.list[[i]]
        for(res in results) {
          # if the list has three levels the top level represents the geneset
          # or amount of samples used
          # if res contains lists, it is either geneset or sample benchmark
            if (is.list(res[[1]])) {
              for(r in res) {
                scores <- c()
                errors <- c()
                name <- r$name
                time <- as.numeric(r$times)

                # try to determine what information is present in the lists
                if (any(is.na(as.numeric(names(results.list)[i])))) {
                    geneset <- names(results.list)[i]
                    fraction <- 100
                }else{
                    geneset <- NA
                    fraction <- as.numeric(names(results.list)[i])
                }

                # calculate condition number if reference is available
                if (!is.null(r$sig.matrix)) {
                    cond.num <- kappa(r$sig.matrix, exact = TRUE)
                }else{
                    cond.num <- NA
                }

                # if the deconvolution worked evaluate with correlation
                if (!all(is.null(r$est.props)) && !all(is.na(r$est.props))) {
                    # performance per cell type
                    for (t in intersect(
                      rownames(r$est.props),
                      rownames(real.props))
                    ) {
                      temp.score <- bootstrap_results(real.props[t, ], r$est.props[t, ])
                      if (is.null(temp.score)) {
                        next
                      }
                      
                      scores <- c(scores, temp.score$mean_cor)
                      errors <- c(errors, temp.score$error)
                      df <- rbind(
                        df,
                        c(name, temp.score$mean_cor, temp.score$error, t, geneset,
                          time, fraction, cond.num)
                      )
                    }
                    # overall performance
                    # (average over per-cell-type-performance);
                    # store as cell type "overall"
                    if (!is.null(scores)) {
                      if (!all(is.na(scores))) {
                        avg_score <- mean(scores[!is.na(scores)])
                      } else {
                        avg_score <- NA
                      }
                      if (!all(is.na(errors))) {
                        avg_error <- mean(errors[!is.na(errors)])
                      } else {
                        avg_error <- NA
                      }
                      df <- rbind(
                        df,
                        c(name, avg_score, avg_error, "overall", geneset,
                          time, fraction, cond.num
                        )
                      )
                    }
                }else{
                    df <- rbind(
                      df,
                      c(name, NA, NA, "overall", geneset,
                        time, fraction, cond.num
                      )
                    )
                }
              }
            }else{
                  # this is executed if the list has two levels (bulk benchmark)
                  # this is actually the same as above,
                  # only with fixed values for fraction and gene set
                  scores <- c()
                  errors <- c()
                  r <- res
                  name <- r$name
                  time <- as.numeric(r$times)
                  # calculate condition number if reference is available
                  if (!is.null(r$sig.matrix)) {
                      cond.num <- kappa(r$sig.matrix, exact = T)
                  }else{
                      cond.num <- NA
                  }
                  # if the deconvolution worked, evaluate with correlation
                  if (!all(is.null(r$est.props)) && !all(is.na(r$est.props))) {
                    # performance per cell type
                    common_ct <- intersect(
                      rownames(r$est.props),
                      rownames(real.props)
                    )
                    # include only datasets with at least one cell type
                    if (length(common_ct) > 0) {
                      for (t in rownames(real.props)) {
                        if (t %in% rownames(r$est.props)) {
                          temp.score <- bootstrap_results(
                            real.props[t,],
                            r$est.props[t,]
                          )
                          if (is.null(temp.score)) {
                            next
                          }
                          df <- rbind(
                            df,
                            c(name, temp.score$mean_cor, temp.score$error, t, NA,
                              time, 100, cond.num)
                          )
                        }else{
                          temp.score <- list(mean_cor = NA, error = NA)
                        }
                        scores <- c(scores, temp.score$mean_cor)
                        errors <- c(errors, temp.score$error)
                      }
                      # overall performance
                      if (!all(is.na(scores))) {
                        avg_score <- mean(scores[!is.na(scores)])
                      } else {
                        avg_score <- NA
                      }
                      if (!all(is.na(errors))) {
                        avg_error <- mean(errors[!is.na(errors)])
                      } else {
                        avg_error <- NA
                      }
                      df <- rbind(
                        df,
                        c(name, avg_score, avg_error, "overall", NA,
                          time, 100, cond.num)
                      )
                    }
                  }else{
                    df <- rbind(
                      df,
                      c(name, NA, NA, "overall", NA,
                        time, 100, cond.num)
                    )
                  }
            }
        }
      }
      df <- as.data.frame(df)
      # if a column is missing, something was wrong with the data
      if (ncol(df) != 8) {
        return(data.frame())
      }
      colnames(df) <- c(
        "algorithm", "score", "error", "cell_type",
        "geneset", "time",
        "fraction", "condition_number"
      )
      df$score <- as.numeric(as.character(df$score))
      df$error <- as.numeric(as.character(df$error))
      df$time <- as.numeric(as.character(df$time))
      df$condition_number <- as.numeric(as.character(df$condition_number))
      df$fraction <- as.numeric(as.character(df$fraction))
    }else{
      if (all(grepl("subtype.frac", names(results.all), fixed = TRUE))) {
        # case subtype benchmark
        for (subtype.column in names(results.all)) {
          results.list <- results.all[[subtype.column]]
          avg.cells <- as.numeric(
            strsplit(subtype.column, ".", fixed = TRUE)[[1]][3]
          )
          c.true <- results.list[["c.true"]]
          c.true.coarse <- results.list[["c.true.coarsly"]]
          
          for (j in seq_len(length(results.list) - 2)) {
            rep_name <- paste0("rep", j)
            c.est.list <- results.list[[rep_name]][["c.estimated.list"]]
            c.est.list.coarse <- results.list[[rep_name]][[
              "c.estimated.coarsly.list"
            ]]

            # fine cell types
            for (a in names(c.est.list)) {
                a.est <- c.est.list[[a]]
                coarse <- FALSE
                scores <- c()
                errors <- c()
                if (!is.null(a.est)) {
                    for (t in intersect(rownames(a.est), rownames(c.true))) {
                        temp.score <- bootstrap_results(c.true[t, ], a.est[t, ])
                        if(is.null(temp.score)) {
                          next
                        }
                        scores <- c(scores, temp.score$mean_cor)
                        errors <- c(errors, temp.score$error)
                        df <- rbind(
                          df,
                          c(a, temp.score$mean_cor, temp.score$error, t, NA, 
                            NA, NA, NA, coarse, avg.cells)
                        )
                    }
                    if(!is.null(scores)){
                      if (!all(is.na(scores))) {
                        avg_score <- mean(scores[!is.na(scores)])
                      } else {
                        avg_score <- NA
                      }
                      if (!all(is.na(errors))) {
                        avg_error <- mean(errors[!is.na(errors)])
                      } else {
                        avg_error <- NA
                      }
                    }
                    df <- rbind(
                      df,
                      c(a, avg_score, avg_error, "overall", NA,
                        NA, NA, NA, coarse, avg.cells)
                    )
                }else{
                  df <- rbind(
                    df,
                    c(a, NA, NA, "overall", NA,
                      NA, NA, NA, coarse, avg.cells)
                  )
                }
            }
            # aggregated cell types (coarse)
            for (a in names(c.est.list.coarse)) {
              a.est <- c.est.list.coarse[[a]]
              coarse <- TRUE
              scores <- c()
              errors <- c()
              if (!is.null(a.est)) {
                for (t in intersect(rownames(a.est), rownames(c.true.coarse))) {
                  temp.score <- bootstrap_results(c.true.coarse[t, ], a.est[t, ])
                  if (is.null(temp.score)) {
                    next
                  }
                  scores <- c(scores, temp.score$mean_cor)
                  errors <- c(errors, temp.score$error)
                  df <- rbind(
                    df,
                    c(a, temp.score$mean_cor, temp.score$error, t, NA,
                      NA, NA, NA, coarse, avg.cells)
                  )
                }
                if (!is.null(scores)) {
                  if (!all(is.na(scores))) {
                    avg_score <- mean(scores[!is.na(scores)])
                  } else {
                    avg_score <- NA
                  }
                  if (!all(is.na(errors))) {
                    avg_error <- mean(errors[!is.na(errors)])
                  } else {
                    avg_error <- NA
                  }
                  df <- rbind(
                    df,
                    c(a, avg_score, avg_error, "overall", NA,
                       NA, NA, NA, coarse, avg.cells)
                  )
                }
              }else{
                df <- rbind(
                  df,
                  c(a, NA, NA, "overall", NA,
                    NA, NA, NA, coarse, avg.cells)
                )
              }
            }
          }
        }

        df <- as.data.frame(df)
        # if a column is missing, something was wrong with the data
        if (ncol(df) != 10) {
          return(data.frame())
        }
        colnames(df) <- c(
          "algorithm", "score", "error", "cell_type",
          "geneset", "time", "fraction",
          "condition_number", "coarse", "cluster_size"
        )
        df$score <- as.numeric(as.character(df$score))
        df$error <- as.numeric(as.character(df$error))
        df$time <- as.numeric(as.character(df$time))
        df$condition_number <- as.numeric(as.character(df$condition_number))
        df$fraction <- as.numeric(as.character(df$fraction))
        df$coarse <- as.logical(as.character(df$coarse))
        df$cluster_size <- as.numeric(as.character(df$cluster_size))
    }
  }
    saveRDS(df, "prepared_data.rds")
  return(df)
}
MarianSchoen/DMC documentation built on Aug. 2, 2022, 3:05 p.m.