R/assess_quality.R

Defines functions assess_quality minmax outlier find_AUC

Documented in assess_quality

find_AUC <- function(scores) {
  # Local scores object
  scores <- scores
  # Find the number of iterations
  num_iter <- length(scores[[1]])
  # Get the vector of k-values
  k_vec <- names(scores[[1]][[1]])
  # Get the GT gene set labels
  gt_sets <- names(scores)

  # The table containing the AUC values
  AUC_tbl <- list()
  gt_cnt <- 0

  for(gt_set in gt_sets) {
    gt_cnt <- gt_cnt + 1
    all_AUC_values <- c()
    # Gather the gt AUC values
    for(itr in seq_len(num_iter)) {
      all_AUC_values <- cbind(all_AUC_values,
        vapply(k_vec, function(k) {
        pred <- ROCR::prediction(scores[[gt_set]][[itr]][[k]]$prob_vec,
                           scores[[gt_set]][[itr]][[k]]$pos_vec)
        return(ROCR::performance(pred, "auc")@y.values[[1]])}, numeric(1)))
    }
    AUC_tbl[[gt_cnt]] <- all_AUC_values
  }
  names(AUC_tbl) <- gt_sets

  ggp_tbl <- data.frame()
  k_vec <- as.factor(as.numeric(k_vec))
  for(l in seq_len(length(AUC_tbl))) {
    tmp_tbl <- data.frame(rep(k_vec, num_iter),
                          rep(gt_sets[[l]], length(k_vec)*num_iter),
                          as.vector(AUC_tbl[[l]]))
    colnames(tmp_tbl) <- c("k", "Target_Property", "AUC_Value")
    ggp_tbl <- rbind(ggp_tbl, tmp_tbl)
  }

  list(ggp_tbl, k_vec)
}

# Make the ggplot2 boxplot display outliers and min to max values
outlier <- function(x) {
  subset(x, x == max(x) | x == min(x))
}

minmax <- function(x) {
  r <- stats::quantile(x, probs = c(0.00, 0.25, 0.5, 0.75, 1))
  names(r) <- c("ymin", "lower", "middle", "upper", "ymax")
  r
}

#' Generate the cluster quality figure.
#'
#' This function takes the GECO scores table generated by the score_clusters
#' function and finds the ROC/AUC values for the scores. These scores are then
#' arranged into a dataframe for display with ggplot2.
#'
#' @param GECO_scores The scores dataframe output by score_clusters.
#' @return A ggplot2 object ready for plotting e.g. plot(returned_object)
#' @examples
#' # Create pseudo scores tables: >1 required
#' df10 <- data.frame(clust_num = sample(1:10, 200, replace = TRUE), gene_name = sample(paste0(rep("Gene.", 200), seq(1:200)), 200, replace = FALSE), pos_vec = sample(c(rep("FALSE",197), rep("TRUE",3)), 200, replace = FALSE), prob_vec = sample(runif(10), 200, replace = TRUE))
#' df12 <- data.frame(clust_num = sample(1:10, 200, replace = TRUE), gene_name = sample(paste0(rep("Gene.", 200), seq(1:200)), 200, replace = FALSE), pos_vec = sample(c(rep("FALSE",197), rep("TRUE",3)), 200, replace = FALSE), prob_vec = sample(runif(10), 200, replace = TRUE))
#' # Create the scores structure and insert our tables
#' scores <- list()
#' scores$`Ground Truth Set X`$`Iteration 1`$`10` <- df10
#' scores$`Ground Truth Set X`$`Iteration 1`$`12` <- df12
#' fig <- assess_quality(scores)
#' plot(fig)
#' @export
#' @importFrom ggplot2 ggplot aes stat_summary position_dodge stat_boxplot ylim ylab xlab labs theme_bw ggtitle geom_vline theme element_blank element_text rel element_line element_rect
assess_quality <- function(GECO_scores) {
  AUC_list <- find_AUC(GECO_scores)
  AUC_tbl <- AUC_list[[1]]
  k_vec <- AUC_list[[2]]

  # Local definition
  k <- AUC_Value <- Target_Property <- NULL

  gridlines <- c(rep(0,length(k_vec)))
  for(i in seq_len((length(k_vec)-1))) {
    x_1 <- k_vec[i]
    x_2 <- k_vec[i+1]
    gridlines[i+1] <- mean(c(x_1, x_2))
  }

  ggplot_fig <- ggplot(AUC_tbl, aes(x = k, y = AUC_Value)) +
    stat_summary(aes(color = Target_Property), width = 0.8, fun.data = minmax,
                 position=position_dodge(0.8), geom = "boxplot") +
    stat_summary(aes(color = Target_Property, xmin = ), fun = outlier,
                 position=position_dodge(0.8), geom = 'point',
                 shape = 95, size = 3.5) +
    stat_boxplot(aes(color = Target_Property), width = 0.8,
                 position=position_dodge(0.8), geom = 'errorbar', coef = 10000) +
    ylim(0,1) +
    ylab("GECO Metric") +
    xlab("Number of Clusters") +
    labs(color = "Ground Truth Genes") +
    theme_bw() +
    ggtitle("GECO Cluster Quality Assessment") +
    geom_vline(xintercept=seq(from = 1.5, to = length(unique(k_vec))-0.5, by = 1),
               lwd=0.3, alpha = 0.8, colour="darkgrey") +
    theme(panel.border = element_rect(fill=NA, size = 1.5),
          panel.grid.major.x = element_blank(),
          panel.grid.major.y = element_line(colour = "darkgrey"),
          plot.title = element_text(size = rel(2.0), hjust = 0.5),
          legend.position = c(0.85,0.25),
          legend.background = element_rect(fill="white",
                                           size=0.5, linetype="solid",
                                           colour ="black"),
          legend.title = element_text(size = rel(1.5)),
          legend.text = element_text(size = rel(1.25)),
          axis.text.x = element_text(angle = 90, hjust = 1, size = rel(1.5)),
          axis.title.x = element_text(size = rel(2.0)),
          axis.text.y = element_text(size = rel(2.0)),
          axis.title.y = element_text(size = rel(2.0)))

  return(ggplot_fig)
}
JasonPBennett/GECO documentation built on Aug. 30, 2021, 4:30 p.m.