R/score_clusters.R

Defines functions score_clusters get_GT_genes

Documented in score_clusters

get_GT_genes <- function(GT_dir) {
  gt_sets <- list()
  if(!dir.exists(GT_dir)) {
    stop("Ground truth directory not found! Make sure to supply the full path.")
  } else {
    fnames <- list.files(GT_dir, pattern="*.csv", full.names=TRUE)
  }

  # Append the GT sets to our GT list
  lbls <- c()
  for(file in fnames) {
    tmp <- utils::read.table(file, sep = ",", header = FALSE, stringsAsFactors = FALSE)
    lbls <- c(lbls, tmp[1,])
    gt_sets <- append(gt_sets, as.data.frame(tmp[-1,], stringsAsFactors = FALSE))
  }
  names(gt_sets) <- lbls

  gt_sets
}

#' Score the k-means clusters.
#'
#' This function takes the k-means clusters generated by generate_clusters and
#' computes the GECO scores for each gene within each cluster. This scores table
#' is meant to be used with the final function in the GECO process:
#' assess_quality.
#'
#' @param km_clusters The k-means clusters generated by generate_clusters
#' @param GT_dir The directory holding the ground truth .csv files
#' @return The scores table containing the GECO scored clusters
#' @examples
#' clusters <- list()
#' # This replicates the structure of the clusters returned by 'generate_clusters'
#' df <- data.frame(replicate(10,sample(-1:10,200,rep=TRUE)))
#' rownames(df) <- paste0(rep("Gene.", 200), seq(1:200))
#' clusters$`Iteration 1`$`10` <- kmeans(df, centers = 10)
#' # This replicates a ground truth set
#' gt_set <- c("Ground Truth Set X", "Gene.1", "Gene.10", "Gene.50")
#' # Store the Ground Truth set in its own directory
#' dir.create("./GECOdata")
#' write.table(gt_set, file = "./GECOData/gs_genes.csv", row.names = FALSE, col.names = FALSE)
#' # Score the clusters using the ground truth sets found in /GECOdata
#' scores <- score_clusters(clusters, "./GECOdata")
#' @export
score_clusters <- function(km_clusters, GT_dir) {
  # Get the ground truth sets
  gt_sets <- get_GT_genes(GT_dir)

  # Ensure that the ground truth genes can be found within the dataset
  all_genes <- names(km_clusters[[1]][[1]]$cluster)
  for(i in seq_len(length(gt_sets))) {
    if(sum(gt_sets[[i]] %in% all_genes) <= 0) {
      stop(paste0("No genes from the ground truth set '", names(gt_sets[i]), "' can be found in the data!"))
    }
  }

  # Create the scores list
  scores <- list()
  gt_cnt <- 0

  for(gt_set in gt_sets){
    gt_cnt <- gt_cnt + 1
    # Store the scoring tables
    gt_ROC_list <- vector(mode = "list", length(km_clusters))
    # Find number of iterations
    num_iter <- length(km_clusters)

    # Work through the iterations: 1 - num_iter
    for(itr in seq_len(num_iter)) {
      clust_itr <- km_clusters[[itr]]
      k_vec <- names(clust_itr)

      gt_ROC <- vector(mode = "list", sum(clust_itr[[itr]]$size))
      # Work through the k-values: kmin - kmax
      for(k in k_vec) {
        clust <- clust_itr[[k]]
        clust_num <- vector(mode = "list", as.numeric(k))
        gene_name <- vector(mode = "list", as.numeric(k))
        pos_vec <- vector(mode = "list", as.numeric(k))
        prob_vec <- vector(mode = "list", as.numeric(k))
        # Check each cluster for GT genes
        for(i in seq_len(k)) {
          gene_name[[i]] <- names(which(clust$cluster == i))
          # If cluster is not a singlet
          if(clust$size[i]>1) {
            clust_num[[i]] <- rep(i, clust$size[i])
            pos_vec[[i]] <- gene_name[[i]] %in% gt_set
            gt_bf <- ( (length(gt_set) - pos_vec[[i]]) / (sum(clust$size) - 1) )
            prob_vec[[i]] <- ( (sum(pos_vec[[i]]) - pos_vec[[i]]) / (clust$size[i] - 1) ) / gt_bf
          }
          # If cluster is a singlet
          else {
            clust_num[[i]] <- i
            pos_vec[[i]] <- gene_name[[i]] %in% gt_set
            prob_vec[[i]] <- length(gt_set) / (sum(clust$size) - 1)
          }
        }
        clust_num <- unlist(clust_num)
        gene_name <- unlist(gene_name)
        pos_vec <- unlist(pos_vec)
        prob_vec <- unlist(prob_vec)

        score_tbl <- data.frame(clust_num, gene_name, pos_vec, prob_vec)
        gt_ROC[[k]] <- score_tbl
      }
      gt_ROC <- Filter(Negate(is.null), gt_ROC)
      names(gt_ROC) <- k_vec
      gt_ROC_list[[itr]] <- gt_ROC
    }
    names(gt_ROC_list) <- paste0("Iteration ", c(seq_len(num_iter)), sep = "")
    scores[[gt_cnt]] <- gt_ROC_list
  }
  names(scores) <- names(gt_sets)

  return(scores)
}
JasonPBennett/GECO documentation built on Aug. 30, 2021, 4:30 p.m.