R/trend_plot_tinyClusters.r

Defines functions trend_plot_tinyClusters

Documented in trend_plot_tinyClusters

#' Trend plot of outlier clusteres
#'
#' Plotting the number of tiny clusters and unclassified cases
#' as the subregional size (hex_len) and different number of clusters (k) are
#' used.
#'
#' @param root_dir A directory path pointing to the parent directory of the
#'   ouptput folders containing TIPC metric counts generated by
#'   \code{\link[TIPC]{count_TIPC_cat}} at various hexagonal lengths.
#' @param output_dir A directory path for saving output plot; if NULL, plots will
#' be saved in root_dir
#' @param hex_len_range A vector of 2 integer number indicating the range of
#'   hexagonal length. If unspecified (NULL), all found TIPC result folders of
#'   different hexagonal lengths will be processed.
#' @param hex_len_stepsize An integer indicating the step size of hexagonal
#'   length. If unspecified (NULL), all found TIPC result folders of
#'   different hexagonal lengths will be processed.
#' @param min_cluster_size An integer indicating the minimum size of TIPC
#'   cluster to be tested for survival analysis; default 10.
#' @param clustering_subfolder_nm A character string indicating the subfolder
#' name containing the TIPC clustering results within each hex_len.
#' @param pdf_width,pdf_height the width and height of the graphics region in inches.
#' The default values are 18 and 12; inherited from \code{\link[grDevices]{pdf}}
#' @export
#' @importFrom grDevices pdf dev.off
#' @import ggplot2
#' @import dplyr
#' @importFrom tidyr gather
trend_plot_tinyClusters <- function(root_dir =  NULL, hex_len_range = NULL, hex_len_stepsize =  NULL,
                                    output_dir=NULL,
                                    pdf_width=12, pdf_height=8,min_cluster_size = 30,
                                    clustering_subfolder_nm='ConsensusClusterPlus_test'){
  cluster_no <- k <- total_outlier_cases <- no_effective_cluster <- no_outlier_cluster <- NULL
  ## ======================
  ## check argument root_dir
  ## ======================
  if(is.null(root_dir)) stop('No root_dir is provided!\n')
  if(is.null(output_dir))output_dir <- root_dir
  ## ======================
  ## identify all result folders generated by tessellation
  ## ======================
  subfolder_nms <- list.dirs(path = root_dir, full.names = TRUE, recursive = FALSE)
  ## subsetting for output folders from tessellation
  subfolder_nms <- grep(x= subfolder_nms, pattern = 'TIPC_hexLen', value = TRUE)

  ## ======================
  ## subsettting for result folders specified by users
  ## ======================
  if(!is.null(hex_len_range)){
    hex_lens <- seq(from = min(hex_len_range), to = max(hex_len_range), by = hex_len_stepsize)
    subfolder_nms <- grep(x=subfolder_nms, pattern = paste0(hex_lens, collapse = '|'), value = TRUE)
  }

  ## ======================
  ## ======================
  ## consolidate all hex_len and k
  ## ======================
  ## ======================
  res_df <- list()
  ll <- 1
  for (f_nm in subfolder_nms){
    ## ---------------
    ## extract hex_len
    ## ---------------
    hex_lenX <- sapply(strsplit(f_nm, split = 'hexLen'), "[[", 2)
    hex_lenX <- as.integer(hex_lenX)

    ## ---------------
    ## loop over each k
    ## ---------------
    k_dirs <- list.dirs(path = file.path(f_nm,clustering_subfolder_nm), recursive = FALSE)
    k_dirs <- basename(k_dirs)
    k_dirs <- k_dirs[grep(x=k_dirs, 'k[0-9]')]

    k_dirs <- file.path(f_nm,clustering_subfolder_nm, k_dirs)
    for (kk in k_dirs){
      setwd(kk)
      cluster_kk <- read.csv(file = list.files(pattern = 'cluster_no_k'), as.is = TRUE)
      res_kk <- cluster_kk %>%
        group_by(cluster_no) %>%
        count(cluster_no)

      res_kk$hex_len <- hex_lenX
      res_kk$k <- max(res_kk$cluster_no)

      res_kk$no_outlier_cluster <- sum(res_kk$n<min_cluster_size)
      res_kk$no_effective_cluster <- sum(res_kk$n>=min_cluster_size)
      res_kk$total_outlier_cases <- sum(res_kk$n[res_kk$n<min_cluster_size])

      res_df[[ll]] <- res_kk
      ll <- ll+1
    }

  }# end all subfolder_nms i.e. hex_len

  all_hexLen_k <- do.call(rbind, res_df)

  ## ======================
  ## ======================
  ## plotting: effect of hex_len
  ## ======================
  ## ======================

  ## ---------------
  ## total. no. of outlier/unclassified cases
  ## ---------------
  plot_fnm <- file.path(output_dir, 'total_no_unclassifed_cases_by_hexlen_N_k.pdf')
  pl <- ggplot(data=all_hexLen_k, aes(x = k, y=total_outlier_cases)) +
    geom_point(size=4) +
    scale_x_discrete(limits=c(range(all_hexLen_k$k)))+
    ylab('total no. of unclassified tumors')+
    facet_wrap(~ hex_len) + theme_bw()+
    theme(axis.title = element_text(size = 26, face = 'bold'), axis.text = element_text(size = 24),
          strip.text.x = element_text(size = 26, face='bold'))
  ggsave(plot = pl, filename = plot_fnm, width=pdf_width, height=pdf_height)

  ## ---------------
  ## no. of outlier clusters & effective clusters
  ## ---------------
  plot_fnm <- file.path(output_dir, 'No_effective_vs_outlier_clusters_by_hexlen_N_k.pdf')
  pl <- ggplot(data=all_hexLen_k, aes(x = k, y=no_effective_cluster, color='no. of effective clusters')) +
    geom_point(size=4) +
    geom_point(size=4, aes(x = k, y=no_outlier_cluster, color='no. of outlier clusters')) +
    scale_x_discrete(limits=c(range(all_hexLen_k$k)))+
    ylab('No. of clusters')+
    facet_wrap(~ hex_len) + theme_bw()+
    theme(legend.position='bottom' ,
          legend.text = element_text(size = 26),legend.title = element_text(size = 26),
          axis.title = element_text(size = 26, face = 'bold'), axis.text = element_text(size = 24),
          strip.text.x = element_text(size = 26, face='bold'))+
    labs(color="Cluster type")+
    guides(color=guide_legend(ncol=1, override.aes = list(size = 7)))
  ggsave(plot = pl, filename = plot_fnm, width=pdf_width, height=pdf_height)

}
MPE-Lab/TIPC documentation built on Sept. 17, 2021, 6:33 p.m.