R/overlap_score_summary_vs_bin_size.R

Defines functions overlap_score_summary_vs_bin_size read_overlap_score_summary_vs_bin_size

Documented in overlap_score_summary_vs_bin_size

#' @importFrom tibble as_tibble
#' @importFrom utils file_test
#' @importFrom R.cache loadCache saveCache
read_overlap_score_summary_vs_bin_size <- function(dataset, chromosome, bin_sizes, rho, reference_rho = 1/2, window_size = 5L, nsamples = 50L, weights = c("by_length", "uniform"), domain_length = NULL, path = "overlapScoreSummary", force = FALSE, ..., verbose = FALSE) {
  chromosome <- as.integer(chromosome)
  bin_sizes <- as.integer(bin_sizes)
  stop_if_not(is.numeric(rho), length(rho) == 1L, !is.na(rho), rho > 0.0, rho <= 0.5)
  if (is.character(reference_rho)) {
    reference_rho <- switch(reference_rho,
      "50%" = 1/2,
      "same" = rho,
      stop("Unknown value on 'reference_rho': ", sQuote(reference_rho))
    )
  }
  stop_if_not(is.numeric(reference_rho), length(reference_rho) == 1L, !is.na(reference_rho), reference_rho > 0.0, reference_rho <= 0.5)
  window_size <- as.integer(window_size)
  nsamples <- as.integer(nsamples)
  weights <- match.arg(weights)
  
  if (!file_test("-d", path)) {
    dir.create(path, recursive = TRUE, showWarnings = FALSE)
    stop_if_not(file_test("-d", path))
  }

  ## Tags
  chromosome_tag <- sprintf("chr=%s", chromosome)
  test_tag <- sprintf("test=%.5f", rho)
  reference_tag <- sprintf("reference=%.5f", reference_rho)
  window_size_tag <- sprintf("window_size=%d", window_size)
  if (!is.null(domain_length)) {
    stop_if_not(is.numeric(domain_length), length(domain_length) == 2L, !anyNA(domain_length), all(domain_length > 0))
    domain_length_tag <- sprintf("domain_length=%.0f-%.0f", domain_length[1], domain_length[2])
  } else {
    domain_length_tag <- NULL
  }
  weights_tag <- sprintf("weights=%s", weights)
  nsamples_tag <- sprintf("nsamples=%d", nsamples)

  if (verbose) {
    message("read_overlap_score_summary_vs_bin_size() ...")
    message("- chromosome: ", chromosome)
    message("- rho: ", rho)
    message("- window_size: ", window_size)
    message("- weights: ", weights)
    message("- nsamples: ", nsamples)
  }

  key <- list(dataset = dataset, chromosome = chromosome, bin_sizes = sort(bin_sizes), rho = rho, reference_rho = reference_rho, window_size = window_size, nsamples = nsamples, weights = weights, domain_length = domain_length)
  dirs <- c("TopDomStudy", dataset)
  if (!force) {
    summary <- loadCache(key = key, dirs = dirs)
    if (!is.null(summary)) {
      if (verbose) {
        message("read_overlap_score_summary_vs_bin_size() ... cached")
      }
      return(summary)
    }
  }
  
  summary <- list()
  for (bb in seq_along(bin_sizes)) {
    bin_size <- bin_sizes[bb]
    bin_size_tag <- sprintf("bin_size=%.0f", bin_size)
    if (verbose) message(sprintf("Bin size #%d (%s with %s and %s on Chr %s) of %d ...", bb, bin_size, test_tag, reference_tag, chromosome, length(bin_sizes)))

    tags <- c(chromosome_tag, "cells_by_half", "avg_score", bin_size_tag, test_tag, reference_tag, window_size_tag, domain_length_tag, weights_tag, nsamples_tag)

    fullname <- paste(c(dataset, tags), collapse = ",")
    pathname_summary_kk <- file.path(path, sprintf("%s.rds", fullname))
    if (verbose) message("pathname_summary_kk: ", pathname_summary_kk)

    ## Calculate on the fly?
    if (!file_test("-f", pathname_summary_kk)) {
      message("overlap_score_summary_grid() ...")
      res <- overlap_score_summary_grid(dataset = dataset, chromosomes = chromosome, bin_sizes = bin_sizes, rhos = rho, reference_rhos = reference_rho, window_size = window_size, nsamples = nsamples, weights = weights, domain_length = domain_length, verbose = verbose)
      message("overlap_score_summary_grid() ... done")
    }
    
    ## Sanity check
    stop_if_not(file_test("-f", pathname_summary_kk))
    summary[[bb]] <- read_rds(pathname_summary_kk)
            
    if (verbose) message(sprintf("Bin size #%d (%s with %s and %s on Chr %s) of %d ... done", bb, bin_size, test_tag, reference_tag, chromosome, length(bin_sizes)))
  } ## for (bb ...)

  summary <- do.call(rbind, summary)
  summary <- as_tibble(summary)
  
  if (verbose) mprint(summary)

  saveCache(summary, key = key, dirs = dirs)

  if (verbose) message("read_overlap_score_summary_vs_bin_size() ... done")

  summary
} ## read_overlap_score_summary_vs_bin_size()




#' Calculate and Summarize TopDom Overlap Scores as Function of Bin Size
#'
#' @inheritParams overlap_score_summary_grid
#' @inheritParams overlap_score_summary_vs_fraction
#'
#' @return A three-dimensional character array of pathname names where the
#' first dimension specify `chromosomes`, the second `bin_sizes`, and
#' the third 'rhos'.
#'
#' @details
#' Image files are written to the \file{figures/} folder (created if missing).
#'
#' @seealso
#' Internal, [overlap_score_summary_grid()] is used to calculate overlap
#' scores over (chromosome, bin_size, rho).
#'
#' @importFrom ggplot2 aes aes_string geom_boxplot geom_jitter ggplot ggsave ggtitle stat_summary xlab ylab xlim ylim
#' @importFrom utils file_test
#' @export
overlap_score_summary_vs_bin_size <- function(dataset, chromosomes, bin_sizes, rhos, reference_rhos = rep(1/2, times = length(rhos)), window_size = 5L, nsamples = 50L, weights = c("by_length", "uniform"), domain_length = NULL, fig_path = "figures", fig_format = c("png", "pdf"), xlim = NULL, ylim_score = c(0,1), verbose = FALSE) {
  stopifnot(is.numeric(rhos), !anyNA(rhos), all(rhos > 0), all(rhos <= 1/2))
  if (is.character(reference_rhos)) {
    reference_rhos <- switch(reference_rhos,
      "50%" = rep(1/2, times = length(rhos)),
      "same" = rhos,
      stop("Unknown value on 'reference_rhos': ", sQuote(reference_rhos))
    )
  }
  stopifnot(is.numeric(reference_rhos), !anyNA(reference_rhos), all(reference_rhos > 0), all(reference_rhos <= 1/2))
  stopifnot(length(reference_rhos) == length(rhos), all(reference_rhos >= rhos))
  weights <- match.arg(weights)
  weights_tag <- sprintf("weights=%s", weights)

  if (!file_test("-d", fig_path)) {
    dir.create(fig_path, recursive = TRUE, showWarnings = FALSE)
    stop_if_not(file_test("-d", fig_path))
  }
  fig_format <- match.arg(fig_format)

  if (!is.null(xlim)) {
    stop_if_not(length(xlim) == 2L, is.numeric(xlim), all(is.finite(xlim)),
                xlim[2] > xlim[1], xlim[1] >= 0)
  }                
  stop_if_not(is.numeric(ylim_score), length(ylim_score) == 2L,
              all(ylim_score >= 0), all(ylim_score <= 1))

  pathnames <- overlap_score_summary_grid(dataset, chromosomes = chromosomes, bin_sizes = bin_sizes, rhos = rhos, reference_rhos = reference_rhos, window_size = window_size, nsamples = nsamples, weights = weights, domain_length = domain_length, verbose = verbose)

  nsamples_tag <- sprintf("nsamples=%d", nsamples)

  window_size <- as.integer(window_size)
  window_size_tag <- sprintf("window_size=%d", window_size)

  domain_length <- attr(pathnames, "domain_length", exact = TRUE)
  if (!is.null(domain_length)) {
    stop_if_not(is.numeric(domain_length), length(domain_length) == 2L, !anyNA(domain_length), all(domain_length > 0))
    domain_length_tag <- sprintf("domain_length=%.0f-%.0f", domain_length[1], domain_length[2])
  } else {
    domain_length_tag <- NULL
  }

  path <- "overlapScoreSummary"
  stop_if_not(file_test("-d", path))

  if (verbose) message("Plotting ...")

  for (cc in seq_along(chromosomes)) {
    chromosome <- chromosomes[cc]
    chromosome_tag <- sprintf("chr=%s", chromosome)

    if (verbose) message(sprintf("Chromosome #%d (%s) of %d ...", cc, chromosome_tag, length(chromosomes)))

    for (rr in seq_along(rhos)) {
      rho <- rhos[rr]
      test_tag <- sprintf("test=%.5f", rho)
      reference_rho <- reference_rhos[rr]
      reference_tag <- sprintf("reference=%.5f", reference_rho)

      if (verbose) message(sprintf("Fraction #%d (%g on Chr %s) of %d ... done", rr, rho, chromosome, length(rhos)))

      summary <- list()
      
      for (bb in seq_along(bin_sizes)) {
        bin_size <- bin_sizes[bb]
        bin_size_tag <- sprintf("bin_size=%.0f", bin_size)
        if (verbose) message(sprintf("Bin size #%d (%s bps with %g on Chr %s) of %d ...", bb, bin_size, rho, chromosome, length(bin_sizes)))

        tags <- c(chromosome_tag, "cells_by_half", "avg_score", bin_size_tag, test_tag, reference_tag, window_size_tag, domain_length_tag, weights_tag, nsamples_tag)
        fullname <- paste(c(dataset, tags), collapse = ",")
        pathname_summary_kk <- file.path(path, sprintf("%s.rds", fullname))
        if (verbose) message("pathname_summary_kk: ", pathname_summary_kk)
        ## Sanity check
        stop_if_not(identical(pathname_summary_kk, pathnames[cc,bb,rr]))
        stop_if_not(file_test("-f", pathname_summary_kk))
        
        summary[[bb]] <- read_rds(pathname_summary_kk)
          
        if (verbose) message(sprintf("Bin size #%d (%s bps with %g on Chr %s) of %d ... already done", bb, bin_size, rho, chromosome, length(bin_sizes)))
      } ## for (bb ...)

      summary <- do.call(rbind, summary)
      if (verbose) mprint(summary)

      dw <- diff(range(bin_sizes)) / length(bin_sizes)

      length_signals <- c(
        "reference Q25 length"    = "ref_len_q0.25",
        "reference median length" = "ref_len_q0.50",
        "reference Q75 length"    = "ref_len_q0.75",
        "test Q25 length"         = "test_len_q0.25",
        "test median length"      = "test_len_q0.50",
        "test Q75 length"         = "test_len_q0.75"
      )
      signals <- c(mean = "mean", median = "`50%`", length_signals)

      fraction <- NULL; rm(list = "fraction") ## To please R CMD check

      for (signal_label in names(signals)) {
        signal <- signals[[signal_label]]

        gg <- ggplot(summary, aes_string(x = "bin_size", y = signal))

        gg <- gg + geom_boxplot(aes(group = as.factor(bin_size)), width = 0.2*dw)
        gg <- gg + geom_jitter(height = 0, width = 0.05*dw, size = 0.7, colour = "darkgray")

        gg <- gg + stat_summary(aes_string(y = signal, group = 1L),
                                fun.y = function(x) mean(x, trim = 0.10),
                                geom = "line", size = 2L, group = 1L)

        params <- c(sprintf("estimator: %s", signal_label),
                    sprintf("weights: %s", weights),
                    sprintf("domains: %.0f-%.0f", domain_length[1], domain_length[2]))
        subtitle <- sprintf("chromosome %s, test=%.5f, reference=%.5f (%d samples)\n[%s]",
                            chromosome, rho, reference_rho, nsamples, paste(params, collapse = "; "))

        gg <- gg + ggtitle(dataset, subtitle = subtitle)
        
        gg <- gg + xlab("bin size (bps)")
        if (is.null(xlim)) xlim <- c(0, max(bin_sizes))
        xlim_tag <- sprintf("xlim=%g-%g", xlim[1], xlim[2])
        gg <- gg + xlim(xlim[1], xlim[2])
        
        ylim_tag <- NULL
        if (signal_label %in% names(length_signals)) {
          gg <- gg + ylab("domain length (bps)")
          ylim <- c(0, 2e6)
        } else {
          gg <- gg + ylab("average overlap score")
          ylim <- ylim_score
          if (any(ylim != c(0,1))) {
            ylim_tag <- sprintf("ylim=%g-%g", ylim[1], ylim[2])
          }
        }
        gg <- gg + ylim(ylim[1], ylim[2])

        signal <- gsub("`50%`", "median", signal)
        tags <- sprintf("%s,chr=%s,%s,avg_score-vs-bin_size,test=%.5f,reference=%.5f,window_size=%d,nsamples=%d,signal=%s,weights=%s", dataset, chromosome, "cells_by_half", rho, reference_rho, window_size, nsamples, signal, weights)
        if (!is.null(xlim_tag)) tags <- paste(c(tags, xlim_tag), collapse=",")
        if (!is.null(ylim_tag)) tags <- paste(c(tags, ylim_tag), collapse=",")
        filename <- sprintf("%s.%s", paste(c(tags, domain_length_tag), collapse = ","), fig_format)
        if (verbose) suppressMessages <- identity
        fig_pathname <- file.path(fig_path, filename)
        suppressMessages(ggsave(gg, filename=fig_pathname))
        if (verbose) message(" - Figure written: ", fig_pathname)
      } ## for (signal ...)
        
      if (verbose) message(sprintf("Fraction #%d (%g on Chr %s) of %d ... done", rr, rho, chromosome, length(rhos)))
    } ## for (rr ...)
    
    if (verbose) message(sprintf("Chromosome #%d (%s) of %d ... done", cc, chromosome_tag, length(chromosomes)))
  } ## for (cc ...)

  if (verbose) message("Plotting ... done")

  pathnames
}
HenrikBengtsson/TopDomStudy documentation built on May 14, 2021, 1:49 p.m.