R/sharingIndex.R

Defines functions plot_ranking_sharing_index get_overlaps_sign sharing_index ranking_index get_ranking_sharing_index

Documented in get_overlaps_sign get_ranking_sharing_index plot_ranking_sharing_index ranking_index sharing_index

#' Get ranking and sharing indexes
#'
#' This function calculates sharing and ranking indexes for a set of samples and then
#' plots the result.
#' @param peak_files Character vector with the path of the peak files.
#' @param bam_files Character vector with the path of the bam files.
#' @param perc_overlap Percentage of overlap (from 0 to 1) with a peak in a sample with the master peaklist
#' to consider it an overlap. Default: 0.4
#' @param TxDb TxDb object or GRanges object with the gene annotation to use to classify peaks into
#' Promoters or Distal. Default: \code{TxDb.Hsapiens.UCSC.hg38.knownGene}.
#' @param nthreads Number of threads to use for the analysis.
#' @import GenomicRanges
#' @export
get_ranking_sharing_index <- function(peak_files,
                                      bam_files,
                                      filename="master_sharing_ranking_index.rds",
                                      perc_overlap,
                                      TxDb=TxDb.Hsapiens.UCSC.hg38.knownGene::TxDb.Hsapiens.UCSC.hg38.knownGene,
                                      nthreads=2) {

  ## Return peak list with ranking indexes
  ri_list <- mapply(ranking_index,
                    peak_file=peak_files,
                    bam_file=bam_files)

  ## Create master peakset with sharing index
  master <- sharing_index(gr_list=ri_list)

  ## Add annotation
  anno <- ChIPseeker::annotatePeak(master,
                                   TxDb=TxDb,
                                   verbose=FALSE)


  master$distanceToTSS <- data.frame(anno)$distanceToTSS
  master$annotation <- "Distal"
  master$annotation[abs(master$distanceToTSS)<=2e3] <- "Promoter"

  if(!is.null(filename)) saveRDS(master, file=filename)

  return(master)
}

#' Obtain ranking index
#'
#' Given a set of peak files, obtains the ranking index for each one of them.
#' @param peak_file Character containing the path for the peak file.
#' @param bam_file Character containing the path for the bam file.
#' @param nthreads Number of threads to use for this analysis.
ranking_index <- function(peak_file,
                          bam_file,
                          nthreads=2) {
  # Get number of reads in each peak ----
  saf <- read.delim(peak_file, header=F, stringsAsFactors=F)[,1:4]
  colnames(saf) <- c("Chr", "Start", "End", "GeneID")
  saf$Strand <- "+"

  counts <- Rsubread::featureCounts(bam_file,
                                    annot.ext=saf,
                                    nthreads=nthreads,
                                    verbose=F)

  total_reads <- sum(counts$stat[,2])
  counts <- cbind(saf,
                  counts$counts)
  counts <- counts[,colnames(counts)!="Strand"]
  colnames(counts)[ncol(counts)] <- "counts"
  gr <- regioneR::toGRanges(counts)

  # Normalize reads Nscore = ((peak_read_count/peak_size)*10^6)*10^3/total_mapped_reads ---------
  counts$counts_nscore <- ((gr$counts/GenomicRanges::width(gr)*10^6)*10^3)/total_reads

  counts <- counts[order(counts$counts_nscore, decreasing=T),]

  # Assign percentile from 1 (max reads) to 100 (min reads) -----
  counts$percent_rank <- dplyr::percent_rank(counts$counts_nscore)
  counts$ranking_index <- scales::rescale(counts$percent_rank,
                                          to=c(100,1))

  # Return GRanges with peaks & ranking index -------------
  gr <- regioneR::toGRanges(counts)

  return(gr)
}


#' Obtain sharing index
#'
#' @param gr_list List of peaks with the corresponding ranking indexes, as generated by \code{\link{ranking_index}}.
#' @inheritParams get_ranking_sharing_index
#' @import GenomicRanges
sharing_index <- function(gr_list,
                          perc_overlap=0.4) {
  # Create master peakset --------------------
  if (class(gr_list)=="list") gr_list <- GRangesList(gr_list)
  master <- regioneR::joinRegions(unlist(gr_list))
  GenomeInfoDb::seqlevels(master) <- paste0("chr", c(1:22, "X", "Y"))
  master <- master[order(master),]
  master$id <- paste0("peak_", 1:length(master))

  # Calculate overlap of individual peaks with master ------------
  master <- get_overlaps_sign(gr_list,
                              master,
                              perc_overlap)
  master$sample <- gsub("NA, ", "", master$sample)

  # Add mean ranking index
  gr_full <- unlist(gr_list)
  gr_full$sample <- gsub("_peak_[[:digit:]]*", "", gr_full$GeneID)

  ols <- GenomicRanges::findOverlaps(gr_full, master)
  gr_full$master_id <- NA
  gr_full$master_id[S4Vectors::queryHits(ols)] <- master$id[S4Vectors::subjectHits(ols)]

  stats <- data.frame(mcols(gr_full)) %>%
    dplyr::group_by(sample, master_id) %>%
    dplyr::summarise(ranking_index=mean(ranking_index)) %>%
    dplyr::group_by(master_id) %>%
    dplyr::summarise(ranking_index=mean(ranking_index))

  mcols(master) <- dplyr::left_join(data.frame(mcols(master)),
                                    stats,
                                    by=c("id"="master_id"))

  return(master)
}

#' Get significant overlaps
#'
#' @param gr_list List of peaks with the corresponding ranking indexes, as generated by \code{\link{ranking_index}}.
#' @param master Master peak list.
#' @inheritParams get_ranking_sharing_index
#' @import GenomicRanges
get_overlaps_sign <- function(gr_list, master, perc_overlap=0.4) {
  for (i in 1:length(gr_list)) {
    gr <- gr_list[[i]]
    name <- unique(gsub("_peak_[[:digit:]]*", "", gr$GeneID))

    ols <- GenomicRanges::findOverlaps(master, gr)
    perc <- width(pintersect(master[S4Vectors::queryHits(ols),],
                             gr[S4Vectors::subjectHits(ols),]))/width(gr[S4Vectors::subjectHits(ols)])
    sel <- perc > perc_overlap

    present <- unique(master$id[S4Vectors::queryHits(ols)][sel])

    if ("sharing_index" %in% colnames(mcols(master))) {
      master$sharing_index[master$id %in% present] <- master$sharing_index[master$id %in% present] + 1
      master$sample[master$id %in% present] <- paste(master$sample[master$id %in% present], name, sep=", ")

    } else {
      master$sharing_index <- 0
      master$sharing_index[master$id %in% present] <- master$sharing_index[master$id %in% present] + 1

      master$sample <- NA
      master$sample[master$id %in% present] <- name
    }
  }

  return(master)
}

#' Plot sharing index vs ranking index
#'
#' This function calculates sharing and ranking indexes for a set of samples and then
#' plots the result.
#' @param master Data.frame or path to the saved RData file containing the master peakset with ranking and
#' sharing indexes as generated with \code{\link{}}
#' @param plot_title Character containing the title for the final plot.
#' @import ggplot2
#' @export
plot_ranking_sharing_index <- function(master,
                                       plot_title,
                                       theme=theme_classic()) {

  if(is.character(master)) master <- readRDS(master)

  ## Convert data for plotting
  data <- data.frame(mcols(master))
  data$sharing_index <- factor(data$sharing_index)

  sum <- data %>%
    dplyr::group_by(sharing_index, annotation) %>%
    dplyr::summarise(count=length(id))

  ## Plot summary (# of peaks in each group)
  summary <-
    ggplot(sum,
           aes(sharing_index, count)) +
    geom_line(aes(group=annotation)) +
    geom_point(aes(fill=sharing_index), pch=21, size=3) +
    viridis::scale_fill_viridis(discrete=T, direction=-1, option="D") +
    scale_y_continuous(labels=function(x) scales::comma(x), name="# of peaks") +
    facet_wrap(~annotation) +
    theme +
    theme(legend.position="none",
          panel.grid.major.y=element_line(size=.25, linetype=1, color="dark grey"),
          panel.grid.minor.y=element_line(size=.25, linetype=2, color="grey"),
          axis.text.x=element_blank(),
          axis.title.x=element_blank(),
          axis.line.x=element_blank(),
          axis.ticks.x=element_blank()) +
    ggtitle(plot_title)

  ## Plot boxplot ranking index
  box <-
    ggplot(data,
           aes(sharing_index, ranking_index)) +
    geom_jitter(aes(color=sharing_index), alpha=.5, size=.5) +
    geom_boxplot(fill="white", alpha=.5, outlier.shape=NA, notch=T) +
    viridis::scale_color_viridis(discrete=T, alpha=.5, direction=-1, option="D") +
    scale_y_reverse(limits=c(100,1),
                    expand=c(0,0),
                    breaks=c(100,50,1),
                    name="Rank index (RI)") +
    xlab("Sharing index (SI)") +
    facet_wrap(~annotation) +
    theme +
    theme(legend.position="none",
          strip.text = element_blank(),
          strip.background = element_blank())

  cowplot::plot_grid(summary, box,
                     ncol=1,
                     align="v",
                     rel_heights = c(0.3,0.7))
}
mireia-bioinfo/meowmics documentation built on July 29, 2023, 10 p.m.