#' Get ranking and sharing indexes
#'
#' This function calculates sharing and ranking indexes for a set of samples and then
#' plots the result.
#' @param peak_files Character vector with the path of the peak files.
#' @param bam_files Character vector with the path of the bam files.
#' @param perc_overlap Percentage of overlap (from 0 to 1) with a peak in a sample with the master peaklist
#' to consider it an overlap. Default: 0.4
#' @param TxDb TxDb object or GRanges object with the gene annotation to use to classify peaks into
#' Promoters or Distal. Default: \code{TxDb.Hsapiens.UCSC.hg38.knownGene}.
#' @param nthreads Number of threads to use for the analysis.
#' @import GenomicRanges
#' @export
get_ranking_sharing_index <- function(peak_files,
bam_files,
filename="master_sharing_ranking_index.rds",
perc_overlap,
TxDb=TxDb.Hsapiens.UCSC.hg38.knownGene::TxDb.Hsapiens.UCSC.hg38.knownGene,
nthreads=2) {
## Return peak list with ranking indexes
ri_list <- mapply(ranking_index,
peak_file=peak_files,
bam_file=bam_files)
## Create master peakset with sharing index
master <- sharing_index(gr_list=ri_list)
## Add annotation
anno <- ChIPseeker::annotatePeak(master,
TxDb=TxDb,
verbose=FALSE)
master$distanceToTSS <- data.frame(anno)$distanceToTSS
master$annotation <- "Distal"
master$annotation[abs(master$distanceToTSS)<=2e3] <- "Promoter"
if(!is.null(filename)) saveRDS(master, file=filename)
return(master)
}
#' Obtain ranking index
#'
#' Given a set of peak files, obtains the ranking index for each one of them.
#' @param peak_file Character containing the path for the peak file.
#' @param bam_file Character containing the path for the bam file.
#' @param nthreads Number of threads to use for this analysis.
ranking_index <- function(peak_file,
bam_file,
nthreads=2) {
# Get number of reads in each peak ----
saf <- read.delim(peak_file, header=F, stringsAsFactors=F)[,1:4]
colnames(saf) <- c("Chr", "Start", "End", "GeneID")
saf$Strand <- "+"
counts <- Rsubread::featureCounts(bam_file,
annot.ext=saf,
nthreads=nthreads,
verbose=F)
total_reads <- sum(counts$stat[,2])
counts <- cbind(saf,
counts$counts)
counts <- counts[,colnames(counts)!="Strand"]
colnames(counts)[ncol(counts)] <- "counts"
gr <- regioneR::toGRanges(counts)
# Normalize reads Nscore = ((peak_read_count/peak_size)*10^6)*10^3/total_mapped_reads ---------
counts$counts_nscore <- ((gr$counts/GenomicRanges::width(gr)*10^6)*10^3)/total_reads
counts <- counts[order(counts$counts_nscore, decreasing=T),]
# Assign percentile from 1 (max reads) to 100 (min reads) -----
counts$percent_rank <- dplyr::percent_rank(counts$counts_nscore)
counts$ranking_index <- scales::rescale(counts$percent_rank,
to=c(100,1))
# Return GRanges with peaks & ranking index -------------
gr <- regioneR::toGRanges(counts)
return(gr)
}
#' Obtain sharing index
#'
#' @param gr_list List of peaks with the corresponding ranking indexes, as generated by \code{\link{ranking_index}}.
#' @inheritParams get_ranking_sharing_index
#' @import GenomicRanges
sharing_index <- function(gr_list,
perc_overlap=0.4) {
# Create master peakset --------------------
if (class(gr_list)=="list") gr_list <- GRangesList(gr_list)
master <- regioneR::joinRegions(unlist(gr_list))
GenomeInfoDb::seqlevels(master) <- paste0("chr", c(1:22, "X", "Y"))
master <- master[order(master),]
master$id <- paste0("peak_", 1:length(master))
# Calculate overlap of individual peaks with master ------------
master <- get_overlaps_sign(gr_list,
master,
perc_overlap)
master$sample <- gsub("NA, ", "", master$sample)
# Add mean ranking index
gr_full <- unlist(gr_list)
gr_full$sample <- gsub("_peak_[[:digit:]]*", "", gr_full$GeneID)
ols <- GenomicRanges::findOverlaps(gr_full, master)
gr_full$master_id <- NA
gr_full$master_id[S4Vectors::queryHits(ols)] <- master$id[S4Vectors::subjectHits(ols)]
stats <- data.frame(mcols(gr_full)) %>%
dplyr::group_by(sample, master_id) %>%
dplyr::summarise(ranking_index=mean(ranking_index)) %>%
dplyr::group_by(master_id) %>%
dplyr::summarise(ranking_index=mean(ranking_index))
mcols(master) <- dplyr::left_join(data.frame(mcols(master)),
stats,
by=c("id"="master_id"))
return(master)
}
#' Get significant overlaps
#'
#' @param gr_list List of peaks with the corresponding ranking indexes, as generated by \code{\link{ranking_index}}.
#' @param master Master peak list.
#' @inheritParams get_ranking_sharing_index
#' @import GenomicRanges
get_overlaps_sign <- function(gr_list, master, perc_overlap=0.4) {
for (i in 1:length(gr_list)) {
gr <- gr_list[[i]]
name <- unique(gsub("_peak_[[:digit:]]*", "", gr$GeneID))
ols <- GenomicRanges::findOverlaps(master, gr)
perc <- width(pintersect(master[S4Vectors::queryHits(ols),],
gr[S4Vectors::subjectHits(ols),]))/width(gr[S4Vectors::subjectHits(ols)])
sel <- perc > perc_overlap
present <- unique(master$id[S4Vectors::queryHits(ols)][sel])
if ("sharing_index" %in% colnames(mcols(master))) {
master$sharing_index[master$id %in% present] <- master$sharing_index[master$id %in% present] + 1
master$sample[master$id %in% present] <- paste(master$sample[master$id %in% present], name, sep=", ")
} else {
master$sharing_index <- 0
master$sharing_index[master$id %in% present] <- master$sharing_index[master$id %in% present] + 1
master$sample <- NA
master$sample[master$id %in% present] <- name
}
}
return(master)
}
#' Plot sharing index vs ranking index
#'
#' This function calculates sharing and ranking indexes for a set of samples and then
#' plots the result.
#' @param master Data.frame or path to the saved RData file containing the master peakset with ranking and
#' sharing indexes as generated with \code{\link{}}
#' @param plot_title Character containing the title for the final plot.
#' @import ggplot2
#' @export
plot_ranking_sharing_index <- function(master,
plot_title,
theme=theme_classic()) {
if(is.character(master)) master <- readRDS(master)
## Convert data for plotting
data <- data.frame(mcols(master))
data$sharing_index <- factor(data$sharing_index)
sum <- data %>%
dplyr::group_by(sharing_index, annotation) %>%
dplyr::summarise(count=length(id))
## Plot summary (# of peaks in each group)
summary <-
ggplot(sum,
aes(sharing_index, count)) +
geom_line(aes(group=annotation)) +
geom_point(aes(fill=sharing_index), pch=21, size=3) +
viridis::scale_fill_viridis(discrete=T, direction=-1, option="D") +
scale_y_continuous(labels=function(x) scales::comma(x), name="# of peaks") +
facet_wrap(~annotation) +
theme +
theme(legend.position="none",
panel.grid.major.y=element_line(size=.25, linetype=1, color="dark grey"),
panel.grid.minor.y=element_line(size=.25, linetype=2, color="grey"),
axis.text.x=element_blank(),
axis.title.x=element_blank(),
axis.line.x=element_blank(),
axis.ticks.x=element_blank()) +
ggtitle(plot_title)
## Plot boxplot ranking index
box <-
ggplot(data,
aes(sharing_index, ranking_index)) +
geom_jitter(aes(color=sharing_index), alpha=.5, size=.5) +
geom_boxplot(fill="white", alpha=.5, outlier.shape=NA, notch=T) +
viridis::scale_color_viridis(discrete=T, alpha=.5, direction=-1, option="D") +
scale_y_reverse(limits=c(100,1),
expand=c(0,0),
breaks=c(100,50,1),
name="Rank index (RI)") +
xlab("Sharing index (SI)") +
facet_wrap(~annotation) +
theme +
theme(legend.position="none",
strip.text = element_blank(),
strip.background = element_blank())
cowplot::plot_grid(summary, box,
ncol=1,
align="v",
rel_heights = c(0.3,0.7))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.