utils::globalVariables(c("feature_id", "gene_id"))
#' mm_nearestGene
#'
#' Annotate peaks using nearest gene mode, which means every peak only have one
#' related gene.
#'
#' @import GenomeInfoDb GenomicFeatures S4Vectors BiocGenerics GenomicRanges
#'
#' @param peak_GR peak GRange with a column named feature_id representing you
#' peak name
#' @param Txdb Txdb
#' @param reportGeneInfo whether you want to report full gene info
#' @param verbose whether you want to report detailed running message
#' @param ... additional arguments in distanceToNearest
#'
#' @return Granges object with annotated info
#' @export
#'
#' @examples
#'
#' if (require(TxDb.Athaliana.BioMart.plantsmart28)) {
#' Txdb <- TxDb.Athaliana.BioMart.plantsmart28
#' seqlevels(Txdb) <- paste0("Chr", c(1:5, "M", "C"))
#'
#' peak_path <- system.file("extdata", "ChIP.bed.gz", package = "FindIT2")
#' peak_GR <- loadPeakFile(peak_path)
#' peakAnno <- mm_nearestGene(peak_GR, Txdb)
#' peakAnno
#' }
mm_nearestGene <- function(peak_GR,
Txdb,
reportGeneInfo = FALSE,
verbose = TRUE,
...) {
peak_GR <- check_peakGR(peak_GR = peak_GR,
Txdb = Txdb)
check_duplicated(peak_GR)
if (verbose){
message("------------")
message("annotating Peak using nearest gene mode begins")
message(
">> preparing gene features information...\t\t",
format(Sys.time(), "%Y-%m-%d %X")
)
}
gene_location <- genes(Txdb)
gene_start <- resize(gene_location, width = 1, fix = "start")
# some scaffold in Txdb may not have gene
# while these scaffold still have peak
# These scaffold still matain when
# chrominfo <- read.table("XX.fa.fai")[, 1:2]
# colnames(chrominfo) <- c("chrom", "length")
# Txdb <- makeTxDbFromGFF("XXgtf", chrominfo = chrominfo)
chrom_withGene <- unique(as.character(seqnames(gene_location)))
peak_GR <- subset(peak_GR, seqnames %in% chrom_withGene)
if (verbose){
message(
">> finding nearest gene and calculating distance...\t\t",
format(Sys.time(), "%Y-%m-%d %X")
)
}
# It seems that there is two distanceToNearest method,
# one is from GenomicGRanges, another is from IRanges
# both support ignore.strand while the latter do not show in help
# if peak is unstrand(for most cases), the ignore.strand doesn't matter
nearestHit <- suppressWarnings(distanceToNearest(
peak_GR,
gene_start, ...
))
if (length(nearestHit) == 0) {
stop("Sorry, there is no hits.",
call. = FALSE
)
}
if (verbose){
message(
">> dealing with gene strand ...\t\t",
format(Sys.time(), "%Y-%m-%d %X")
)
}
a <- data.frame(
start_dis = start(peak_GR[queryHits(nearestHit)]) -
start(gene_start[subjectHits(nearestHit)]),
end_dis = end(peak_GR[queryHits(nearestHit)]) -
start(gene_start[subjectHits(nearestHit)])
)
# if start and end are all in TSS left, sum(x > 0)) it will be 2.
# if start is in left while end is in right, sum(x > 0)) it will be 1
# if start and end are all in TSS right, sum(x > 0)) it will be 0
sign_1 <- ifelse(apply(a, 1, function(x) sum(x > 0)) == 2, 1, -1)
# sign_1 and sign_2 will decide distance's sign
sign_2 <- ifelse(strand(gene_start[subjectHits(nearestHit)]) == "+", 1, -1)
gene_id <- names(gene_start)[subjectHits(nearestHit)]
if (verbose){
message(
">> merging all info together ...\t\t",
format(Sys.time(), "%Y-%m-%d %X")
)
}
if (reportGeneInfo) {
# someone's peakN may be too big, in this case,
# they may do not want full gene info
# so they can save disk or memory
mcols(peak_GR) <- cbind(
mcols(peak_GR),
report_geneInfo(gene_location[gene_id])
)
mcols(peak_GR)$distanceToTSS <- mcols(nearestHit)$distance * sign_1 * sign_2
} else {
mcols(peak_GR)$gene_id <- gene_id
mcols(peak_GR)$distanceToTSS <- mcols(nearestHit)$distance * sign_1 * sign_2
}
metadata(peak_GR)$mmAnno_mode <- "nearestGene"
if (verbose){
message(
">> done\t\t",
format(Sys.time(), "%Y-%m-%d %X")
)
}
return(peak_GR)
}
#' mm_geneScan
#'
#' Annotate peaks using geneScan mode, which means every peak have more than one
#' related genes.
#'
#' @import GenomeInfoDb GenomicFeatures S4Vectors BiocGenerics GenomicRanges
#'
#' @param peak_GR peak GRange with a column named feature_id representing you peak name
#' @param Txdb Txdb
#' @param upstream distance to start site(upstream)
#' @param downstream distance to start site(downstream)
#' @param reportGeneInfo whether you want to add gene info
#' @param verbose whether you want to report detailed running message
#' @param ... additional arguments in findOverlaps
#'
#' @return Granges object with annotated info
#' @export
#'
#' @examples
#' if (require(TxDb.Athaliana.BioMart.plantsmart28)) {
#' Txdb <- TxDb.Athaliana.BioMart.plantsmart28
#' seqlevels(Txdb) <- paste0("Chr", c(1:5, "M", "C"))
#' peak_path <- system.file("extdata", "ChIP.bed.gz", package = "FindIT2")
#' peak_GR <- loadPeakFile(peak_path)
#' peakAnno <- mm_geneScan(peak_GR, Txdb)
#' peakAnno
#' }
mm_geneScan <- function(peak_GR,
Txdb,
upstream = 3000,
downstream = 3000,
reportGeneInfo = FALSE,
verbose = TRUE,
...) {
peak_GR <- check_peakGR(peak_GR = peak_GR,
Txdb = Txdb)
check_duplicated(peak_GR)
if (verbose){
message("------------")
message("annotatePeak using geneScan mode begins")
message(
">> preparing gene features information and scan region...\t\t",
format(Sys.time(), "%Y-%m-%d %X")
)
}
gene_location <- GenomicFeatures::genes(Txdb)
gene_start <- resize(gene_location, width = 1, fix = "start")
# some scaffold in Txdb may not have gene
# while these scaffold still have peak
# These scaffold still matain when
# chrominfo <- read.table("XX.fa.fai")[, 1:2]
# colnames(chrominfo) <- c("chrom", "length")
# Txdb <- makeTxDbFromGFF("XXgtf", chrominfo = chrominfo)
chrom_withGene <- unique(as.character(seqnames(gene_location)))
peak_GR <- subset(peak_GR, seqnames %in% chrom_withGene)
gene_promoter <- suppressWarnings(promoters(gene_location,
upstream = upstream,
downstream = downstream
))
Txdb_seqlength <- seqlengths(Txdb)
if (all(is.na(Txdb_seqlength))) {
warning("your chr length is not set, gene_promoter may cross bound",
call. = FALSE
)
} else {
message(
">> some scan range may cross Chr bound, trimming...\t\t",
format(Sys.time(), "%Y-%m-%d %X")
)
gene_promoter <- trim(gene_promoter)
}
if (verbose){
message(
">> finding overlap peak in gene scan region...\t\t",
format(Sys.time(), "%Y-%m-%d %X")
)
}
overlapHits <- suppressWarnings(
findOverlaps(gene_promoter,peak_GR,...)
)
if (verbose){
message(
">> dealing with left peak not your gene scan region...\t\t",
format(Sys.time(), "%Y-%m-%d %X")
)
}
# message(">> these peak will be annotated by nearest gene\n")
peak_GR_left <- subset(
peak_GR,
!feature_id %in% unique(peak_GR[subjectHits(overlapHits)]$feature_id)
)
# if I use nearest, it will report 0 when no peak_GR left
# so I use distanceToNearest
nearest_hit <- suppressWarnings(distanceToNearest(peak_GR_left,gene_start))
gene_TSS_left <- gene_start[subjectHits(nearest_hit)]
if (verbose){
message(
">> merging two set peaks...\t\t",
format(Sys.time(), "%Y-%m-%d %X")
)
}
final_gene_GR <- c(gene_start[queryHits(overlapHits)], gene_TSS_left)
final_peak_GR <- c(peak_GR[subjectHits(overlapHits)], peak_GR_left)
if (verbose){
message(
">> calculating distance and dealing with gene strand...\t\t",
format(Sys.time(), "%Y-%m-%d %X")
)
}
dist <- suppressWarnings(distance(final_peak_GR,final_gene_GR))
a <- data.frame(
start_dis = start(final_peak_GR) - start(final_gene_GR),
end_dis = end(final_peak_GR) - start(final_gene_GR)
)
sign_1 <- ifelse(apply(a, 1, function(x) sum(x > 0)) == 2, 1, -1)
sign_2 <- ifelse(strand(final_gene_GR) == "+", 1, -1)
gene_id <- names(final_gene_GR)
if (verbose) {
message(
">> merging all info together ...\t\t",
format(Sys.time(), "%Y-%m-%d %X")
)
}
if (reportGeneInfo) {
mcols(final_peak_GR) <- cbind(
mcols(final_peak_GR),
report_geneInfo(gene_location[gene_id])
)
mcols(final_peak_GR)$distanceToTSS <- dist * sign_1 * sign_2
} else {
mcols(final_peak_GR)$gene_id <- gene_id
mcols(final_peak_GR)$distanceToTSS <- dist * sign_1 * sign_2
}
metadata(final_peak_GR)$mmAnno_mode <- "geneScan"
metadata(final_peak_GR)$upstream <- upstream
metadata(final_peak_GR)$downstream <- downstream
if (verbose){
message(
">> done\t\t",
format(Sys.time(), "%Y-%m-%d %X")
)
}
return(final_peak_GR)
}
#' mm_geneBound
#'
#' find related peaks of your input genes, which is useful when you want to plot
#' volcano plot or heatmap of peaks.
#'
#' @import GenomeInfoDb GenomicFeatures S4Vectors BiocGenerics GenomicRanges
#' @importFrom utils capture.output
#'
#' @param peak_GR peak GRange with a column named feature_id representing you peak name
#' @param Txdb Txdb
#' @param input_genes a character vector which represent genes set
#' which you want to find related peak for
#' @param verbose whether you want to report detailed running message
#' @param ... additional arguments in distanceToNearest
#'
#' @return data.frame with three column: related peak id, your input gene id,
#' and distance
#' @export
#'
#' @examples
#' if (require(TxDb.Athaliana.BioMart.plantsmart28)) {
#' Txdb <- TxDb.Athaliana.BioMart.plantsmart28
#' seqlevels(Txdb) <- paste0("Chr", c(1:5, "M", "C"))
#' peak_path <- system.file("extdata", "ChIP.bed.gz", package = "FindIT2")
#' peak_GR <- loadPeakFile(peak_path)
#' peak_pair <- mm_geneBound(peak_GR, Txdb, c("AT5G01015", "AT5G67570"))
#' peak_pair
#' }
mm_geneBound <- function(peak_GR,
Txdb,
input_genes,
verbose = TRUE,
...) {
# some genes maybe 12345……
input_genes <- as.character(unique(input_genes))
peak_GR <- check_peakGR(peak_GR = peak_GR,Txdb = Txdb)
check_duplicated(peak_GR)
if (verbose){
message(
">> using mm_nearestGene to annotate Peak...\t\t",
format(Sys.time(), "%Y-%m-%d %X")
)
}
mmAnno <- mm_nearestGene(peak_GR = peak_GR,
Txdb = Txdb,
reportGeneInfo = FALSE,...)
mmAnno$distanceToTSS_abs <- abs(mmAnno$distanceToTSS)
mmAnno_inputGenes <- mcols(subset(mmAnno, gene_id %in% input_genes))
mmAnno_inputGenes <- mmAnno_inputGenes[, c("feature_id", "gene_id", "distanceToTSS_abs")]
mmAnno_inputGenes <- tibble::tibble(data.frame(mmAnno_inputGenes,
stringsAsFactors = FALSE))
noPeakBind_genes <- input_genes[!input_genes %in% mmAnno$gene_id]
if (length(noPeakBind_genes) == 0) {
message("all your input gene have been annotated by nearestGene mode")
return(mmAnno_inputGenes)
}
gene_location <- GenomicFeatures::genes(Txdb)
all_genes <- gene_location$gene_id
withr::local_options(list(warn = 1))
if (mean(noPeakBind_genes %in% all_genes) < 1){
warning("some of your input genes is not in your Txdb, I filtered these gene\n",
call. = FALSE
)
noPeakBind_genes <- noPeakBind_genes[noPeakBind_genes %in% all_genes]
}
peak_GR_Chr <- seqlevels(peak_GR)
gene_location_inChr <- gene_location[seqnames(gene_location) %in% peak_GR_Chr]
all_genes_inChr <- gene_location_inChr$gene_id
if (all(!input_genes %in% all_genes_inChr)){
stop("sorry, all of your input genes all not in your peak_GR chrosome",
call. = FALSE)
}
if (mean(noPeakBind_genes %in% all_genes_inChr) < 1){
noPeakBind_genes <- noPeakBind_genes[noPeakBind_genes %in% all_genes_inChr]
if (length(noPeakBind_genes) == 0){
msg <- paste0("some of your input gene are not in your peak_GR chrosome ",
"and left all gene have been annotated in nearestGene mode\n")
warning(msg, call. = FALSE)
return(mmAnno_inputGenes)
} else {
msg <- paste0("some of your input gene are not in your peak_GR chrosome ",
"I filtered these gene")
warning(msg, call. = FALSE)
}
}
msg <- paste0("It seems that there ",
length(noPeakBind_genes),
" genes have not been annotated by nearestGene mode")
message(msg)
if (verbose){
message(
">> using distanceToNearest to find nearest peak of these genes...\t\t",
format(Sys.time(), "%Y-%m-%d %X")
)
}
gene_start <- resize(gene_location_inChr, width = 1, fix = "start")
gene_GR <- subset(gene_start, gene_id %in% noPeakBind_genes)
nearestHit <- suppressWarnings(distanceToNearest(
gene_GR,
peak_GR, ...
))
left_anno <- tibble::tibble(
feature_id = peak_GR[subjectHits(nearestHit)]$feature_id,
gene_id = gene_GR[queryHits(nearestHit)]$gene_id,
distanceToTSS_abs = mcols(nearestHit)$distance
)
if (verbose){
message(
">> merging all anno...\t\t",
format(Sys.time(), "%Y-%m-%d %X")
)
}
final_anno <- rbind(
mmAnno_inputGenes,
left_anno
)
if (verbose){
message(
">> done\t\t",
format(Sys.time(), "%Y-%m-%d %X")
)
}
return(final_anno)
}
check_peakGR <- function(peak_GR, Txdb){
# the otuput class of mm_nearestGene and mm_geneScan are all GRanges
# which can be input of function again and again
# so I write below to forbid
if ("mmAnno_mode" %in% names(metadata(peak_GR))) {
stop("it seems you have done mmAnno before, please not do again",
call. = FALSE
)
return(peak_GR)
}
# check peak GR with feature_id is most important in all analysis
check_colnames("feature_id", peak_GR)
# warning will appear firstly
withr::local_options(list(warn = 1))
check_seqlevel(peak_GR, Txdb)
peak_GR <- peak_GR[seqnames(peak_GR) %in% seqlevels(Txdb)]
return(peak_GR)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.