#' This function for host gene of SJ
#'
#' @rdname HostGeneOfSJ
#' @name HostGeneOfSJ
#' @title HostGeneOfSJ
#'
#' @param object an ICASDataSet
#' @param sjRanges GRanges-class of splice junctions or introns.
#' @param gtfFile The file.path of GTF file when you used in STAR allignment.
#' @param NT The number of cores in multithreaded parallel computing.
#'
#' @importFrom rtracklayer readGFF
#' @import GenomicFeatures
#' @importFrom GenomicRanges GRanges
#' @importFrom IRanges IRanges
#' @importFrom S4Vectors nrow
#' @importFrom S4Vectors DataFrame
#' @importFrom data.table data.table
#' @importFrom data.table as.data.table
#' @importFrom data.table setkey
#' @importFrom GenomicFeatures makeTxDbFromGFF
#' @importFrom GenomicFeatures transcripts
#' @importFrom GenomicFeatures intronsByTranscript
#' @importFrom S4Vectors mcols
#' @importFrom parallel mcmapply
#' @importFrom S4Vectors runValue
#' @importFrom GenomeInfoDb seqnames
#' @importFrom BiocGenerics start
#' @importFrom BiocGenerics end
#' @importFrom BiocGenerics strand
#' @importFrom GenomicFeatures genes
#' @importFrom GenomicRanges GRangesList
#' @importFrom S4Vectors subjectHits
#' @importFrom GenomicRanges findOverlaps
#' @export
#' @author Tang Chao
HostGeneOfSJ <- function(sjRanges, gtfFile, NT = 1){
if(!is(sjRanges, "GRanges"))
stop("The sjRanges object must be a GRanges data")
if(!file.exists(gtfFile))
stop("The GTF file does not exist! And you should provide the GTF file you used in STAR alignment.")
if(!is.numeric(NT)) {
stop("NT must be positive integer")
}
if(NT <= 0) {
stop("NT must be positive integer")
}
NT <- as.integer(NT)
gtf <- data.table::as.data.table(rtracklayer::readGFF(gtfFile))
gene_biotype_index <- colnames(gtf)[grepl("gene", colnames(gtf)) & grepl("type", colnames(gtf))]
sc <- c("gene_id", "gene_name", gene_biotype_index)
gene_info <- unique(gtf[, ..sc])
colnames(gene_info) <- c("gene_id", "gene_name", "biotype")
data.table::setkey(gene_info, gene_id)
# gene id ---------------------------------
txdb <- suppressWarnings(suppressMessages(GenomicFeatures::makeTxDbFromGFF(file = gtfFile, format = "gtf")))
tx <- GenomicFeatures::transcripts(txdb, columns = c("gene_id", "tx_name"), filter = NULL, use.names = TRUE)
it_by_tx <- unlist(GenomicFeatures::intronsByTranscript(x = txdb, use.names = TRUE))
tx_SpliceSite <- data.table::data.table(start = with(as.data.frame(it_by_tx), paste0(seqnames, ":", start, ":", strand)),
end = with(as.data.frame(it_by_tx), paste0(seqnames, ":", end, ":", strand)),
tx_id = names(it_by_tx))
tx_SpliceSite <- data.table::melt(tx_SpliceSite, id = "tx_id", value.name = "SpliceSite")[, .(tx_id, SpliceSite)]
tx_SpliceSite <- merge(tx_SpliceSite, S4Vectors::mcols(tx), by.x = "tx_id", by.y = "tx_name")
tx_SpliceSite$gene_id <- if(length(unlist(tx_SpliceSite$gene_id)) == nrow(tx_SpliceSite)) unlist(tx_SpliceSite$gene_id) else mcmapply(paste, tx_SpliceSite$gene_id, collapse = ";", mc.cores = NT)
tx_SpliceSite <- data.table::as.data.table(tx_SpliceSite)
tx_SpliceSite[, tx_id := NULL]
tx_SpliceSite <- unique(tx_SpliceSite)
data.table::setkey(tx_SpliceSite, SpliceSite)
parallel::mcmapply(seq_along(sjRanges), FUN = function(i) {
g1 <- tx_SpliceSite[with(as.data.frame(sjRanges[i]), paste0(seqnames, ":", start, ":", strand)), gene_id]
g2 <- tx_SpliceSite[with(as.data.frame(sjRanges[i]), paste0(seqnames, ":", end, ":", strand)), gene_id]
g1 <- if(all(is.na(g1))) NULL else g1
g2 <- if(all(is.na(g2))) NULL else g2
if(is.null(g1) & is.null(g2)) {
ge <- NA
}
if(!is.null(g1) & is.null(g2)) {
if(length(g1) == 1) {
ge <- g1
} else {
ge <- NA
}
}
if(is.null(g1) & !is.null(g2)) {
if(length(g2) == 1) {
ge <- g2
} else {
ge <- NA
}
}
if(!is.null(g1) & !is.null(g2)) {
if(length(intersect(g1, g2)) == 1) {
ge <- intersect(g1, g2)
} else {
if(length(g1) == 1 & length(g2) == 1) {
if(runValue(strand(sjRanges[i])) == "+") {
ge <- paste(g1, g2, sep = "-") # fusion
} else {
ge <- paste(g2, g1, sep = "-") # fusion
}
} else {
ge <- NA
}
}
}
return(ge)
}, mc.cores = NT) -> sj_ge
sjRanges$gene_id <- sj_ge
# mapRangesToIds ----
txdb_ge <- GenomicFeatures::genes(txdb, columns = "gene_id")
sjRanges1 <- sjRanges[is.na(sjRanges$gene_id), ]
sjRangesList <- GenomicRanges::GRangesList(lapply(seq_along(sjRanges1), FUN = function(x) sjRanges1[x]))
names(sjRangesList) <- sjRanges1$ID
parallel::mcmapply(seq_along(sjRangesList), FUN = function(i) {
Hits <- S4Vectors::subjectHits(GenomicRanges::findOverlaps(sjRangesList[[i]], txdb_ge, type = "within"))
if(length(Hits) == 1) {
txdb_ge$gene_id[Hits]
} else {
NA
}
}, mc.cores = NT) -> sjRanges[is.na(sjRanges$gene_id), ]$gene_id
# gene_name -------------------------------
parallel::mcmapply(sjRanges$gene_id, FUN = function(x) {
if(is.na(x)) {
NA
} else {
i <- strsplit(x, split = "[;-]")
gene_info[i, paste(gene_name, collapse = "-")]
}
}, mc.cores = NT) -> sjRanges$gene_name
# biotype -------------------------------
parallel::mcmapply(sjRanges$gene_id, FUN = function(x) {
if(is.na(x)) {
NA
} else {
i <- strsplit(x, split = "[;-]")
gene_info[i, paste(biotype, collapse = "-")]
}
}, mc.cores = NT) -> sjRanges$biotype
# return host gene ------------------------
return(sjRanges)
}
#' @rdname HostGeneOfSJ
#' @name HostGeneIdentify
#' @title HostGeneIdentify
#'
#' @export
HostGeneIdentify <- function(object, gtfFile, NT = 1) {
if(!is(object, "ICASDataSet"))
stop("The object must be a ICASDataSet data")
if(!file.exists(gtfFile))
stop("The GTF file does not exist! And you should provide the GTF file you used in STAR alignment.")
if(!is.numeric(NT)) {
stop("NT must be positive integer")
}
if(NT <= 0) {
stop("NT must be positive integer")
}
NT <- as.integer(NT)
if(nrow(psi(object)) == 0) {
gr <- as(row.names(counts(object)), "GRanges")
} else {
gr <- as(row.names(psi(object)), "GRanges")
}
object@HostGene <- HostGeneOfSJ(sjRanges = gr, gtfFile = gtfFile, NT = NT)
return(object)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.