#' This function for AS-type of ASSJ
#'
#' @rdname ASTypeOfASSJ
#' @name ASTypeOfASSJ
#' @title ASTypeOfASSJ
#'
#' @param object an ICASDataSet
#' @param gtfFile The file.path of GTF file when you used in STAR allignment.
#' @param NT The number of cores in multithreaded parallel computing.
#'
#' @importFrom rtracklayer readGFF
#' @import GenomicFeatures
#' @importFrom IRanges IRanges
#'
#' @export
#' @author Tang Chao
ASTypeOfASSJ <- function(object, gtfFile, NT = 1){
if(!is(object, "ICASDataSet"))
stop("The object must be a ICASDataSet data")
if(!is.numeric(NT)) {
stop("NT must be positive integer")
}
if(NT <= 0) {
stop("NT must be positive integer")
}
NT <- as.integer(NT)
if(!is.null(gtfFile)) {
if(!file.exists(gtfFile))
stop("The GTF file does not exist! And you should provide the GTF file you used in STAR alignment.")
}
if(nrow(psi(object)) == 0) {
stop("You need run PSICalculater firstly since the object@psi are NULL")
}
gr <- as(row.names(psi(object)), "GRanges")
grInfo <- data.table::as.data.table(gr)
grInfo[, SJ := row.names(psi(object))]
gr_redu <- GenomicRanges::reduce(gr)
for(i in seq_along(gr_redu)) {
grInfo[S4Vectors::queryHits(GenomicRanges::findOverlaps(gr, gr_redu[i], type = "any")), ID := paste(SJ, collapse = "@")]
}
grInfo$AS <- as.character(mapply(function(x) length(strsplit(x, "@")[[1]]), grInfo$ID))
Cluster <- grInfo[as.numeric(AS) > 4, ]
Cluster[, AS := "Cluster"]
sjInfo <- grInfo[as.numeric(AS) <= 4, 1:5]
message("Indexing GTF file...")
# GTF index ===============================
# AF/LE -----------------------------------
suppressMessages(txdb <- GenomicFeatures::makeTxDbFromGFF(file = gtfFile, format = "gtf"))
txintron <- GenomicFeatures::intronsByTranscript(txdb, use.names = TRUE)
txintron <- txintron[S4Vectors::elementNROWS(txintron) > 0]
s_s <- as.character(unlist(unique(GenomicRanges::strand(txintron))))
s_l <- S4Vectors::elementNROWS(txintron)
txintron_gr <- unlist(txintron)
my.rank <- function(s, l) ifelse(s == "+", return(1:l), return(-(l:1)))
GenomicRanges::mcols(txintron_gr)$intron_rank <- do.call(c, lapply(seq_along(s_s), function(i) my.rank(s_s[i], s_l[i])))
GenomicRanges::mcols(txintron_gr)$tx_id <- names(txintron_gr)
txintron_tab_rank <- data.table::as.data.table(txintron_gr)
first_intron_tab <- txintron_tab_rank[abs(intron_rank) == 1, ]
first_intron_tab[, junc := paste0(seqnames, ":", start, "-", end, ":", strand)]
data.table::setkey(first_intron_tab, tx_id)
last_intron_tab <- txintron_tab_rank[ , .SD[abs(intron_rank) == max(abs(intron_rank)), ], by = "tx_id"]
last_intron_tab[, junc := paste0(seqnames, ":", start, "-", end, ":", strand)]
data.table::setkey(last_intron_tab, tx_id)
# ASS -------------------------------------
exons <- data.table::as.data.table(unique(GenomicFeatures::exons(txdb, columns = c("gene_id", "EXONNAME"))))
# AS type identification ==================
message("Identifying AS type...")
# Alternative SJ --------------------------
same.start.sj <- sjInfo[, .SD[.N >= 2, ], by = list(seqnames, start, strand)]
data.table::setkey(same.start.sj, seqnames, start, end, strand)
same.start.sj.id <- same.start.sj[, .SD[, paste(seqnames, ":", start, "-", end, ":", strand, collapse = "@", sep = "")], by = c("seqnames", "start", "strand")][, V1]
same.start.sj$ID <- rep(same.start.sj.id, same.start.sj[, .N, by = c("seqnames", "start", "strand")][, N])
same.start.2.sj <- same.start.sj[, .SD[.N == 2, ], by = list(seqnames, start, strand)]
same.end.sj <- sjInfo[, .SD[.N >= 2, ], by = list(seqnames, end, strand)]
data.table::setkey(same.end.sj, seqnames, start, end, strand)
same.end.sj.id <- same.end.sj[, .SD[, paste(seqnames, ":", start, "-", end, ":", strand, collapse = "@", sep = "")], by = c("seqnames", "end", "strand")][, V1]
same.end.sj$ID <- rep(same.end.sj.id, same.end.sj[, .N, by = c("seqnames", "end", "strand")][, N])
same.end.2.sj <- same.end.sj[, .SD[.N == 2, ], by = list(seqnames, end, strand)]
same.start.2.sj2 <- same.start.2.sj[, .(end1 = min(end), end2 = max(end)), by = list(seqnames, start, strand)]
same.end.2.sj2 <- same.end.2.sj[, .(start1 = max(start), start2 = min(start)), by = list(seqnames, end, strand)]
data.table::setkey(same.start.2.sj2, seqnames, start, end1, end2, strand)
data.table::setkey(same.end.2.sj2, seqnames, end, start2, start1, strand)
# SE & MXE SJ -----------------------------
match_nage <- same.start.2.sj2[strand=="-", same.end.2.sj2[strand!="+",], by = c("seqnames", "start", "end1", "end2", "strand")]
match_posi <- same.start.2.sj2[strand=="+", same.end.2.sj2[strand!="-",], by = c("seqnames", "start", "end1", "end2", "strand")]
start_end_match <- rbind(match_posi, match_nage)
colnames(start_end_match)[colnames(start_end_match) == "seqnames"] <- c("seqnames", "seqnames1")
start_end_match <- start_end_match[as.logical(start_end_match[, seqnames]==start_end_match[, seqnames1]), ]
SE <- start_end_match[start==start2 & end==end2 & end1 < start1,]
SE[, ID:=paste0(seqnames, ":", start, "-", end1, ":", strand, "@", seqnames, ":", start1, "-", end, ":", strand, "@", seqnames, ":", start, "-", end2, ":", strand)]
SE_SJ1 <- SE[, .(seqnames, start, end1, strand, ID)]
colnames(SE_SJ1) <- c("seqnames", "start", "end", "strand", "ID")
SE_SJ2 <- SE[, .(seqnames, start1, end, strand, ID)]
colnames(SE_SJ2) <- c("seqnames", "start", "end", "strand", "ID")
SE_SJ3 <- SE[, .(seqnames, start, end2, strand, ID)]
colnames(SE_SJ3) <- c("seqnames", "start", "end", "strand", "ID")
SE_SJ <- rbind(SE_SJ1, SE_SJ2, SE_SJ3)
SE_SJ[, seqnames:=as.character(seqnames)]
SE_SJ[, start:=as.integer(start)]
SE_SJ[, end:=as.integer(end)]
SE_SJ[, strand:=as.character(strand)]
SE_SJ[, ID:=as.character(ID)]
data.table::setkey(SE_SJ, seqnames, start, end, strand)
MXE <- start_end_match[start < end1 & end1 < start2 & start2 < end2 & end2 < start1 & start1 < end, ]
MXE <- MXE[!MXE[, paste0(seqnames, ":", start2, "-", end2, ":", strand)] %in% sjInfo[, paste0(seqnames, ":", start, "-", end, ":", strand)], ]
MXE[, ID:=paste0(seqnames, ":", start, "-", end1, ":", strand, "@", seqnames, ":", start2, "-", end, ":", strand, "@", seqnames, ":", start, "-", end2, ":", strand, "@", seqnames, ":", start1, "-", end, ":", strand)]
MXE_SJ1 <- MXE[, .(seqnames, start, end1, strand, ID)]; colnames(MXE_SJ1) <- c("seqnames", "start", "end", "strand", "ID")
MXE_SJ2 <- MXE[, .(seqnames, start2, end, strand, ID)]; colnames(MXE_SJ2) <- c("seqnames", "start", "end", "strand", "ID")
MXE_SJ3 <- MXE[, .(seqnames, start, end2, strand, ID)]; colnames(MXE_SJ3) <- c("seqnames", "start", "end", "strand", "ID")
MXE_SJ4 <- MXE[, .(seqnames, start1, end, strand, ID)]; colnames(MXE_SJ4) <- c("seqnames", "start", "end", "strand", "ID")
MXE_SJ <- rbind(MXE_SJ1, MXE_SJ2, MXE_SJ3, MXE_SJ4)
MXE_SJ[, seqnames:=as.character(seqnames)]
MXE_SJ[, start:=as.integer(start)]
MXE_SJ[, end:=as.integer(end)]
MXE_SJ[, strand:=as.character(strand)]
MXE_SJ[, ID:=as.character(ID)]
data.table::setkey(MXE_SJ, seqnames, start, end, strand)
#### AF/LE SJ
SMXE_SJ <- union(SE_SJ[, paste0(seqnames, ":", start, "-", end, ":", strand)], MXE_SJ[, paste0(seqnames, ":", start, "-", end, ":", strand)])
same.start.sj <- same.start.sj[!paste0(seqnames, ":", start, "-", end, ":", strand) %in% SMXE_SJ, ]
same.end.sj <- same.end.sj[!paste0(seqnames, ":", start, "-", end, ":", strand) %in% SMXE_SJ, ]
## AF/LE of same start/end
tmp1 <- merge(same.start.sj, unique(first_intron_tab[strand == "-", c("seqnames", "start", "end", "strand", "junc")]), by = c("seqnames", "start", "end", "strand"), all.x = T)
same_start_afe <- same.start.sj[!ID %in% tmp1[is.na(junc), ID], ]
tmp1 <- merge(same.start.sj, unique(last_intron_tab[strand == "+", c("seqnames", "start", "end", "strand", "junc")]), by = c("seqnames", "start", "end", "strand"), all.x = T)
same_start_ale <- same.start.sj[!ID %in% tmp1[is.na(junc), ID], ]
tmp1 <- merge(same.end.sj, unique(first_intron_tab[strand == "+", c("seqnames", "start", "end", "strand", "junc")]), by = c("seqnames", "start", "end", "strand"), all.x = T)
same_end_afe <- same.end.sj[!ID %in% tmp1[is.na(junc), ID], ]
tmp1 <- merge(same.end.sj, unique(last_intron_tab[strand == "-", c("seqnames", "start", "end", "strand", "junc")]), by = c("seqnames", "start", "end", "strand"), all.x = T)
same_end_ale <- same.end.sj[!ID %in% tmp1[is.na(junc), ID], ]
AFE_SJ <- rbind(same_start_afe, same_end_afe)[, c("seqnames", "start", "end", "strand", "ID")]
ALE_SJ <- rbind(same_start_ale, same_end_ale)[, c("seqnames", "start", "end", "strand", "ID")]
AFE_SJ[, seqnames:=as.character(seqnames)]
AFE_SJ[, start:=as.integer(start)]
AFE_SJ[, end:=as.integer(end)]
AFE_SJ[, strand:=as.character(strand)]
AFE_SJ[, ID:=as.character(ID)]
data.table::setkey(AFE_SJ, seqnames, start, end, strand)
ALE_SJ[, seqnames:=as.character(seqnames)]
ALE_SJ[, start:=as.integer(start)]
ALE_SJ[, end:=as.integer(end)]
ALE_SJ[, strand:=as.character(strand)]
ALE_SJ[, ID:=as.character(ID)]
data.table::setkey(ALE_SJ, seqnames, start, end, strand)
#### ASS SJ
same.start.2.sj2 <- same.start.2.sj2[!paste0(seqnames, ":", start, "-", end1, ":", strand) %in% SMXE_SJ, ]
same.start.2.sj2 <- same.start.2.sj2[!paste0(seqnames, ":", start, "-", end2, ":", strand) %in% SMXE_SJ, ]
same.end.2.sj2 <- same.end.2.sj2[!paste0(seqnames, ":", start1, "-", end, ":", strand) %in% SMXE_SJ, ]
same.end.2.sj2 <- same.end.2.sj2[!paste0(seqnames, ":", start2, "-", end, ":", strand) %in% SMXE_SJ, ]
same.start.2.sj2[, ID:=paste0(seqnames, ":", start, "-", end1, ":", strand, "@", seqnames, ":", start, "-", end2, ":", strand)]
same.end.2.sj2[, ID:=paste0(seqnames, ":", start1, "-", end, ":", strand, "@", seqnames, ":", start2, "-", end, ":", strand)]
same_start_junc2_gr <- GenomicRanges::GRanges(seqnames = as.character(same.start.2.sj2[, seqnames]),
ranges = IRanges::IRanges(start = as.integer(same.start.2.sj2[, end1]) + 1, end = as.integer(same.start.2.sj2[, end2])),
strand = as.character(same.start.2.sj2[, strand]),
ID = same.start.2.sj2[, ID])
same_end_junc2_gr <- GenomicRanges::GRanges(seqnames = as.character(same.end.2.sj2[, seqnames]),
ranges = IRanges::IRanges(start = as.integer(same.end.2.sj2[, start2]), end = as.integer(same.end.2.sj2[, start1]) - 1),
strand = as.character(same.end.2.sj2[, strand]),
ID = same.end.2.sj2[, ID])
tmp1 <- data.table::as.data.table(same_start_junc2_gr)
tmp3 <- data.table::as.data.table(same_end_junc2_gr)
same_start_ass <- merge(tmp1, exons, by = c("seqnames", "start", "strand"))[end.x < end.y, ]
same_end_ass <- merge(tmp3, exons, by = c("seqnames", "end", "strand"))[start.x > start.y, ]
A3SS_junc <- unique(c(same_start_ass[strand == "+", ID], same_end_ass[strand == "-", ID]))
A5SS_junc <- unique(c(same_start_ass[strand == "-", ID], same_end_ass[strand == "+", ID]))
A3SS_SJ <- lapply(strsplit(stringr::str_replace_all(A3SS_junc, "-(?=[[:digit:]])", ":"), "[@:]"), function(x){
matrix(x, ncol = 4, byrow = T, dimnames = list(NULL, c("seqnames", "start", "end", "strand")))
})
A3SS_SJ <- data.table::as.data.table(do.call(rbind, A3SS_SJ))
A3SS_SJ$ID <- rep(A3SS_junc, each = 2)
A5SS_SJ <- lapply(strsplit(stringr::str_replace_all(A5SS_junc, "-(?=[[:digit:]])", ":"), "[@:]"), function(x){
matrix(x, ncol = 4, byrow = T, dimnames = list(NULL, c("seqnames", "start", "end", "strand")))
})
A5SS_SJ <- data.table::as.data.table(do.call(rbind, A5SS_SJ))
A5SS_SJ$ID <- rep(A5SS_junc, each = 2)
if(nrow(A3SS_SJ) > 0) {
A3SS_SJ[, seqnames:=as.character(seqnames)]
A3SS_SJ[, start:=as.integer(start)]
A3SS_SJ[, end:=as.integer(end)]
A3SS_SJ[, strand:=as.character(strand)]
A3SS_SJ[, ID:=as.character(ID)]
data.table::setkey(A3SS_SJ, seqnames, start, end, strand)
A3SS_SJ[, SJ := paste0(seqnames, ":", start, "-", end, ":", strand)]
}
if(nrow(A5SS_SJ) > 0) {
A5SS_SJ[, seqnames:=as.character(seqnames)]
A5SS_SJ[, start:=as.integer(start)]
A5SS_SJ[, end:=as.integer(end)]
A5SS_SJ[, strand:=as.character(strand)]
A5SS_SJ[, ID:=as.character(ID)]
data.table::setkey(A5SS_SJ, seqnames, start, end, strand)
A5SS_SJ[, SJ := paste0(seqnames, ":", start, "-", end, ":", strand)]
}
SE_SJ[, SJ := paste0(seqnames, ":", start, "-", end, ":", strand)]
MXE_SJ[, SJ := paste0(seqnames, ":", start, "-", end, ":", strand)]
AFE_SJ[, SJ := paste0(seqnames, ":", start, "-", end, ":", strand)]
ALE_SJ[, SJ := paste0(seqnames, ":", start, "-", end, ":", strand)]
SE_SJ[, AS := "SE"]
MXE_SJ[, AS := "MXE"]
AFE_SJ[, AS := "AFE"]
ALE_SJ[, AS := "ALE"]
A3SS_SJ[, AS := "A3SS"]
A5SS_SJ[, AS := "A5SS"]
AFLE <- rbind(AFE_SJ, ALE_SJ)
AFLE$ID2 <- mapply(FUN = function(x) {paste0(sort(unlist(strsplit(x, "@"))), collapse = "@")}, AFLE$ID)
A35SS <- rbind(A3SS_SJ, A5SS_SJ)
A35SS$ID2 <- mapply(FUN = function(x) {paste0(sort(unlist(strsplit(x, "@"))), collapse = "@")}, A35SS$ID)
A35SS <- A35SS[!ID2 %in% AFLE$ID2, ]
AFLE[, ID2 := NULL]
A35SS[, ID2 := NULL]
if(nrow(SE_SJ) == 0) {
SE_SJ <- NULL
}
if(nrow(MXE_SJ) == 0) {
MXE_SJ <- NULL
}
if(nrow(AFLE) == 0) {
AFLE <- NULL
}
if(nrow(A35SS) == 0) {
A35SS <- NULL
}
AS_SJ <- rbind(SE_SJ, MXE_SJ, AFLE, A35SS) # 属于多个事件的 SJ 做成 Cluster,没有 assign 到事件的 SJ 做成 Unknown 或者 Cluster
data.table::setkey(AS_SJ, SJ)
cluster <- mapply(AS_SJ[, .N, by = "SJ"][N > 1, SJ], FUN = function(x) AS_SJ[x, paste0(AS, collapse = "; ")])
if(length(cluster) != 0) {
unknown <- Reduce(union, lapply(names(cluster), function(x) grep(pattern = x, AS_SJ$ID)))
AS_SJ <- AS_SJ[-c(unknown), ]
}
sjInfo[, SJ := paste0(seqnames, ":", start, "-", end, ":", strand)]
unknown <- sjInfo[!SJ %in% AS_SJ$SJ, ]
unknown <- merge(unknown, grInfo[, .(SJ, ID)], by = "SJ")
unknown$AS <- "Unknown"
all_res <- rbind(Cluster[, .(SJ, AS, ID)], AS_SJ[, .(SJ, AS, ID)], unknown[, .(SJ, AS, ID)])
# return AS_Type
res_gr <- as(all_res$SJ, "GRanges")
res_gr$AS <- all_res$AS
res_gr$ID <- all_res$ID
object@ASType <- sort(res_gr)
return(object)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.