R/ASTypeOfASSJ.R

Defines functions ASTypeOfASSJ

Documented in ASTypeOfASSJ

#' This function for AS-type of ASSJ
#'
#' @rdname ASTypeOfASSJ
#' @name ASTypeOfASSJ
#' @title ASTypeOfASSJ
#'
#' @param object an ICASDataSet
#' @param gtfFile The file.path of GTF file when you used in STAR allignment.
#' @param NT The number of cores in multithreaded parallel computing.
#'
#' @importFrom rtracklayer readGFF
#' @import GenomicFeatures
#' @importFrom IRanges IRanges
#'
#' @export
#' @author Tang Chao

ASTypeOfASSJ <- function(object, gtfFile, NT = 1){
  if(!is(object, "ICASDataSet"))
    stop("The object must be a ICASDataSet data")

  if(!is.numeric(NT)) {
    stop("NT must be positive integer")
  }

  if(NT <= 0) {
    stop("NT must be positive integer")
  }

  NT <- as.integer(NT)

  if(!is.null(gtfFile)) {
    if(!file.exists(gtfFile))
      stop("The GTF file does not exist! And you should provide the GTF file you used in STAR alignment.")
  }

  if(nrow(psi(object)) == 0) {
    stop("You need run PSICalculater firstly since the object@psi are NULL")
  }


  gr <- as(row.names(psi(object)), "GRanges")
  grInfo <- data.table::as.data.table(gr)
  grInfo[, SJ := row.names(psi(object))]
  gr_redu <- GenomicRanges::reduce(gr)

  for(i in seq_along(gr_redu)) {
    grInfo[S4Vectors::queryHits(GenomicRanges::findOverlaps(gr, gr_redu[i], type = "any")), ID := paste(SJ, collapse = "@")]
  }

  grInfo$AS <- as.character(mapply(function(x) length(strsplit(x, "@")[[1]]), grInfo$ID))

  Cluster <- grInfo[as.numeric(AS) > 4, ]
  Cluster[, AS := "Cluster"]
  sjInfo <- grInfo[as.numeric(AS) <= 4, 1:5]

  message("Indexing GTF file...")
  # GTF index ===============================
  # AF/LE -----------------------------------
  suppressMessages(txdb <- GenomicFeatures::makeTxDbFromGFF(file = gtfFile, format = "gtf"))
  txintron <- GenomicFeatures::intronsByTranscript(txdb, use.names = TRUE)
  txintron <- txintron[S4Vectors::elementNROWS(txintron) > 0]
  s_s <- as.character(unlist(unique(GenomicRanges::strand(txintron))))
  s_l <- S4Vectors::elementNROWS(txintron)
  txintron_gr <- unlist(txintron)
  my.rank <- function(s, l) ifelse(s == "+", return(1:l), return(-(l:1)))
  GenomicRanges::mcols(txintron_gr)$intron_rank <- do.call(c, lapply(seq_along(s_s), function(i) my.rank(s_s[i], s_l[i])))
  GenomicRanges::mcols(txintron_gr)$tx_id <- names(txintron_gr)
  txintron_tab_rank <- data.table::as.data.table(txintron_gr)

  first_intron_tab <- txintron_tab_rank[abs(intron_rank) == 1, ]
  first_intron_tab[, junc := paste0(seqnames, ":", start, "-", end, ":", strand)]
  data.table::setkey(first_intron_tab, tx_id)

  last_intron_tab <- txintron_tab_rank[ , .SD[abs(intron_rank) == max(abs(intron_rank)), ], by = "tx_id"]
  last_intron_tab[, junc := paste0(seqnames, ":", start, "-", end, ":", strand)]
  data.table::setkey(last_intron_tab, tx_id)

  # ASS -------------------------------------
  exons <- data.table::as.data.table(unique(GenomicFeatures::exons(txdb, columns = c("gene_id", "EXONNAME"))))

  # AS type identification ==================
  message("Identifying AS type...")
  # Alternative SJ --------------------------
  same.start.sj <- sjInfo[, .SD[.N >= 2, ], by = list(seqnames, start, strand)]
  data.table::setkey(same.start.sj, seqnames, start, end, strand)
  same.start.sj.id <- same.start.sj[, .SD[, paste(seqnames, ":", start, "-", end, ":", strand, collapse = "@", sep = "")], by = c("seqnames", "start", "strand")][, V1]
  same.start.sj$ID <- rep(same.start.sj.id, same.start.sj[, .N, by = c("seqnames", "start", "strand")][, N])
  same.start.2.sj <- same.start.sj[, .SD[.N == 2, ], by = list(seqnames, start, strand)]

  same.end.sj <- sjInfo[, .SD[.N >= 2, ], by = list(seqnames, end, strand)]
  data.table::setkey(same.end.sj, seqnames, start, end, strand)
  same.end.sj.id <- same.end.sj[, .SD[, paste(seqnames, ":", start, "-", end, ":", strand, collapse = "@", sep = "")], by = c("seqnames", "end", "strand")][, V1]
  same.end.sj$ID <- rep(same.end.sj.id, same.end.sj[, .N, by = c("seqnames", "end", "strand")][, N])
  same.end.2.sj <- same.end.sj[, .SD[.N == 2, ], by = list(seqnames, end, strand)]

  same.start.2.sj2 <- same.start.2.sj[, .(end1 = min(end), end2 = max(end)), by = list(seqnames, start, strand)]
  same.end.2.sj2 <- same.end.2.sj[, .(start1 = max(start), start2 = min(start)), by = list(seqnames, end, strand)]

  data.table::setkey(same.start.2.sj2, seqnames, start, end1, end2, strand)
  data.table::setkey(same.end.2.sj2, seqnames, end, start2, start1, strand)

  # SE & MXE SJ -----------------------------

  match_nage <- same.start.2.sj2[strand=="-", same.end.2.sj2[strand!="+",], by = c("seqnames", "start", "end1", "end2", "strand")]
  match_posi <- same.start.2.sj2[strand=="+", same.end.2.sj2[strand!="-",], by = c("seqnames", "start", "end1", "end2", "strand")]
  start_end_match <- rbind(match_posi, match_nage)
  colnames(start_end_match)[colnames(start_end_match) == "seqnames"] <- c("seqnames", "seqnames1")
  start_end_match <- start_end_match[as.logical(start_end_match[, seqnames]==start_end_match[, seqnames1]), ]

  SE <- start_end_match[start==start2 & end==end2 & end1 < start1,]
  SE[, ID:=paste0(seqnames, ":", start, "-", end1, ":", strand, "@", seqnames, ":", start1, "-", end, ":", strand, "@", seqnames, ":", start, "-", end2, ":", strand)]
  SE_SJ1 <- SE[, .(seqnames, start, end1, strand, ID)]
  colnames(SE_SJ1) <- c("seqnames", "start", "end", "strand", "ID")
  SE_SJ2 <- SE[, .(seqnames, start1, end, strand, ID)]
  colnames(SE_SJ2) <- c("seqnames", "start", "end", "strand", "ID")
  SE_SJ3 <- SE[, .(seqnames, start, end2, strand, ID)]
  colnames(SE_SJ3) <- c("seqnames", "start", "end", "strand", "ID")
  SE_SJ <- rbind(SE_SJ1, SE_SJ2, SE_SJ3)
  SE_SJ[, seqnames:=as.character(seqnames)]
  SE_SJ[, start:=as.integer(start)]
  SE_SJ[, end:=as.integer(end)]
  SE_SJ[, strand:=as.character(strand)]
  SE_SJ[, ID:=as.character(ID)]
  data.table::setkey(SE_SJ, seqnames, start, end, strand)

  MXE <- start_end_match[start < end1 & end1 < start2 & start2 < end2 & end2 < start1 & start1 < end, ]
  MXE <- MXE[!MXE[, paste0(seqnames, ":", start2, "-", end2, ":", strand)] %in% sjInfo[, paste0(seqnames, ":", start, "-", end, ":", strand)], ]
  MXE[, ID:=paste0(seqnames, ":", start, "-", end1, ":", strand, "@", seqnames, ":", start2, "-", end, ":", strand, "@", seqnames, ":", start, "-", end2, ":", strand, "@", seqnames, ":", start1, "-", end, ":", strand)]
  MXE_SJ1 <- MXE[, .(seqnames, start, end1, strand, ID)]; colnames(MXE_SJ1) <- c("seqnames", "start", "end", "strand", "ID")
  MXE_SJ2 <- MXE[, .(seqnames, start2, end, strand, ID)]; colnames(MXE_SJ2) <- c("seqnames", "start", "end", "strand", "ID")
  MXE_SJ3 <- MXE[, .(seqnames, start, end2, strand, ID)]; colnames(MXE_SJ3) <- c("seqnames", "start", "end", "strand", "ID")
  MXE_SJ4 <- MXE[, .(seqnames, start1, end, strand, ID)]; colnames(MXE_SJ4) <- c("seqnames", "start", "end", "strand", "ID")
  MXE_SJ <- rbind(MXE_SJ1, MXE_SJ2, MXE_SJ3, MXE_SJ4)
  MXE_SJ[, seqnames:=as.character(seqnames)]
  MXE_SJ[, start:=as.integer(start)]
  MXE_SJ[, end:=as.integer(end)]
  MXE_SJ[, strand:=as.character(strand)]
  MXE_SJ[, ID:=as.character(ID)]
  data.table::setkey(MXE_SJ, seqnames, start, end, strand)

  #### AF/LE SJ

  SMXE_SJ <- union(SE_SJ[, paste0(seqnames, ":", start, "-", end, ":", strand)], MXE_SJ[, paste0(seqnames, ":", start, "-", end, ":", strand)])

  same.start.sj <- same.start.sj[!paste0(seqnames, ":", start, "-", end, ":", strand) %in% SMXE_SJ, ]
  same.end.sj <- same.end.sj[!paste0(seqnames, ":", start, "-", end, ":", strand) %in% SMXE_SJ, ]

  ## AF/LE of same start/end
  tmp1 <- merge(same.start.sj, unique(first_intron_tab[strand == "-", c("seqnames", "start", "end", "strand", "junc")]), by = c("seqnames", "start", "end", "strand"), all.x = T)
  same_start_afe <- same.start.sj[!ID %in% tmp1[is.na(junc), ID], ]
  tmp1 <- merge(same.start.sj, unique(last_intron_tab[strand == "+", c("seqnames", "start", "end", "strand", "junc")]), by = c("seqnames", "start", "end", "strand"), all.x = T)
  same_start_ale <- same.start.sj[!ID %in% tmp1[is.na(junc), ID], ]
  tmp1 <- merge(same.end.sj, unique(first_intron_tab[strand == "+", c("seqnames", "start", "end", "strand", "junc")]), by = c("seqnames", "start", "end", "strand"), all.x = T)
  same_end_afe <- same.end.sj[!ID %in% tmp1[is.na(junc), ID], ]
  tmp1 <- merge(same.end.sj, unique(last_intron_tab[strand == "-", c("seqnames", "start", "end", "strand", "junc")]), by = c("seqnames", "start", "end", "strand"), all.x = T)
  same_end_ale <- same.end.sj[!ID %in% tmp1[is.na(junc), ID], ]
  AFE_SJ <- rbind(same_start_afe, same_end_afe)[, c("seqnames", "start", "end", "strand", "ID")]
  ALE_SJ <- rbind(same_start_ale, same_end_ale)[, c("seqnames", "start", "end", "strand", "ID")]

  AFE_SJ[, seqnames:=as.character(seqnames)]
  AFE_SJ[, start:=as.integer(start)]
  AFE_SJ[, end:=as.integer(end)]
  AFE_SJ[, strand:=as.character(strand)]
  AFE_SJ[, ID:=as.character(ID)]
  data.table::setkey(AFE_SJ, seqnames, start, end, strand)
  ALE_SJ[, seqnames:=as.character(seqnames)]
  ALE_SJ[, start:=as.integer(start)]
  ALE_SJ[, end:=as.integer(end)]
  ALE_SJ[, strand:=as.character(strand)]
  ALE_SJ[, ID:=as.character(ID)]
  data.table::setkey(ALE_SJ, seqnames, start, end, strand)

  #### ASS SJ

  same.start.2.sj2 <- same.start.2.sj2[!paste0(seqnames, ":", start, "-", end1, ":", strand) %in% SMXE_SJ, ]
  same.start.2.sj2 <- same.start.2.sj2[!paste0(seqnames, ":", start, "-", end2, ":", strand) %in% SMXE_SJ, ]

  same.end.2.sj2 <- same.end.2.sj2[!paste0(seqnames, ":", start1, "-", end, ":", strand) %in% SMXE_SJ, ]
  same.end.2.sj2 <- same.end.2.sj2[!paste0(seqnames, ":", start2, "-", end, ":", strand) %in% SMXE_SJ, ]

  same.start.2.sj2[, ID:=paste0(seqnames, ":", start, "-", end1, ":", strand, "@", seqnames, ":", start, "-", end2, ":", strand)]
  same.end.2.sj2[, ID:=paste0(seqnames, ":", start1, "-", end, ":", strand, "@", seqnames, ":", start2, "-", end, ":", strand)]
  same_start_junc2_gr <- GenomicRanges::GRanges(seqnames = as.character(same.start.2.sj2[, seqnames]),
                                                ranges = IRanges::IRanges(start = as.integer(same.start.2.sj2[, end1]) + 1, end = as.integer(same.start.2.sj2[, end2])),
                                                strand = as.character(same.start.2.sj2[, strand]),
                                                ID = same.start.2.sj2[, ID])
  same_end_junc2_gr <- GenomicRanges::GRanges(seqnames = as.character(same.end.2.sj2[, seqnames]),
                                              ranges = IRanges::IRanges(start = as.integer(same.end.2.sj2[, start2]), end = as.integer(same.end.2.sj2[, start1]) - 1),
                                              strand = as.character(same.end.2.sj2[, strand]),
                                              ID = same.end.2.sj2[, ID])
  tmp1 <- data.table::as.data.table(same_start_junc2_gr)
  tmp3 <- data.table::as.data.table(same_end_junc2_gr)

  same_start_ass <- merge(tmp1, exons, by = c("seqnames", "start", "strand"))[end.x < end.y, ]
  same_end_ass <- merge(tmp3, exons, by = c("seqnames", "end", "strand"))[start.x > start.y, ]
  A3SS_junc <- unique(c(same_start_ass[strand == "+", ID], same_end_ass[strand == "-", ID]))
  A5SS_junc <- unique(c(same_start_ass[strand == "-", ID], same_end_ass[strand == "+", ID]))

  A3SS_SJ <- lapply(strsplit(stringr::str_replace_all(A3SS_junc, "-(?=[[:digit:]])", ":"), "[@:]"), function(x){
    matrix(x, ncol = 4, byrow = T, dimnames = list(NULL, c("seqnames", "start", "end", "strand")))
  })
  A3SS_SJ <- data.table::as.data.table(do.call(rbind, A3SS_SJ))
  A3SS_SJ$ID <- rep(A3SS_junc, each = 2)

  A5SS_SJ <- lapply(strsplit(stringr::str_replace_all(A5SS_junc, "-(?=[[:digit:]])", ":"), "[@:]"), function(x){
    matrix(x, ncol = 4, byrow = T, dimnames = list(NULL, c("seqnames", "start", "end", "strand")))
  })
  A5SS_SJ <- data.table::as.data.table(do.call(rbind, A5SS_SJ))
  A5SS_SJ$ID <- rep(A5SS_junc, each = 2)

  if(nrow(A3SS_SJ) > 0) {
    A3SS_SJ[, seqnames:=as.character(seqnames)]
    A3SS_SJ[, start:=as.integer(start)]
    A3SS_SJ[, end:=as.integer(end)]
    A3SS_SJ[, strand:=as.character(strand)]
    A3SS_SJ[, ID:=as.character(ID)]
    data.table::setkey(A3SS_SJ, seqnames, start, end, strand)
    A3SS_SJ[, SJ := paste0(seqnames, ":", start, "-", end, ":", strand)]
  }

  if(nrow(A5SS_SJ) > 0) {
    A5SS_SJ[, seqnames:=as.character(seqnames)]
    A5SS_SJ[, start:=as.integer(start)]
    A5SS_SJ[, end:=as.integer(end)]
    A5SS_SJ[, strand:=as.character(strand)]
    A5SS_SJ[, ID:=as.character(ID)]
    data.table::setkey(A5SS_SJ, seqnames, start, end, strand)
    A5SS_SJ[, SJ := paste0(seqnames, ":", start, "-", end, ":", strand)]
  }

  SE_SJ[, SJ := paste0(seqnames, ":", start, "-", end, ":", strand)]
  MXE_SJ[, SJ := paste0(seqnames, ":", start, "-", end, ":", strand)]
  AFE_SJ[, SJ := paste0(seqnames, ":", start, "-", end, ":", strand)]
  ALE_SJ[, SJ := paste0(seqnames, ":", start, "-", end, ":", strand)]

  SE_SJ[, AS := "SE"]
  MXE_SJ[, AS := "MXE"]
  AFE_SJ[, AS := "AFE"]
  ALE_SJ[, AS := "ALE"]
  A3SS_SJ[, AS := "A3SS"]
  A5SS_SJ[, AS := "A5SS"]

  AFLE <- rbind(AFE_SJ, ALE_SJ)
  AFLE$ID2 <- mapply(FUN = function(x) {paste0(sort(unlist(strsplit(x, "@"))), collapse = "@")}, AFLE$ID)

  A35SS <- rbind(A3SS_SJ, A5SS_SJ)
  A35SS$ID2 <- mapply(FUN = function(x) {paste0(sort(unlist(strsplit(x, "@"))), collapse = "@")}, A35SS$ID)

  A35SS <- A35SS[!ID2 %in% AFLE$ID2, ]
  AFLE[, ID2 := NULL]
  A35SS[, ID2 := NULL]

  if(nrow(SE_SJ) == 0) {
    SE_SJ <- NULL
  }

  if(nrow(MXE_SJ) == 0) {
    MXE_SJ <- NULL
  }

  if(nrow(AFLE) == 0) {
    AFLE <- NULL
  }

  if(nrow(A35SS) == 0) {
    A35SS <- NULL
  }

  AS_SJ <- rbind(SE_SJ, MXE_SJ, AFLE, A35SS) # 属于多个事件的 SJ 做成 Cluster,没有 assign 到事件的 SJ 做成 Unknown 或者 Cluster
  data.table::setkey(AS_SJ, SJ)

  cluster <- mapply(AS_SJ[, .N, by = "SJ"][N > 1, SJ], FUN = function(x) AS_SJ[x, paste0(AS, collapse = "; ")])

  if(length(cluster) != 0) {
    unknown <- Reduce(union, lapply(names(cluster), function(x) grep(pattern = x, AS_SJ$ID)))
    AS_SJ <- AS_SJ[-c(unknown), ]
  }

  sjInfo[, SJ := paste0(seqnames, ":", start, "-", end, ":", strand)]
  unknown <- sjInfo[!SJ %in% AS_SJ$SJ, ]
  unknown <- merge(unknown, grInfo[, .(SJ, ID)], by = "SJ")
  unknown$AS <- "Unknown"

  all_res <- rbind(Cluster[, .(SJ, AS, ID)], AS_SJ[, .(SJ, AS, ID)], unknown[, .(SJ, AS, ID)])

  # return AS_Type

  res_gr <- as(all_res$SJ, "GRanges")
  res_gr$AS <- all_res$AS
  res_gr$ID <- all_res$ID

  object@ASType <- sort(res_gr)
  return(object)
}
tangchao7498/ICAS documentation built on Jan. 28, 2021, 3:56 p.m.