R/HostGene.R

Defines functions FUN FUN FUN FUN HostGeneOfSJ

Documented in HostGeneOfSJ

#' This function for host gene of SJ
#'
#' @rdname HostGeneOfSJ
#' @name HostGeneOfSJ
#' @title HostGeneOfSJ
#'
#' @param object an ICASDataSet
#' @param sjRanges GRanges-class of splice junctions or introns.
#' @param gtfFile The file.path of GTF file when you used in STAR allignment.
#' @param NT The number of cores in multithreaded parallel computing.
#'
#' @importFrom rtracklayer readGFF
#' @import GenomicFeatures
#' @importFrom GenomicRanges GRanges
#' @importFrom IRanges IRanges
#' @importFrom S4Vectors nrow
#' @importFrom S4Vectors DataFrame
#' @importFrom data.table data.table
#' @importFrom data.table as.data.table
#' @importFrom data.table setkey
#' @importFrom GenomicFeatures makeTxDbFromGFF
#' @importFrom GenomicFeatures transcripts
#' @importFrom GenomicFeatures intronsByTranscript
#' @importFrom S4Vectors mcols
#' @importFrom parallel mcmapply
#' @importFrom S4Vectors runValue
#' @importFrom GenomeInfoDb seqnames
#' @importFrom BiocGenerics start
#' @importFrom BiocGenerics end
#' @importFrom BiocGenerics strand
#' @importFrom GenomicFeatures genes
#' @importFrom GenomicRanges GRangesList
#' @importFrom S4Vectors subjectHits
#' @importFrom GenomicRanges findOverlaps
#' @export
#' @author Tang Chao

HostGeneOfSJ <- function(sjRanges, gtfFile, NT = 1){
  if(!is(sjRanges, "GRanges"))
    stop("The sjRanges object must be a GRanges data")
  if(!file.exists(gtfFile))
    stop("The GTF file does not exist! And you should provide the GTF file you used in STAR alignment.")

  if(!is.numeric(NT)) {
    stop("NT must be positive integer")
  }

  if(NT <= 0) {
    stop("NT must be positive integer")
  }

  NT <- as.integer(NT)

  gtf <- data.table::as.data.table(rtracklayer::readGFF(gtfFile))
  gene_biotype_index <- colnames(gtf)[grepl("gene", colnames(gtf)) & grepl("type", colnames(gtf))]

  sc <- c("gene_id", "gene_name", gene_biotype_index)
  gene_info <- unique(gtf[, ..sc])
  colnames(gene_info) <- c("gene_id", "gene_name", "biotype")
  data.table::setkey(gene_info, gene_id)

  # gene id ---------------------------------
  txdb <- suppressWarnings(suppressMessages(GenomicFeatures::makeTxDbFromGFF(file = gtfFile, format = "gtf")))
  tx <- GenomicFeatures::transcripts(txdb, columns = c("gene_id", "tx_name"), filter = NULL, use.names = TRUE)

  it_by_tx <- unlist(GenomicFeatures::intronsByTranscript(x = txdb, use.names = TRUE))
  tx_SpliceSite <- data.table::data.table(start = with(as.data.frame(it_by_tx), paste0(seqnames, ":", start, ":", strand)),
                                          end = with(as.data.frame(it_by_tx), paste0(seqnames, ":", end, ":", strand)),
                                          tx_id = names(it_by_tx))
  tx_SpliceSite <- data.table::melt(tx_SpliceSite, id = "tx_id", value.name = "SpliceSite")[, .(tx_id, SpliceSite)]
  tx_SpliceSite <- merge(tx_SpliceSite, S4Vectors::mcols(tx), by.x = "tx_id", by.y = "tx_name")
  tx_SpliceSite$gene_id <- if(length(unlist(tx_SpliceSite$gene_id)) == nrow(tx_SpliceSite)) unlist(tx_SpliceSite$gene_id) else mcmapply(paste, tx_SpliceSite$gene_id, collapse = ";", mc.cores = NT)
  tx_SpliceSite <- data.table::as.data.table(tx_SpliceSite)
  tx_SpliceSite[, tx_id := NULL]
  tx_SpliceSite <- unique(tx_SpliceSite)
  data.table::setkey(tx_SpliceSite, SpliceSite)

  parallel::mcmapply(seq_along(sjRanges), FUN = function(i) {
    g1 <- tx_SpliceSite[with(as.data.frame(sjRanges[i]), paste0(seqnames, ":", start, ":", strand)), gene_id]
    g2 <- tx_SpliceSite[with(as.data.frame(sjRanges[i]), paste0(seqnames, ":", end, ":", strand)), gene_id]

    g1 <- if(all(is.na(g1))) NULL else g1
    g2 <- if(all(is.na(g2))) NULL else g2

    if(is.null(g1) & is.null(g2)) {
      ge <- NA
    }

    if(!is.null(g1) & is.null(g2)) {
      if(length(g1) == 1) {
        ge <- g1
      } else {
        ge <- NA
      }
    }

    if(is.null(g1) & !is.null(g2)) {
      if(length(g2) == 1) {
        ge <- g2
      } else {
        ge <- NA
      }
    }

    if(!is.null(g1) & !is.null(g2)) {
      if(length(intersect(g1, g2)) == 1) {
        ge <- intersect(g1, g2)
      } else {
        if(length(g1) == 1 & length(g2) == 1) {
          if(runValue(strand(sjRanges[i])) == "+") {
            ge <- paste(g1, g2, sep = "-") # fusion
          } else {
            ge <- paste(g2, g1, sep = "-") # fusion
          }
        } else {
          ge <- NA
        }
      }
    }
    return(ge)
  }, mc.cores = NT) -> sj_ge
  sjRanges$gene_id <- sj_ge

  # mapRangesToIds ----
  txdb_ge <- GenomicFeatures::genes(txdb, columns = "gene_id")

  sjRanges1 <- sjRanges[is.na(sjRanges$gene_id), ]
  sjRangesList <- GenomicRanges::GRangesList(lapply(seq_along(sjRanges1), FUN = function(x) sjRanges1[x]))
  names(sjRangesList) <- sjRanges1$ID

  parallel::mcmapply(seq_along(sjRangesList), FUN = function(i) {
    Hits <- S4Vectors::subjectHits(GenomicRanges::findOverlaps(sjRangesList[[i]], txdb_ge, type = "within"))
    if(length(Hits) == 1) {
      txdb_ge$gene_id[Hits]
    } else {
      NA
    }
  }, mc.cores = NT) -> sjRanges[is.na(sjRanges$gene_id), ]$gene_id

  # gene_name -------------------------------

  parallel::mcmapply(sjRanges$gene_id, FUN = function(x) {
    if(is.na(x)) {
      NA
    } else {
      i <- strsplit(x, split = "[;-]")
      gene_info[i, paste(gene_name, collapse = "-")]
    }
  }, mc.cores = NT) -> sjRanges$gene_name

  # biotype -------------------------------

  parallel::mcmapply(sjRanges$gene_id, FUN = function(x) {
    if(is.na(x)) {
      NA
    } else {
      i <- strsplit(x, split = "[;-]")
      gene_info[i, paste(biotype, collapse = "-")]
    }
  }, mc.cores = NT) -> sjRanges$biotype

  # return host gene ------------------------
  return(sjRanges)
}


#' @rdname HostGeneOfSJ
#' @name HostGeneIdentify
#' @title HostGeneIdentify
#'
#' @export

HostGeneIdentify <- function(object, gtfFile, NT = 1) {
  if(!is(object, "ICASDataSet"))
    stop("The object must be a ICASDataSet data")
  if(!file.exists(gtfFile))
    stop("The GTF file does not exist! And you should provide the GTF file you used in STAR alignment.")

  if(!is.numeric(NT)) {
    stop("NT must be positive integer")
  }

  if(NT <= 0) {
    stop("NT must be positive integer")
  }

  NT <- as.integer(NT)

  if(nrow(psi(object)) == 0) {
    gr <- as(row.names(counts(object)), "GRanges")
  } else {
    gr <- as(row.names(psi(object)), "GRanges")
  }
  object@HostGene <- HostGeneOfSJ(sjRanges = gr, gtfFile = gtfFile, NT = NT)
  return(object)
}
tangchao7498/ICAS documentation built on Jan. 28, 2021, 3:56 p.m.