R/countReads.R

Defines functions countReads

Documented in countReads

#' Count and summarize long-reads RNA into a table
#'
#' A function that ouputs two tsv files containing counts of exon combinations
#'
#' @param alnsFile The file that store the long-read RNA alignments generated by Minimap2.
#' @param referencesFile A file that store reference coordinates of human genes.
#'
#' @return A list of two data frames. Fisrt data frame contains new exon coordinates
#' created and second data frame contains the exon combinations of each read.
#'
#' @examples
#' alignments <- system.file("extdata", "mlx_reads.sorted.bam",
#' package = "LSplicing")
#' referenceCoord <- system.file("extdata", "example_refCoord.gff3",
#' package = "LSplicing")
#'
#' \dontrun{
#' countReads(alns = alignments, refCoord = referenceCoord)
#' }
#' @export
#' @import GenomicRanges
#' @import GenomicAlignments
#' @import SummarizedExperiment
#' @import rtracklayer
#' @importFrom plyr rbind.fill
#' @importFrom utils write.table

countReads <- function(alnsFile, referencesFile) {
  if (! (file.exists(alnsFile) && file.exists(referencesFile))) {
    stop("No such file, please check the path to files")
  }

  # read a BAM file into a GAlignment object
  alns <- GenomicAlignments::readGAlignments(alnsFile)

  # read a BED file into a GRanges Object
  refCoord <- rtracklayer::import(referencesFile)

  # store results of exon combinations
  results <- data.frame()
  # store results of new exon coordinates
  gList <- data.frame()

  # Extract gene in refCoord
  idx <- refCoord@elementMetadata@listData[["type"]] == "gene"
  genes <- refCoord[idx]

  # loop through every alignment and map them to according gene exons
  for (k in seq_along(alns)) {
    gene <- data.frame(union = SummarizedExperiment::assay(GenomicAlignments::summarizeOverlaps(genes, alns[k], inter.feature=FALSE)))

    # If this read maps to more than one gene, then skip this read
    if (sum(gene$reads) == 1) {
      (matchIdx <- which(gene[,1] > 0))
      genes[matchIdx]

      # find ensembl gene ID of this read
      geneId <- genes[matchIdx]@elementMetadata@listData[["gene_id"]]

      # filter refCoord
      dup <- sort(refCoord[refCoord@elementMetadata@listData[["gene_id"]] == geneId &
                             refCoord@elementMetadata@listData[["type"]] == "exon" &
                             refCoord@elementMetadata@listData[["transcript_type"]] == "protein_coding"])

      # generate new exon coordinates
      combined <- combineTranscripts(gList, dup, geneId)
      newGranges <- combined[[1]]
      gList <- combined[[2]]

      # map read to new exon coordinates
      read <- data.frame(union = SummarizedExperiment::assay(GenomicAlignments::summarizeOverlaps(newGranges,
                                                                                                  alns[k], inter.feature=FALSE), byrow = TRUE))
      sum(read$reads)

      # Transpose the data frame
      t <- t(read)
      colnames(t) <- paste(rep("exon", ncol(t)), sep = "", 1:ncol(t))
      newDf <- cbind(data.frame("index" = k, geneId), t)

      # Assign row name
      ## use plyr::rbind.fill(x,y) to bind data frames
      results <- plyr::rbind.fill(results, newDf)
    }
  }

  r <- list(gList, results)

  return(r)
}



# [END]
leetina4/LSplicing documentation built on Dec. 8, 2019, 1:34 p.m.