R/MitoTrace.R

Defines functions MitoDepth calc_allele_frequency MitoTrace checkSequenceNames

Documented in checkSequenceNames

#' @title  MitoTrace - an R package for the investigation of mitochondrial heteroplasmies from single-cell RNA sequencing data.
#'
#' @description  The MitoTrace function calculates mitochondrial heteroplasmies in (single-cell) RNA sequencing data based on the reads pileups of the mitochondrial genome.
#'
#' @usage MitoTrace(bam_list = bams, ref_fasta = fasta_loc, name = "MT", max_depth = "",
#'        min_base_quality = "", min_mapq = "", min_nucleotide_depth = "", min_minor_allele_depth = "")
#'
#' @param bams_list Vector of absolute path(s) pointing to BAM alignment file(s).
#' @param ref_fasta Absolute path to the mitochondrial reference genome in FASTA format.
#' @param name Name of mitochondrial genome as specified in the BAM files. Sequence names can be check with the checkSequenceNames() function.
#' @param tag_name The name of the tag corresponding the cellular barcode. Default = "CB". For droplet scRNA-seq only.
#' @param barcodes The barcode list corresponding to the cells. For 10X genomics scRNA-seq data only.
#' @param min_read The minimum number of read counts to be considered a valid barcode (cell) in the analysis. Default = 1000. For droplet scRNAseq technologies only.
#' @param max_depth The maximum depth of reads considered at any position.
#' @param min_base_quality The minimum read base quality below which the base is ignored when summarizing pileup information.
#' @param min_mapq The minimum mapping quality below which the entire reads is ignored.
#' @param min_nucleotide_depth integer(1); minimum count of each nucleotide at a given position required for said nucleotide to appear in the result.
#' @param min_minor_allele_depth integer(1);  minimum count of all nucleotides other than the major allele at agiven position.
#'
#' @details result <- MitoTrace(bam_list = bams, ref_fasta = fasta_loc, chr_name = "MT", max_depth = "1e6", min_base_quality=25, min_mapq=30, min_nucleotide_depth=0, min_minor_allele_depth=0)
#' @details See packageDescription("MitoTrace") for more details.
#'
#' @note This package could not only apply for the analysis of single-cell data but also for bulk seuqencing data.
#'
#' @return Read counts matrix and coverage matrix.
#'
#' @author Mingqiang WANG <mqwangcuhk@gmail.com>, Simon Lukas <lkmklsmn@gmail.com>
#'
#' @references The current source code of MitoTrace is from https://github.com/lkmklsmn/MitoTrace.


# check BAM chromosome names
checkSequenceNames <- function(bam_list){
  require(Rsamtools)
  tmp <- scanBamHeader(bam_list[[1]])
  targets <- names(tmp[[1]][[1]])
  #text <- names(tmp[[1]][[2]])
  targets#[which(text == "@SQ")]
}

# define the MitoTrace main function
MitoTrace <- function(bam_list = bams,
                      fasta = fasta_loc,
                      chr_name = "MT",
                      tag_name = "CB",
                      barcodes = NULL,
                      min_read = 1000,
                      max_depth= 1e6,
                      min_base_quality= 25,
                      min_mapq = 30,
                      min_nucleotide_depth = 0,
                      min_minor_allele_depth = 0){
  require(Rsamtools)
  require(Matrix)
  require(seqinr)

#  if(length(bam_list) == 1) {
#    singlefile <- T
#    }
#  else{
#    singlefile=F
#    }

#  if(singlefile) bam_list <- rep(bam_list, 2)

  bases <- c("A", "C", "G", "T")

  # Load in reference FASTA
  reffasta <- seqinr::read.fasta(fasta)
  mitoChr = attr(reffasta, "name")
  maxpos = length(reffasta[[1]])
  reference_genome <- data.frame(
    postion = 1:maxpos,
    base = toupper(unname(reffasta)[[1]])[1:maxpos]
  )

  which <- GRanges(seqnames = chr_name, ranges = IRanges(1, maxpos))

  if(length(bam_list) == 1) combinedBam <- TRUE
  if(length(bam_list) > 1) combinedBam <- FALSE
  # Define list of BAM files
  bam_name_list_array <- BamFileList(bam_list)

  if(combinedBam){
    print("Extracting barcodes from single BAM file and running pileup for each barcode separately")

    params <- ScanBamParam(tag = tag_name, which = which)

    if(is.null(barcodes)){
      barcodes <- scanBam(bam_list, param = params)
      good_barcodes <- names(which(table(barcodes[[1]][[1]][[1]]) > min_read))
    }
    if(!is.null(barcodes)){
      good_barcodes <- barcodes
    }

    # Run pileup command
    total_mpileups <- lapply(good_barcodes, function(x){

      filter <- list(x)
      names(filter) <- tag_name

      pileup_bam <- pileup(bam_list,
                           scanBamParam=ScanBamParam(tagFilter = filter, which=which),
                           pileupParam=PileupParam(distinguish_strands= FALSE,
                                                   max_depth= max_depth,
                                                   min_base_quality=min_base_quality,
                                                   min_mapq=min_mapq,
                                                   min_nucleotide_depth=min_nucleotide_depth,
                                                   min_minor_allele_depth=min_minor_allele_depth
                           ))
      bases <- c("A", "T", "C", "G")
      base_counts <- lapply(bases, function(base){
        mutation <- subset(pileup_bam, nucleotide == base)
        data.frame(mutation$pos, mutation$count)
      })

      names(base_counts) <- bases
      base_counts

    })

    names(total_mpileups) <- good_barcodes
  }
  else{
    print("Running pileup on each BAM file separately")

    # Define list of BAM files
    bam_name_list_array <- BamFileList(bam_list)

    # Run pileup command
    total_mpileups <- lapply(bam_name_list_array, function(x){

      pileup_bam <- pileup(x,
                           scanBamParam=ScanBamParam(which=which),
                           pileupParam=PileupParam(distinguish_strands= FALSE,
                                                   max_depth= max_depth,
                                                   min_base_quality=min_base_quality,
                                                   min_mapq=min_mapq,
                                                   min_nucleotide_depth=min_nucleotide_depth,
                                                   min_minor_allele_depth=min_minor_allele_depth
                           ))
      bases <- c("A", "T", "C", "G")
      base_counts <- lapply(bases, function(base){
        mutation <- subset(pileup_bam, nucleotide == base)
        data.frame(mutation$pos, mutation$count)
      })

      names(base_counts) <- bases
      base_counts

    })

    nom <- unlist(lapply(bam_list, basename))
    names(total_mpileups) <- nom
  }

  # Count number of bases at each position
  res_counts <- lapply(bases, function(base){

    allpos <- 1:maxpos

    counts_base_allcells <- do.call(cbind, lapply(total_mpileups, function(x){
      count_base_cell <- x[[base]]
      count_base_cell$mutation.count[match(allpos, count_base_cell$mutation.pos)]
    }))

    counts_base_allcells[which(is.na(counts_base_allcells))] <- 0
    rownames(counts_base_allcells) <- paste0(allpos, reference_genome$base, ">", base)
    colnames(counts_base_allcells) <- names(total_mpileups)
    as(counts_base_allcells, "sparseMatrix")
  })
  names(res_counts) <- bases

  # Calculate total read coverage at each position
  coverage <- matrix(0, nrow(res_counts[[1]]), ncol(res_counts[[1]]))
  rownames(coverage) <- as.character(1:maxpos)
  colnames(coverage) <- colnames(res_counts[[1]])
  lapply(1:4, function(x) coverage <<- coverage + data.matrix(res_counts[[x]]))

  # Remove reference allele sites
  res_counts2 <- lapply(bases, function(base){
    tmp <- res_counts[[base]]
    tmp[which(reference_genome$base != base),]
  })
  names(res_counts2) <- bases

  # Merge into one table and order
  allpos <- unique(unlist(lapply(res_counts2, rownames)))
  pos_tmp <- allpos
  pos_tmp <- unlist(lapply(pos_tmp, function(x) substr(x, 1, nchar(x) - 3)))
  pos_tmp <- as.numeric(pos_tmp)
  allpos <- allpos[order(pos_tmp)]
  matr <- matrix(0, length(allpos), ncol(res_counts2[["A"]]))
  rownames(matr) <- allpos
  colnames(matr) <- colnames(res_counts2[["A"]])
  lapply(res_counts2, function(x){
    ok <- intersect(rownames(matr), rownames(x))
    matr[ok,] <<- data.matrix(x[ok,])
  })

  counts <- matr

  counts <- as(counts, "sparseMatrix")
  coverage <- as(coverage, "sparseMatrix")

#  if(singlefile){
#    nom <- colnames(counts)[1]
#    counts <- as.data.frame(counts[,1])
#    coverage <- as.data.frame(coverage[,1])
#    colnames(counts) <- colnames(coverage) <- nom
#  }

  return(list(read_counts = counts, coverage = coverage))
}

# define function to calculate allele frequency based on MitoTrace Rdata object
calc_allele_frequency <- function(object){
  tmp <- rownames(object[[1]])
  pos <- unlist(lapply(tmp, function(x) substr(x, 1, nchar(x) - 3)))
  pos <- as.numeric(pos)
  (object[[1]])/(object[[2]][pos,] + 0.0001)
}

# define the MitoTrace plot coverage depth function
MitoDepth <- function(mae = mae_res, species = "human", mt_ann = mt_ann){

  # set the gene label location
    if(species == "human"){
    location_human <- c(288.55, 577, 1124.55, 1602, 2450.05, 3230, 3784.55,
                3900, 4300, 4700, 4990.55, 5150, 5550, 5950, 6350, 6750,
                7150.55, 7300, 7718, 7927.55, 8295, 8469.05, 8867.05, 9598.55,
                9991, 10231.55, 10405, 10618.05, 11448.55, 11750, 12200, 12650,
                13242.55, 14411.05, 14674, 15317.05, 15700, 16200, 16584.55)
    }

    # set color
    require("RColorBrewer")
    col= c(brewer.pal(n = 12, name = "Set3"), brewer.pal(n = 11, name = "Spectral"), brewer.pal(n = 9, name = "Set1"), brewer.pal(n = 8, name = "Accent"))

    # plot the coverave depth # single sample
    ymax <- max(log(mae[[2]][,1]))
    plot(row.names(mae[[2]]), log(mae[[2]][,1]),
         type = "l",
         col="brown1",
         xlab="MT genome postion",
         ylab="log2(Coverage depth)",
         main="scRNA-seq coverage depth cross MT genome",
         ylim=c(-5,ymax))

    # plot the MT gene bar
    for (i in 1:39) {
      if(grepl("TR", mt_ann$gene[i])){
        rect(mt_ann[i,2], -2.5, mt_ann[i,3], -1, col=col[i], border = "white")
        text(location[i], -0.5, mt_ann$gene[i], cex = 0.9)
      }
      else if(grepl("RN", mt_ann$gene[i])){
        rect(mt_ann[i,2], -2.5, mt_ann[i,3], -1, col=col[i], border = "white")
        text(location[i], -0.5, mt_ann$gene[i], cex = 0.9)
      }
      else{
        rect(mt_ann[i,2], -4, mt_ann[i,3], -2.5, col=col[i], border = "white")
        text(location[i], -4.5, mt_ann$gene[i], cex = 0.9)
      }
    }

  }
lkmklsmn/MitoTrace documentation built on June 29, 2023, 5:55 a.m.