R/process_reads.R

Defines functions get_total_reads sort_index_idxstats_bam index_fa

Documented in get_total_reads index_fa sort_index_idxstats_bam

#' @title Indexes the FASTA file, and generates a \sQuote{chrom.sizes} file
#' @description Indexes the FASTA file, and generates a \sQuote{chrom.sizes} file
#' with chromosome numbers and the size of each chromosome.
#' @param fa_file FASTA file of reference genome sequence.
#' @param chromsize_file Path of the output \sQuote{chrom.sizes} file.
#' @param outdir Output directory (default: saves to the directory of the FASTA file).
#' @export
#' @examples
#' \dontrun{
#' index_fa('hg38.fa', chromsize_file='hg38.chrom.sizes')
#' }
index_fa <- function(fa_file,
                     chromsize_file='chrom.sizes',
                     outdir=dirname(fa_file)){

  cat('Indexing FASTA file and generating chrom.sizes file ...\n')
  faidx_file <- Rsamtools::indexFa(fa_file)
  faidx <- read.table(faidx_file, sep='\t')
  write.table(faidx[,1:2], chromsize_file, quote=FALSE,
              col.names=FALSE, row.names=FALSE, sep='\t')

}

#' @title Sorts, indexes the BAM file, and retrieves the \code{idxstats}.
#' @description Sorts and indexes the BAM file, and
#' retrieves the \code{idxstats} (summary of
#' the number of mapped reads on every chromosome) using \code{Rsamtools} package.
#' @param bam_file Input BAM file.
#' @param sorted_bam_file Output file name for sorted BAM file if \code{sort=TRUE}.
#' @param sort Logical. If TRUE, sorts the BAM file.
#' @param index Logical. If TRUE, indexes the BAM file.
#' @param idxstats Logical. If TRUE, retrieves \code{idxstats} summary of
#' the number of mapped reads in each chromosome.
#' @export
#' @examples
#' \dontrun{
#' # Sorts, indexes the BAM file, and retrieves the idxstats.
#' sort_index_idxstats_bam('example.bam', sort=TRUE, index=TRUE, idxstats=TRUE)
#'
#' # Indexes the BAM file, and retrieves the idxstats, using sorted BAM file.
#' sort_index_idxstats_bam('example.sorted.bam', sort=FALSE, index=TRUE, idxstats=TRUE)
#' }
sort_index_idxstats_bam <- function(bam_file,
                                    sorted_bam_file,
                                    sort=TRUE,
                                    index=TRUE,
                                    idxstats=TRUE) {

  if( sort ) {
    # Sort BAM file
    if(missing(sorted_bam_file)){
      bam_prefix <- gsub('\\.bam','',basename(bam_file))
      sorted_bam_file <- file.path(dirname(bam_file), paste0(bam_prefix, '.sorted.bam'))
    }
    cat('Sorting BAM file...\n')
    Rsamtools::sortBam(bam_file, gsub('\\.bam','',basename(sorted_bam_file)))
  }else{
    sorted_bam_file <- bam_file
  }

  if( index ) {
    # Index BAM file
    cat('Indexing BAM file...\n')
    Rsamtools::indexBam(sorted_bam_file)
  }

  if( idxstats ){
    # Get idxstats (number of mapped reads on every chromosomes)
    cat('Retrieving idxstats ...\n')
    bam_prefix <- gsub('\\.bam','',basename(sorted_bam_file))
    idxstats_file <- file.path(dirname(bam_file), paste0(bam_prefix, '.bam.idxstats.txt'))
    idxstats <- Rsamtools::idxstatsBam(sorted_bam_file)
    write.table(idxstats, idxstats_file, col.names=TRUE, row.names = FALSE,
                quote=FALSE, sep='\t')
  }

}


#' @title Gets total number of mapped reads from the \code{idxstats} file
#' @description Loads the \code{idxstats} file, and counts
#' the total number of mapped reads generated by \code{bam_sort_index_stats}.
#' @param idxstats_file \code{idxstats} file.
#' @param select_chr Logical. If TRUE, counts reads in
#' chromosomes specified in \sQuote{chrs}.
#' Otherwise, uses all chromosomes in \code{idxstats_file}.
#' @param chrs Chromosomes to be included (default: chr1, ..., chr22).
#' @return The total number of mapped reads.
#' @export
#' @examples
#' \dontrun{
#' total_mapped_reads <- get_total_reads('sample.bam.idxstats.txt')
#' }
get_total_reads <- function(idxstats_file, select_chr = TRUE, chrs = paste0('chr', c(1:22))) {
  count.df <- as.data.frame(data.table::fread(idxstats_file, sep = '\t'))
  names(count.df) <- c('chr', 'length', 'reads_mapped', 'reads_unmapped')

  if( select_chr ){
    total_count <- sum(count.df[which(count.df$chr %in% chrs), 'reads_mapped'])
  } else {
    total_count <- sum(count.df[, 'reads_mapped'])
  }

  return(total_count)
}
HarteminkLab/TOP documentation built on July 27, 2023, 6:14 p.m.