R/getStats.R

Defines functions tableFromIdxstats .obtainReadNumber .parseBowtie2LogPE .parseBowtie2LogSE .getStatsSingle getStats

Documented in getStats tableFromIdxstats

#' Get alignment and processing stats from BAM files
#'
#' Given a Logs/ folder generated by the processing functions in this package,
#' returns the statistics for alignment, chrM, duplication rate and number of
#' peaks called.
#' @param path_logs Path where the log files from samtools mrkdup are stored.
#' @param path_peaks Path where the peaks are saved.
#' @param peak_suffix Type of the peak files, either narrowPeak or broadPeak
#' @return Data.frame with some alignment and processing statistics for all the input samples.
#' When files are PE, it returns the total number of mates, not reads.
#' @export
#' @examples
#' \dontrun{
#' stats <- getStats(raw_bam=c("path/to/file.bam", "path/to/file2.bam"), path_logs="path/logs")
#' }
getStats <- function(path_logs="Logs/",
                     path_peaks="Peaks/",
                     peak_suffix="narrowPeak") {

  names <- gsub(".raw.idxstats.log", "", list.files(path_logs, pattern=".raw.idxstats.log"))
  stats <- lapply(names,
                  .getStatsSingle,
                  path_logs=path_logs,
                  path_peaks=path_peaks,
                  peak_suffix=peak_suffix)
  stats <- as.data.frame(do.call(rbind, stats),
                         stringsAsFactors=FALSE)
  return(stats)
}

.getStatsSingle <- function(name,
                            path_logs,
                            path_peaks,
                            peak_suffix) {

  # Alignment stats from log file -----
  al_log <- file.path(path_logs, paste0(name, ".alignment.log"))
  if (file.exists(al_log)) {
    log <- readLines(con = al_log)

    if (!any(grepl("unpaired", log))) {   # If stats are from PE file
      paired <- TRUE
      stats <- .parseBowtie2LogPE(file.path(path_logs, paste0(name, ".alignment.log")))
    } else { # If stats are from SE file
      paired <- FALSE
      stats <- .parseBowtie2LogSE(file.path(path_logs, paste0(name, ".alignment.log")))
    }

    total_reads <- stats$total
    total_aligned <- stats$unique + stats$multi
    multi <- stats$multi
    alignment_rate <- total_aligned/total_reads*100

  } else {
    paired <- FALSE
    total_reads <- NA
    total_aligned <- NA
    multi <- NA
    alignment_rate <- NA
  }

  # Chr stats
  idxstats <- tableFromIdxstats(file.path(path_logs, paste0(name, ".raw.idxstats.log")))
  chrM <- idxstats$aligned[idxstats$chr=="chrM"]

  if (is.na(total_aligned)) {
    total_aligned <- sum(idxstats$aligned)
    total_reads <- sum(idxstats$aligned) + sum(idxstats$not_aligned)
  }

  # Get duplicates stats
  if (file.exists(file.path(path_logs, paste0(name, ".rmdup.log")))) {
    dup.tab <- read.delim(file.path(path_logs, paste0(name, ".rmdup.log")),
                          stringsAsFactors=FALSE, header=FALSE)
    duplicates <- as.numeric(strsplit(dup.tab[grep("DUPLICATE TOTAL", dup.tab[,1]),1], ": ")[[1]][2])
  } else {
    duplicates <- NA
  }

  # Get number of peaks called
  files <- list.files(path_peaks,
                      pattern=peak_suffix,
                      full.names=T)
  files <- files[grep(name, files)]

  if (length(files)==1) {
    peaks <- as.numeric(strsplit(system(paste("wc -l", files), intern = T), " ")[[1]][1])
  } else {
    peaks <- NA
  }

  # Get final number of reads
  stats_final <-tableFromIdxstats(file.path(path_logs, paste0(name, ".idxstats.log")))
  final_reads <- sum(stats_final$aligned)

  if (paired) {
    chrM <- chrM/2
    duplicates <- duplicates/2
    final_reads <- final_reads/2

  }

  # Create data.frame
  df <- data.frame(sampleID=name,
                   total_reads=total_reads,
                   aligned_reads=total_aligned,
                   multi_reads=multi,
                   chrM_reads=chrM,
                   duplicate_reads=duplicates,
                   final_reads=final_reads,
                   alignment_rate=total_aligned/total_reads*100,
                   chrM_rate=chrM/total_reads*100,
                   duplicate_rate=duplicates/total_reads*100,
                   final_rate=final_reads/total_reads*100,
                   num_peaks=peaks)
}

.parseBowtie2LogSE <- function(logfile) {
  log <- readLines(con = logfile)
  total <- .obtainReadNumber(log[grep("were unpaired", log)])
  unaligned <- .obtainReadNumber(log[grep("aligned 0 times", log)])
  unique <- .obtainReadNumber(log[grep("aligned exactly 1 time", log)])
  multi <- .obtainReadNumber(log[grep("aligned >1 times", log)])

  stats <- data.frame(total, unaligned, unique, multi)
  return(stats)
}

.parseBowtie2LogPE<- function(logfile) {
  log <- readLines(con = logfile)
  total <- .obtainReadNumber(log[grep("were paired", log)])*2
  unaligned <- .obtainReadNumber(log[grep(") aligned concordantly 0 times", log)])
  unique <- .obtainReadNumber(log[grep("aligned concordantly exactly 1 time", log)])
  multi <- .obtainReadNumber(log[grep(" aligned concordantly >1 times", log)])

  stats <- data.frame(total/2, unaligned, unique, multi)
  return(stats)
}


.obtainReadNumber <- function(line) {
  as.numeric(strsplit(gsub(" ", "", line), "(", fixed=TRUE)[[1]][1])
}

#' Table form idxstats
#'
#' Given a bam file, returns samtools idxstats output as a data.frame.
#' @param bam Character string with the name and path of a single BAM file. Index should
#' be in the same folder.
#' @return A data.frame with the output of samtools idxstats.
#' @export
#' @examples
#' \dontrun{
#' df <- tableFromIdxstats("path/to/file.bam")
#' }
tableFromIdxstats <- function(log) {
  stats <- read.delim(log)
  colnames(stats) <- c("chr", "chr_len", "aligned", "not_aligned")
  stats$chr_len <- as.numeric(stats$chr_len)
  stats$aligned <- as.numeric(stats$aligned)
  stats$not_aligned <- as.numeric(stats$not_aligned)
  return(stats)
}
mireia-bioinfo/pipelineNGS documentation built on Jan. 2, 2023, 11:18 a.m.