#' Get alignment and processing stats from BAM files
#'
#' Given a Logs/ folder generated by the processing functions in this package,
#' returns the statistics for alignment, chrM, duplication rate and number of
#' peaks called.
#' @param path_logs Path where the log files from samtools mrkdup are stored.
#' @param path_peaks Path where the peaks are saved.
#' @param peak_suffix Type of the peak files, either narrowPeak or broadPeak
#' @return Data.frame with some alignment and processing statistics for all the input samples.
#' When files are PE, it returns the total number of mates, not reads.
#' @export
#' @examples
#' \dontrun{
#' stats <- getStats(raw_bam=c("path/to/file.bam", "path/to/file2.bam"), path_logs="path/logs")
#' }
getStats <- function(path_logs="Logs/",
path_peaks="Peaks/",
peak_suffix="narrowPeak") {
names <- gsub(".raw.idxstats.log", "", list.files(path_logs, pattern=".raw.idxstats.log"))
stats <- lapply(names,
.getStatsSingle,
path_logs=path_logs,
path_peaks=path_peaks,
peak_suffix=peak_suffix)
stats <- as.data.frame(do.call(rbind, stats),
stringsAsFactors=FALSE)
return(stats)
}
.getStatsSingle <- function(name,
path_logs,
path_peaks,
peak_suffix) {
# Alignment stats from log file -----
al_log <- file.path(path_logs, paste0(name, ".alignment.log"))
if (file.exists(al_log)) {
log <- readLines(con = al_log)
if (!any(grepl("unpaired", log))) { # If stats are from PE file
paired <- TRUE
stats <- .parseBowtie2LogPE(file.path(path_logs, paste0(name, ".alignment.log")))
} else { # If stats are from SE file
paired <- FALSE
stats <- .parseBowtie2LogSE(file.path(path_logs, paste0(name, ".alignment.log")))
}
total_reads <- stats$total
total_aligned <- stats$unique + stats$multi
multi <- stats$multi
alignment_rate <- total_aligned/total_reads*100
} else {
paired <- FALSE
total_reads <- NA
total_aligned <- NA
multi <- NA
alignment_rate <- NA
}
# Chr stats
idxstats <- tableFromIdxstats(file.path(path_logs, paste0(name, ".raw.idxstats.log")))
chrM <- idxstats$aligned[idxstats$chr=="chrM"]
if (is.na(total_aligned)) {
total_aligned <- sum(idxstats$aligned)
total_reads <- sum(idxstats$aligned) + sum(idxstats$not_aligned)
}
# Get duplicates stats
if (file.exists(file.path(path_logs, paste0(name, ".rmdup.log")))) {
dup.tab <- read.delim(file.path(path_logs, paste0(name, ".rmdup.log")),
stringsAsFactors=FALSE, header=FALSE)
duplicates <- as.numeric(strsplit(dup.tab[grep("DUPLICATE TOTAL", dup.tab[,1]),1], ": ")[[1]][2])
} else {
duplicates <- NA
}
# Get number of peaks called
files <- list.files(path_peaks,
pattern=peak_suffix,
full.names=T)
files <- files[grep(name, files)]
if (length(files)==1) {
peaks <- as.numeric(strsplit(system(paste("wc -l", files), intern = T), " ")[[1]][1])
} else {
peaks <- NA
}
# Get final number of reads
stats_final <-tableFromIdxstats(file.path(path_logs, paste0(name, ".idxstats.log")))
final_reads <- sum(stats_final$aligned)
if (paired) {
chrM <- chrM/2
duplicates <- duplicates/2
final_reads <- final_reads/2
}
# Create data.frame
df <- data.frame(sampleID=name,
total_reads=total_reads,
aligned_reads=total_aligned,
multi_reads=multi,
chrM_reads=chrM,
duplicate_reads=duplicates,
final_reads=final_reads,
alignment_rate=total_aligned/total_reads*100,
chrM_rate=chrM/total_reads*100,
duplicate_rate=duplicates/total_reads*100,
final_rate=final_reads/total_reads*100,
num_peaks=peaks)
}
.parseBowtie2LogSE <- function(logfile) {
log <- readLines(con = logfile)
total <- .obtainReadNumber(log[grep("were unpaired", log)])
unaligned <- .obtainReadNumber(log[grep("aligned 0 times", log)])
unique <- .obtainReadNumber(log[grep("aligned exactly 1 time", log)])
multi <- .obtainReadNumber(log[grep("aligned >1 times", log)])
stats <- data.frame(total, unaligned, unique, multi)
return(stats)
}
.parseBowtie2LogPE<- function(logfile) {
log <- readLines(con = logfile)
total <- .obtainReadNumber(log[grep("were paired", log)])*2
unaligned <- .obtainReadNumber(log[grep(") aligned concordantly 0 times", log)])
unique <- .obtainReadNumber(log[grep("aligned concordantly exactly 1 time", log)])
multi <- .obtainReadNumber(log[grep(" aligned concordantly >1 times", log)])
stats <- data.frame(total/2, unaligned, unique, multi)
return(stats)
}
.obtainReadNumber <- function(line) {
as.numeric(strsplit(gsub(" ", "", line), "(", fixed=TRUE)[[1]][1])
}
#' Table form idxstats
#'
#' Given a bam file, returns samtools idxstats output as a data.frame.
#' @param bam Character string with the name and path of a single BAM file. Index should
#' be in the same folder.
#' @return A data.frame with the output of samtools idxstats.
#' @export
#' @examples
#' \dontrun{
#' df <- tableFromIdxstats("path/to/file.bam")
#' }
tableFromIdxstats <- function(log) {
stats <- read.delim(log)
colnames(stats) <- c("chr", "chr_len", "aligned", "not_aligned")
stats$chr_len <- as.numeric(stats$chr_len)
stats$aligned <- as.numeric(stats$aligned)
stats$not_aligned <- as.numeric(stats$not_aligned)
return(stats)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.