#' Wrapper scripts for SAMtools functions
#'
#' @description \code{run_samsort} sorts alignment (SAM/BAM/CRAM) files either by position
#' or read name using \code{samtools sort}.
#'
#' @param file A vector of characters specifying the path to the bam files.
#' @param samtools The path to samtools (if not in executable path).
#' @param outformat String specifying output format: ('SAM'/'BAM'/'CRAM'). For
#' \code{run_samview} this should be SAM or 'BAM' only.
#' @param threads A positive integer specifying the number of sorting and
#' compression threads
#' @param memory String specifying maximum memory per thread; suffix K/M/G
#' recognized.
#' @param sortbyname Boolean. If TRUE, reads are sorted by name. If FALSE,
#' reads are sorted by chromosome/position.
#' @param suffix Suffix to add to the basename of the file e.g, _psort (for
#' position-sorted).
#' @param keep Boolean. If TRUE, the input file is kept. If FALSE, the input
#' file is deleted after a successful sort.
#'
#' @return \code{run_samsort} returns a sorted alignment file.
#'
#' @examples
#' \dontrun{
#' run_samsort(file = "HB1_sample.sam", samtools = "samtools",
#' outformat = "BAM", threads = (parallel::detectCores() - 1),
#' memory = "1G", sortbyname = FALSE, suffix = "", keep = TRUE)
#'
#' }
#'
#' @export
run_samsort <- function(file = NULL,
samtools = "samtools",
outformat = "BAM",
threads = 1,
memory = "768M",
sortbyname = FALSE,
suffix = "",
keep = TRUE) {
if (!(outformat %in% c("SAM","BAM","CRAM"))) {
stop("output format must be \"SAM\" \"BAM\" or \"CRAM\"")
}
if (tolower(outformat) == tools::file_ext(file) && suffix == "") {
stop("output and input files must have different names - would you like to
add a suffix?")
}
bname <- reduce_path(file)
outfile <- paste0(bname, suffix, ".", tolower(outformat))
temp <- paste0(outfile, "_tmp")
if (isTRUE(sortbyname)) {
sam.sort <- sprintf("%s sort -n -@ %s -m %s -T %s -O %s -o %s %s",
samtools, threads, memory, temp, outformat,
outfile, file)
} else {
sam.sort <- sprintf("%s sort -@ %s -m %s -T %s -O %s -o %s %s",
samtools, threads, memory, temp, outformat,
outfile, file)
}
for (i in seq_along(sam.sort)) {
give_note(
paste("Creating position-sorted BAM files using SAMtools with the following command:\n\n",
sam.sort[[i]], "\n")
)
run_cmd(sam.sort[[i]])
}
if (!isTRUE(keep) && file.exists(outfile)) {unlink(file)}
return(outfile)
}
#' @description \code{run_samindex} indexes sorted BAM files using
#' \code{samtools index}.
#' @param bamfile Vector of characters specifying the path to sorted BAM files.
#' @rdname run_samsort
#' @return \code{run_samindex} returns a BAI-format index file for sorted
#' BAM files.
#' @examples
#' \dontrun{
#' run_samindex(samtools = "samtools", bamfile = "HB1_sample.bam",
#' threads = (parallel::detectCores() - 1))
#' }
#'
#' @export
run_samindex <- function(samtools = "samtools",
bamfile = NULL,
threads = 1) {
sam.index <- sprintf("%s index -b -@ %s %s",
samtools, threads, bamfile)
for (i in seq_along(sam.index)) {
run_cmd(sam.index[[i]])
}
}
#' @description \code{run_samview} outputs all alignments matching the flag and
#' region filters specified in either SAM or BAM format using
#' \code{samtools view}.
#' @param regions Either path to a single BED file containing regions of
#' interest or regions defined in the format "RNAME:START[-END]".
#' @param chrom.sizes Path to chromosome size file. Required if converting to
#' SAM format and including header. If not available, then one will be
#' generated automatically from the existing header. This is a tab-delimited
#' text file where each line contains the reference name in the first column
#' and length in the second column.
#' @param include.flag Integer value. Include output of alignments with any
#' bits set in this value present in the FLAG field. See
#' \url{https:://broadinstitute.github.io}.
#' @param exclude.flag Integer value. Exclude output of alignments with any
#' bits set in this value present in the FLAG field. See
#' \url{https:://broadinstitute.github.io}.
#' @param minQual Skip alignments with MAPQ smaller than this value.
#' @param outname Name for output file.
#' @param include.header Boolean. If TRUE header is included. This option is
#' only necessary if outformat = "SAM" as header always included in "BAM".
#' @param count Boolean. If true, only count alignments instead of printing.
#' @param subsample Float value - if set then subsampling is performed. The
#' integer part is used to seed the random number generator and the part after
#' the decimal sets the fraction of reads to subsample.
#' @param keep.paired Boolean. If TRUE, keep paired reads only.
#' @param keep.proper.pair Boolean. If TRUE, keep concordantly paired reads
#' only.
#' @param remove.unmapped Boolean. If TRUE, remove unmapped reads.
#' @param remove.not.primary Boolean. If TRUE, remove reads mapped as secondary
#' alignments.
#' @param remove.duplicates Boolean. If TRUE, remove reads marked as optical or
#' PCR duplicates.
#' @param remove.supplementary.alignment Boolean. If TRUE, remove reads marked
#' as supplementary alignment.
#' @param remove.mitochondrial Character string. If set, this will remove reads
#' mapping to the mitochondrial genome. The string should match the reference
#' name for the mitochindrial genome in the alignment file. Examples include
#' "ChrM", "M" and "MT".
#' @rdname run_samsort
#' @return \code{run_samview} returns aligned reads according to the regions and
#' filters specified in BAM or SAM format.
#' @examples
#' \dontrun{
#' run_samview(samtools = "samtools", file = "HB1_sample.bam", regions = NULL,
#' chrom.sizes = NULL, include.flag = NULL, exclude.flag = NULL,
#' minQual = NULL, outformat = "BAM", outname = NULL, include.header = FALSE,
#' count = FALSE, threads = (parallel::detectCores() - 1), subsample = NULL,
#' keep.paired = TRUE, keep.proper.pair = TRUE, remove.unmapped = TRUE,
#' remove.not.primary = TRUE, remove.duplicates = TRUE,
#' remove.supplementary.alignment = TRUE, remove.mitochondrial = "ChrM")
#' }
#'
#' @export
run_samview <- function(samtools = "samtools",
file = NULL,
regions = NULL,
chrom.sizes = NULL,
include.flag = NULL,
exclude.flag = NULL,
minQual = NULL,
outformat = NULL,
outname = NULL,
include.header = FALSE,
count = FALSE,
threads = 1,
subsample = NULL,
keep.paired = TRUE,
keep.proper.pair = TRUE,
remove.unmapped = FALSE,
remove.not.primary = FALSE,
remove.duplicates = FALSE,
remove.supplementary.alignment = FALSE,
remove.mitochondrial = NULL) {
if (!(outformat %in% c("SAM","BAM","CRAM"))) {
stop("output format must be \"SAM\" \"BAM\" or \"CRAM\"")
}
if (isTRUE(include.header) && is.null(chrom.sizes) && outformat == "BAM") {
sam.idx <- sprintf("%s idxstats %s > chrom.sizes", samtools, file)
chrom.sizes <- "chrom.sizes"
on.exit(unlink("chrom.sizes"))
}
bname <- reduce_path(file)
if (is.null(outname)) outname <- paste0(bname, "_filtered.",
tolower(outformat))
if (!is.null(regions)) {
if (file.exists(regions) & length(regions) == 1) {
oneRegion = FALSE
rstring <- paste("-L", regions)
} else if (grepl(":", regions) & grepl("-", regions)) {
oneRegion = TRUE
rstring <- regions
} else {
give_error("The region argument must be a single bed file")
}
} else {
oneRegion <- FALSE
}
sam.view <- sprintf("%s view", samtools)
if (outformat == "BAM") sam.view <- sprintf("%s -b", sam.view)
if (isTRUE(include.header)) sam.view <- sprintf("%s -h -t %s",
sam.view, chrom.sizes)
if (isTRUE(count)) sam.view <- sprintf("%s -c", sam.view)
if (!is.null(minQual)) sam.view <- sprintf("%s -q %s", sam.view, minQual)
if (!is.null(include.flag)) sam.view <- sprintf("%s -f %s", sam.view,
include.flag)
if (!is.null(exclude.flag)) sam.view <- sprintf("%s -F %s", sam.view,
exclude.flag)
if (isTRUE(keep.paired)) sam.view <- sprintf("%s -f 2", sam.view)
if (isTRUE(keep.proper.pair)) sam.view <- sprintf("%s -f 3", sam.view)
if (isTRUE(remove.unmapped)) sam.view <- sprintf("%s -F 4", sam.view)
if (isTRUE(remove.not.primary)) sam.view <- sprintf("%s -F 256", sam.view)
if (isTRUE(remove.duplicates)) sam.view <- sprintf("%s -F 1024", sam.view)
if (isTRUE(remove.supplementary.alignment)) sam.view <- sprintf("%s -F 2056",
sam.view)
if (!is.null(subsample)) sam.view <- sprintf("%s -s %s", sam.view, subsample)
if (!is.null(regions) & !isTRUE(oneRegion)) sam.view <- sprintf("%s %s",
sam.view,
rstring)
sam.view <- sprintf("%s %s", sam.view, file)
if (!is.null(regions) & isTRUE(oneRegion)) sam.view <- sprintf("%s %s",
sam.view,
rstring)
sam.view <- sprintf("%s > %s", sam.view, outname)
if (!is.null(remove.mitochondrial)) {
sam.view <- sprintf("%s idxstats %s | cut -f 1 | grep -v %s | xargs %s",
samtools, file, remove.mitochondrial, sam.view)
}
if (isTRUE(count)) {
res <- vector("list", length(sam.view))
for (i in seq_along(sam.view)) {
res[[i]] <- run_cmd(cmd = sam.view[[i]], intern = TRUE)
res[[i]] <- as.numeric(unlist(res[[i]]))
}
return(res)
} else {
for (i in seq_along(sam.view)) {
give_note(
paste("Filtering aligned reads using samtools with the following command:\n\n",
sam.view[[i]])
)
run_cmd(cmd = sam.view[[i]])
}
return(outname)
}
}
#' @description \code{run_samflagstat} uses \code{samtools flagstat} to
#' calculate and print statistics from a BAM file. It provides counts for 13
#' categories of reads: total, secondary, supplementary, duplicates, mapped,
#' paired in sequencing, read1, read2, properly paired, with itself and its mate
#' mapped, singletons, mate mapped to a different chromosome, with mate mapped
#' to a different chromosome with MAPQ>5.
#' @rdname run_samsort
#' @return \code{run_samflagstat} returns a dataframe of alignment statistics.
#' @examples
#' \dontrun{
#' run_samflagstat(samtools = "samtools", threads = parallel::detectCores(),
#' bamfile = "HB1_sample.bam")
#' }
#'
#' @export
run_samflagstat <- function(bamfile = NULL,
samtools = "samtools",
threads = 1) {
sam.flag <- sprintf("%s flagstat -@ %s %s", samtools, threads, bamfile)
res <- vector("list", length(sam.flag))
for (i in seq_along(sam.flag)) {
res[[i]] <- run_cmd(sam.flag[[i]], intern = TRUE)
res[[i]] <- lapply(res[[i]], extract_numbers)
res[[i]] <- lapply(res[[i]], as.integer)
Flag <- c("Total alignments + unaligned", "Marked secondary",
"Marked supplementary", "Marked duplicate", "Mapped",
"Paired in sequencing", "Mate 1",
"Mate 2", "Aligned as proper pair",
"Both mates mapped", "Aligned as singleton",
"Mate mapped to a different chr",
"Mate mapped to a different chr and MAPQ>5")
res[[i]] <- data.frame(Flag = Flag,
QC.passed = sapply(res[[i]], function(x) x[1]),
QC.failed = sapply(res[[i]], function(x) x[2]))
total <- res[[i]][1, 2] + res[[i]][1, 3]
res[[i]] <- c(total, res[[i]][,2])
names(res[[i]]) <- c(Flag[1], "QC-passed", Flag[2:13])
}
if (length(res) == 1) {
res <- as.data.frame(res[[1]])
} else {
res <- as.data.frame(do.call(cbind, res))
}
names(res) <- bamfile
return(res)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.