#!/usr/bin/env Rscript
# Title: Process bam files using txtools to create data.tables ########
# Author: Miguel Angel Garcia-Campos https://angelcampos.github.io/ ############
suppressPackageStartupMessages(require(optparse))
t0 <- Sys.time() # Start time
# Parsing Arguments ############################################################
option_list = list(
make_option(c("-i", "--BAMfile"), type = "character", default = NULL,
help="Input mapped/algined reads file in BAM format",
metavar="character"),
make_option(c("-p", "--pairedEnd"), type = "logical", default = TRUE,
help="Set to 'FALSE' for loading single-end reads alignments [default = %default]",
metavar="logical"),
make_option(c("-g", "--BEDfile_geneAnnot"), type = "character", default = NULL,
help="Gene annotation file in BED-12 or BED-6 format (no introns)",
metavar="character"),
make_option(c("-f", "--FASTAfile"), type = "character", default = NULL,
help="Genome file in fasta format. If left empty the reference sequence will not be added [default = %default]",
metavar="character"),
make_option(c("-d", "--DT_datatype"), type = "character", default = NULL,
help="Data.table type to output: 'cov' = coverage, 'covNuc' = Coverage & Nucleotide Frequency",
metavar="character"),
make_option(c("-o", "--outName"), type = "character", default = "auto",
help="Output name, expected to end in '.rds'. By default the BAM file name will be used changing '.bam' for '.txDT.rds'",
metavar="character"),
make_option(c("-r", "--removeLongerThan"), type = "integer", default = NA,
help="Maximum length of reads to output [default = %default]. By default will not remove reads.",
metavar="integer"),
make_option(c("-m", "--minReadsGene"), type = "integer", default = 50,
help="Minimum number of reads to report a gene in annotation [default = %default]",
metavar="integer"),
make_option(c("-R", "--rescueSingletons"), type = "logical", default = FALSE,
help="Separate mapped singletons and process separately to rescue those mapped reads (only for pairedEnd == TRUE). [default = %default]",
metavar="logical"),
make_option(c("-s", "--strandMode"), type = "integer", default = 1,
help= paste0("Strand mode for loading BAM file. [default = %default]",
"\n 1 = Direction is given by read 1 e.g. Directional Illumina",
"\n 2 = Direction is given by read 2 (or just inverted in single-end files), e.g. Illumina TruSeq PE"),
metavar="integer"),
make_option(c("-S", "--ignoreStrand"), type = "logical", default = FALSE,
help="Ignore strand of mapping, use for unstranded library preparations. [default = %default]",
metavar="logical"),
make_option(c("-n", "--nCores"), type = "integer", default = 2,
help="Number of cores to be used [default= %default]",
metavar="integer"),
make_option(c("-y", "--yieldSize"), type = "integer", default = 100000,
help="Number of BAM records to be processed at a time [default = %default]",
metavar="integer"),
make_option(c("-v", "--verbose"), type = "logical", default = TRUE,
help="Set to 'FALSE' for showing task progress information [default = %default]",
metavar="logical")
)
opt_parser <- OptionParser(option_list=option_list);
opt <- parse_args(opt_parser)
BAMfile <- opt$BAMfile
paired <- opt$pairedEnd
geneAnnot <- opt$BEDfile_geneAnnot
genome <- opt$FASTAfile
dtType <- opt$DT_datatype
remL <- opt$removeLongerThan
minR <- opt$minReadsGene
strM <- opt$strandMode
nCores <- opt$nCores
ySize <- opt$yieldSize
verb <- opt$verbose
rescueSingle <- opt$rescueSingletons
ignStrand <- opt$ignoreStrand
outName <- opt$outName
# Checking/loading packages ############################################################
pkgs <- c("BiocManager", "magrittr", "parallel", "Rsamtools", "GenomicRanges", "txtools")
insPkgs <- pkgs[!pkgs %in% as.character(installed.packages()[,1])]
if(length(insPkgs) > 0){
stop("The following packages are missing: ", paste0("'", insPkgs, "'", collapse = " "), ".\n")
}else{
library("BiocManager")
library("magrittr")
library("parallel")
library("Rsamtools")
library("GenomicRanges")
library("txtools")
}
txtools_minVersion <- "0.1.2"
if(packageVersion("txtools") < txtools_minVersion){
stop("txtools needs to be version ", txtools_minVersion, " or higher\n",
"Please update it following instructions at: https://github.com/AngelCampos/txtools\n")
}
# Process arguments ############################################################
# Load sequence or not
if(is.null(dtType)){
stop("-d --DT_datatype argument cannot be left empty\n")
}else if(dtType == "cov"){
lSeq <- FALSE
}else if(dtType == "covNuc"){
lSeq <- TRUE
}else{
stop("-d --DT_datatype argument must be one of the options: 'cov' or 'covNuc'\n")
}
# Check yieldSize to be integer
txtools:::check_integer_arg(ySize, "ySize")
# Set OUTPUT filename
if(outName == "auto"){
outName <- basename(BAMfile) %>%
gsub(x = ., pattern = ".bam$", replacement = ".txDT.rds", perl = T)
}
# Output all genes even with no reads overlapping
if(minR == 0){makeFULL <- TRUE}else{makeFULL <- FALSE}
if(minR == 0){minR <- 1}
# MAIN program #################################################################
# Load gene annotation
gA <- tx_load_bed(geneAnnot)
if(verb){
cat("Gene annotation loaded with", length(gA), "gene models.")
}
# Load genome or NULL
if(!is.null(genome)){
GENOME <- tx_load_genome(genome)
}else{
GENOME <- NULL
}
# Load BAM file
if(!rescueSingle){
bam <- tx_load_bam(file = BAMfile,
yieldSize = ySize,
scanFlag = "default",
loadSeq = lSeq,
verbose = verb,
strandMode = strM,
pairedEnd = paired)
t1 <- Sys.time() # Loading BAM file time
bamLen <- length(bam)
}else if(rescueSingle){
if(!paired){stop("--pairedEnd argument is expected to be TRUE if --rescueSingletons is set to TRUE\n")}
filtParam_1 <- ScanBamParam(flag = scanBamFlag(isUnmappedQuery = FALSE, hasUnmappedMate = FALSE)) # Fully paired
filtParam_2 <- ScanBamParam(flag = scanBamFlag(isUnmappedQuery = FALSE, hasUnmappedMate = TRUE)) # Rescuable
tmpFile1 <- tempfile(); tmpFile2 <- tempfile()
invisible(filterBam(file = BAMfile, destination = tmpFile1, param = filtParam_1))
invisible(filterBam(file = BAMfile, destination = tmpFile2, param = filtParam_2))
tmpBAM_1 <- tx_load_bam(file = tmpFile1,
pairedEnd = TRUE,
loadSeq = lSeq,
verbose = verb,
yieldSize = ySize,
scanFlag = "default",
strandMode = strM)
if(verb){cat("\nLoading singletons bam file\n")}
tmpBAM_2 <- tx_load_bam(file = tmpFile2,
pairedEnd = FALSE,
loadSeq = lSeq,
verbose = verb,
yieldSize = ySize,
scanFlag = "default",
strandMode = strM)
t1 <- Sys.time() # Loading BAM files
invisible(file.remove(c(tmpFile1, tmpFile2)))
bamLen <- length(tmpBAM_1) + length(tmpBAM_2)
}
# txtools:::hlpr_ReadsInGene
# Convert to transcriptomic
if(!rescueSingle){
txReads <- tx_reads(reads = bam,
geneAnnot = gA,
nCores = nCores,
minReads = minR,
withSeq = lSeq,
verbose = verb,
ignore.strand = ignStrand)
}else if(rescueSingle){
tmp_txReads_1 <- tx_reads(reads = tmpBAM_1,
minReads = minR,
geneAnnot = gA,
nCores = nCores,
withSeq = lSeq,
verbose = verb,
ignore.strand = ignStrand)
tmp_txReads_2 <- tx_reads(reads = tmpBAM_2,
geneAnnot = gA,
nCores = nCores,
withSeq = lSeq,
verbose = verb,
ignore.strand = ignStrand)
# Merge lists gene-wise
allGenes <- union(names(tmp_txReads_1), names(tmp_txReads_2))
txReads <- parallel::mclapply(mc.cores = nCores, allGenes, function(gene){
tmp1 <- tmp_txReads_1[[gene]]
tmp2 <- tmp_txReads_2[[gene]]
if(is.null(tmp1)){
seqlevels(tmp2) <- allGenes
return(tmp2)
}else if(is.null(tmp2)){
seqlevels(tmp1) <- allGenes
return(tmp1)
}else{
seqlevels(tmp1) <- allGenes; seqlevels(tmp2) <- allGenes
return(c(tmp1, tmp2))
}
}) %>% GenomicRanges::GRangesList() %>%
magrittr::set_names(allGenes) %>%
GenomicRanges::GRangesList()
}
# Filter by length
if(!is.na(remL)){
txReads <- tx_filter_maxWidth(x = txReads, thr = remL, nCores = nCores)
}
# Data.table generation
if(dtType == "cov"){
if(verb){
cat("Generating coverage data.table")
}
OUT <- tx_makeDT_coverage(x = txReads,
geneAnnot = gA,
nCores = nCores,
fullDT = makeFULL,
genome = GENOME)
}else if(dtType == "covNuc"){
if(verb){
cat("Generating coverage and nucleotide frequency data.table. \n")
}
OUT <- tx_makeDT_covNucFreq(x = txReads,
geneAnnot = gA,
nCores = nCores,
fullDT = makeFULL,
genome = GENOME)
}else{print("This message should not be printed, let the maintainer know.")}
t2 <- Sys.time() # Creating data.table time
# NOTE: In practice using too many cores made for longer processing times
# newNCores <- min(nCores, 10)
# Saving file as .rds
saveRDS(object = OUT, file = outName)
# Report #######################################################################
timeBam <- t1 - t0 # Time loading BAM
timePrc <- t2 - t1 # Time processing
timeTot <- t2 - t0 # Total time
reportName <- basename(outName) %>% gsub(x = ., pattern = ".rds$", replacement = ".log", perl = T)
readsInOut <- parallel::mclapply(mc.cores = nCores, txReads, names) %>% unlist()
uniqReadsInOut <- unique(readsInOut)
report <- c("BAM file name:", BAMfile,
"Reads in BAM file:", bamLen,
"Output contains:", " ",
" Number of genes:", length(unique(OUT$gene)),
" Number of reads in output:", length(readsInOut),
" Number of unique reads in output:", length(uniqReadsInOut),
" Fraction of total reads in output:", round(length(uniqReadsInOut)/bamLen, 4),
"Total time taken:", paste(round(timeTot, 2), units(timeTot), sep = " "),
" Loading BAM time:", paste(round(timeBam, 2), units(timeBam), sep = " "),
" Processing time:", paste(round(timePrc, 2), units(timePrc), sep = " ")) %>%
matrix(ncol = 2, byrow =T)
write.table(x = report,
file = reportName,
sep = "\t",
quote = FALSE,
row.names = FALSE,
col.names = FALSE)
if(verb){print("bam2txDT.R finished!")}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.