#' get_contig_coverage(opt = NULL, markduplicates = FALSE, align_as_unpaired_reads = TRUE)
#'
#' JAMSalpha function
#' @export
get_contig_coverage <- function(opt = NULL, markduplicates = FALSE, align_as_unpaired_reads = TRUE){
setwd(opt$workdir)
flog.info("Using kraken2 with JAMStaxtable")
JAMStaxtablefile <- file.path(opt$workingkrakendb, "JAMStaxtable.rda")
if (file.exists(JAMStaxtablefile)){
load(JAMStaxtablefile)
} else {
#Fall back on generic taxonomy table and warn user
flog.info("JAMS taxonomy table not found. Falling back on generic JAMS taxtable.")
data(JAMStaxtable)
}
JAMStaxtable[] <- lapply(JAMStaxtable, as.character)
validtaxlvls <- colnames(JAMStaxtable)[(!(colnames(JAMStaxtable) %in% c("Taxid")))]
#Only get coverage from reads if contigs were actually assembled from reads.
if("readsdir" %in% names(opt)){
#Check if reads are in tmpdir
if(opt$workdir != opt$sampledir){
readsworkdir <- file.path(opt$workdir, "reads")
} else {
readsworkdir <- opt$readsdir
}
setwd(readsworkdir)
#Choose input reads, either NAHS or if not present, trimmed reads.
if ((opt$host == "none") || (opt$analysis %in% c("isolate", "isolaternaseq"))){
inputreads <- opt$trimreads
} else {
inputreads <- opt$nahsreads
}
#Adjust input reads to absolute path
inputreads <- file.path(readsworkdir, inputreads)
#Check the files exist or die.
if (!(all(file.exists(inputreads)))){
flog.info("Could not find input reads which were used for assembly. Aborting now.")
q()
}
flog.info("Creating directory to calculate read coverage.")
opt$covdir <- file.path(opt$workdir, paste(opt$prefix, "coverage", sep="_"))
dir.create(opt$covdir, recursive = TRUE)
setwd(opt$covdir)
#Write a copy of contigs to be analyzed to the system
flog.info("Writing contigs for read alignment.")
write.fasta(sequences = opt$NHcontigs_sequence, names = names(opt$NHcontigs_sequence), nbchar = 80, file.out = "contigscov.fa")
#Copy annotation data
file.copy(opt$bedfile, opt$covdir)
#Set general options
bowtiethreads <- (opt$threads - 2) #Two CPUs must be free on Biowulf
#build index
flog.info("Building index.")
system2('bowtie2-build', args = c("-f", "contigscov.fa", "contigscov"), stdout = TRUE, stderr = TRUE)
#Build command from options present
bowtiecommonargs <- c("-q", "--no-unal", "--threads", bowtiethreads, "-x", "contigscov")
if (opt$seqtype == "illuminamp"){
bowtiecommonargs <- c(bowtiecommonargs, "--rf", "--maxins", "10000")
}
bowtiereadargs <- list("-U", c("-1", "-2"), c("-1", "-2", "-U"))
if (align_as_unpaired_reads){
flog.info("Aligning reads back to contigs as unpaired reads.")
inputreads <- paste0(inputreads, collapse = ",")
}
bowtiereadinput <- as.vector(rbind(bowtiereadargs[[length(inputreads)]], inputreads))
bowtiereadoutput <- c("--un", (paste(opt$prefix, "NAss_SE.fastq", sep="_")), "--un-conc", (paste(opt$prefix, "NAss_PE.fastq", sep="_")))[1:(2 * min(length(inputreads), 2))]
samfile <- paste(paste(opt$prefix, "contigs", sep = "_"), "sam", sep = ".")
bowtiesam <- c("-S", samfile)
bowtieargs <- c(bowtiecommonargs, bowtiereadinput, bowtiereadoutput, bowtiesam)
flog.info("Aligning reads back to contigs.")
opt$bowtie_cov_output <- system2('bowtie2', args = bowtieargs, stdout = TRUE, stderr = TRUE)
flog.info(paste("Alignment of reads to contigs is complete with", opt$bowtie_cov_output[length(opt$bowtie_cov_output)]))
NAssfastqs <- list.files(pattern = "*fastq")
if (opt$classifyunassembled == TRUE){
flog.info("Classifying taxonomy of non assembled reads.")
makefastacmd <- paste(file.path(opt$bindir, "fastq2fasta4kraken.sh"), paste(NAssfastqs, collapse = " "))
system(makefastacmd)
flog.info(paste("Counting k-mers of unclassified reads with kraken2 and using confidence score", as.character(opt$krakenconfidencescore),". Please be patient..."))
krakenargs <- c("--db", opt$workingkrakendb, "--threads", opt$threads, "--confidence", opt$krakenconfidencescore, "NAss.fasta")
kraken2taxid <- system2("kraken2", args=krakenargs, stdout = TRUE, stderr = FALSE)
#Split taxidlist into chunks for speedier de-duplication of repeated node taxids
chunksize = 5000
chunk2 <- function(x,n){ split(x, cut(seq_along(x), n, labels = FALSE)) }
chunkcoords <- chunk2(1:length(kraken2taxid), (length(kraken2taxid) / chunksize))
numchunks <- length(chunkcoords)
flog.info(paste("Spliting reads taxonomic table into", numchunks, "chunks of", chunksize, "reads each for parallel processing. Depending on the number of reads this may take a while."))
split_to_df <- function(kraken2taxid = NULL){
kraken2taxid <- strsplit(kraken2taxid, split = "[\t]", fixed = FALSE)
k2out <- plyr::ldply(kraken2taxid, rbind)
colnames(k2out) <- c("ClassFlag", "Sequence", "Taxid", "Length", "kmers")
k2out[] <- lapply(k2out, as.character)
k2out$Length <- as.numeric(k2out$Length)
return(k2out)
}
#Cap the number of CPUs to 32 because of memory issues
appropriatenumcores <- min(max(1, (opt$threads - 2)), 32)
k2outlist <- mclapply(1:numchunks, function (x) { split_to_df(kraken2taxid[chunkcoords[[x]]]) }, mc.cores = appropriatenumcores)
k2out <- plyr::ldply(k2outlist, rbind)
k2out <- as.data.frame(k2out)
NAssdata <- k2out[,c("Sequence", "Length", "Taxid")]
NAssdata <- suppressMessages(left_join(NAssdata, JAMStaxtable))
#Temporarily, fill in taxids which are NOT in the database with unclassifieds
unclassdf <- JAMStaxtable[which(JAMStaxtable$Taxid == 0), 2:ncol(JAMStaxtable)]
NAssdata[which(is.na(NAssdata$Domain == TRUE)), colnames(unclassdf)] <- unname(unclassdf)
oriNAsslength <- sum(NAssdata$Length)
hostlength <- sum(subset(NAssdata, Kingdom == "k__Metazoa")[]$Length)
#Filter out any vertebrate DNA if existant
NAssdata <- subset(NAssdata, Kingdom != "k__Metazoa")
#Report host reads were found, if applicable
if (hostlength > 0){
hostpct <- round((hostlength / oriNAsslength) * 100, 2)
flog.info(paste("Of the non-assebmled reads", hostpct, "% were classified as k__Metazoa (host) DNA and were discarded."))
}
#Eliminate redundancy and get the dose
NAssdose <- NAssdata %>% dplyr::group_by(Taxid) %>% dplyr::summarise(NumBases = sum(Length))
NAssdose <- left_join(NAssdose, JAMStaxtable)
NAssdose[which(is.na(NAssdose$Domain == TRUE)), colnames(unclassdf)] <- unname(unclassdf)
opt$NAssdose <- as.data.frame(NAssdose)
} else {
flog.info("Unassembled reads will not be classified and will be discarded.")
}#End conditional for classifying unassembled reads
#index
flog.info("Indexing and sorting alignment.")
system2('samtools', args = c("faidx", "contigscov.fa"), stdout = TRUE, stderr = TRUE)
#convert to bam & sort
bamfile <- paste(paste(opt$prefix, "contigs", sep="_"), "bam", sep=".")
sortedbamfile <- paste(paste(opt$prefix, "contigs", sep="_"), "sorted", "bam", sep=".")
sortedmarkdupbamfile <- paste(paste(opt$prefix, "contigs", sep="_"), "sorted", "markdup", "bam", sep=".")
system2('samtools', args = c("view", "--threads", opt$threads, "-q", "10", "-F", "4", "-bt", "contigscov.fa.fai", samfile, ">", bamfile), stdout = TRUE, stderr = TRUE)
system(paste("samtools", "sort", "--threads", opt$threads, bamfile, ">", sortedbamfile))
system(paste("samtools", "index", sortedbamfile))
#mark duplicates with picard
if (markduplicates == TRUE){
flog.info("Removing duplicate reads from alignment.")
sortedmarkdupbamfile <- paste(paste(opt$prefix, "contigs", sep = "_"), "sorted", "markdup", "bam", sep = ".")
sortedmarkdupsortedbamfile <- paste(paste(opt$prefix, "contigs", sep = "_"), "sorted", "markdup", "sorted", "bam", sep = ".")
sortedmarkdupsortedflagstatfile <- paste(paste(opt$prefix, "contigs", sep = "_"), "sorted", "markdup", "sorted", "flagstat", sep = ".")
sortedmarkdupsortedstatsfile <- paste(paste(opt$prefix, "contigs", sep = "_"), "sorted", "markdup", "sorted", "bam", "stats", sep = ".")
picardcmd <- paste0("picard MarkDuplicates INPUT=", sortedbamfile, " OUTPUT=", sortedmarkdupbamfile, " METRICS_FILE=", paste(sortedmarkdupbamfile, "metrics", sep="."), " AS=TRUE VALIDATION_STRINGENCY=LENIENT MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=1000 REMOVE_DUPLICATES=TRUE READ_NAME_REGEX=null")
system(picardcmd)
system(paste("samtools", "sort", "--threads", opt$threads, sortedmarkdupbamfile, ">", sortedmarkdupsortedbamfile))
system(paste("samtools", "index", sortedmarkdupsortedbamfile))
system(paste("samtools", "flagstat", sortedmarkdupsortedbamfile, ">", sortedmarkdupsortedflagstatfile))
system(paste("samtools", "stats", sortedmarkdupsortedbamfile, ">", sortedmarkdupsortedstatsfile))
coveragebamfile <- sortedmarkdupsortedbamfile
} else {
coveragebamfile <- sortedbamfile
}
covdflist <- list()
#Calculating contig base coverage
flog.info("Calculating base coverage depth in contigs.")
coverageargs <- c("-ibam", coveragebamfile, "-d", ">", "perbasecoverage.map")
system2('genomeCoverageBed', args = coverageargs, stdout = TRUE, stderr = TRUE)
covdflist$contigcoverage <- fread(file = "perbasecoverage.map", sep = "\t", header = FALSE, nThread = opt$threads)
colnames(covdflist$contigcoverage) <- c("Feature", "Position", "Depth")
covdflist$contigcoverage <- covdflist$contigcoverage[ , c("Feature", "Depth")]
#Calculating feature coverage
flog.info("Calculating coverage depth for features.")
sortedbed <- paste0(opt$prefix, ".sorted.markdup.sorted.bed")
bedcmd <- paste0("bamToBed -i ", coveragebamfile, " | sort --temporary-directory=", opt$workdir, " -k1,1 -k2,2n > ", sortedbed)
system(bedcmd)
bedfeatargs <- c("coverage", "-hist", "-sorted", "-b", sortedbed, "-a", opt$bedfile, ">", "featuredep.tsv")
system2('bedtools', args = bedfeatargs, stdout = TRUE, stderr = FALSE)
covdflist$featuredep <- fread(file = "featuredep.tsv", sep = "\t", stringsAsFactors = FALSE, fill = TRUE, nThread = opt$threads)
colnames(covdflist$featuredep) <- c("Contig", "Start", "End", "Feature", "MapQual", "Strand", "Annotby", "FeatType", "Spin", "Annot", "Depth", "PartNumBasesofFeature", "LengthofFeat", "PctofAatdep")
covdflist$featuredep <- subset(covdflist$featuredep, FeatType %in% c("CDS", "tRNA", "rRNA", "tmRNA"))
covdflist$featuredep$PartDepthofFeature <- (covdflist$featuredep$Depth * covdflist$featuredep$PartNumBasesofFeature)
covdflist$featuredep <- covdflist$featuredep[ , c("Feature", "PartDepthofFeature")]
colnames(covdflist$featuredep) <- c("Feature", "Depth")
#Aggregate in parallel
sum_up_bases <- function(df = NULL){
aggregate_df <- df[ , list(NumBases = sum(Depth)), by = "Feature"]
return(aggregate_df)
}
if (opt$threads > 3){
aggregate_df_list <- mclapply(names(covdflist), function(x) { sum_up_bases(df = covdflist[[x]])}, mc.cores = 4)
} else {
aggregate_df_list <- lapply(names(covdflist), function(x) { sum_up_bases(df = covdflist[[x]])})
}
names(aggregate_df_list) <- names(covdflist)
colnames(aggregate_df_list$contigcoverage) <- c("Contig", "NumBases")
opt$contigsdata <- left_join(opt$contigsdata, as.data.frame(aggregate_df_list$contigcoverage), by = "Contig")
opt$featuredata <- left_join(opt$featuredata, as.data.frame(aggregate_df_list$featuredep), by = "Feature")
opt$contigsdata$NumBases <- as.numeric(opt$contigsdata$NumBases)
opt$featuredata$NumBases <- as.numeric(opt$featuredata$NumBases)
#opt$contigperbasecoverage <- shrink_perbasecoverage(perbasecoverage = contigcoverage, percentage = 2)
#Consolidate LKT dose
if (opt$classifyunassembled == TRUE){
#Eliminate redundancy and get the dose
tmpcontigsdata <- opt$contigsdata[ , (!(colnames(opt$contigsdata) %in% c("Length", "GC")))]
colnames(tmpcontigsdata)[which(colnames(tmpcontigsdata) == "Contig")] <- "Sequence"
#Where is the dose coming from (contigs, reads or both)
contigsandNAssLKTs <- unique(tmpcontigsdata$LKT)[unique(tmpcontigsdata$LKT) %in% unique(opt$NAssdose$LKT)]
onlycontigsLKTs <- unique(tmpcontigsdata$LKT)[!(unique(tmpcontigsdata$LKT) %in% unique(opt$NAssdose$LKT))]
onlyNAssLKTs <- unique(opt$NAssdose$LKT)[!(unique(opt$NAssdose$LKT) %in% unique(tmpcontigsdata$LKT))]
fromCtgPct <- data.frame(LKT = unique(c(contigsandNAssLKTs, onlycontigsLKTs, onlyNAssLKTs)), PctFromCtg = rep(0, length(unique(c(contigsandNAssLKTs, onlycontigsLKTs, onlyNAssLKTs)))))
fromCtgPct$PctFromCtg[which(fromCtgPct$LKT %in% onlycontigsLKTs)] <- 100
for (t in 1:length(contigsandNAssLKTs)){
fromCtg <- sum(tmpcontigsdata$NumBases[which(tmpcontigsdata$LKT == contigsandNAssLKTs[t])])
fromNAss <- sum(opt$NAssdose$NumBases[which(opt$NAssdose$LKT == contigsandNAssLKTs[t])])
fromCtgPct$PctFromCtg[which(fromCtgPct$LKT == contigsandNAssLKTs[t])] <- round((fromCtg / (fromCtg + fromNAss)) * 100, 2)
}
tmpcontigsdata$Sequence <- NULL
tmpalldata <- rbind(tmpcontigsdata, opt$NAssdose)
} else {
flog.info("Unassembled read taxonomic relative abundance will not be taken into account.")
tmpalldata <- opt$contigsdata
fromCtgPct <- data.frame(LKT = unique(tmpalldata$LKT), PctFromCtg = rep(100.00, length(unique(tmpalldata$LKT))))
} #End conditional if unassembled reads relative abundance should be added to LKTdose
} else {
flog.info("Short reads were not used for contig assembly, so sequencing depth will be set at 1X for all contigs.")
#if there are no reads, assume depth of 1, ie just repeat length of contig as NumBases.
opt$contigsdata$NumBases <- opt$contigsdata$Length
opt$featuredata$NumBases <- opt$featuredata$LengthDNA
tmpalldata <- opt$contigsdata
fromCtgPct <- data.frame(LKT = unique(tmpalldata$LKT), PctFromCtg = rep(100.00, length(unique(tmpalldata$LKT))))
} #End conditional get doses from short read alignment
#Compute dose of Last Known Taxa, but maintain kraken1 backwards conmpatibility
flog.info("Computing dose of Last Known Taxa.")
totaldose <- aggregate(NumBases ~ LKT, FUN = sum, data = tmpalldata)
tmpalldata$NumBases <- NULL
tmpalldata$Sequence <- NULL
tmpalldata$Taxid <- NULL
tmpalldata$Contig <- NULL
tmpalldata$IS1 <- NULL #Cannot left join to something which is more discriminatory
#de-duplicate up to LKT. Amazingly, there are some alternative taxonomy lineages for the same species... Crikey.
tmpalldata <- tmpalldata[which(!(duplicated(tmpalldata$LKT))), ]
totaldose <- left_join(totaldose, tmpalldata, by = "LKT")
totaldose <- left_join(totaldose, fromCtgPct, by = "LKT")
totaldose$PctFromCtg[which(is.na(totaldose$PctFromCtg))] <- 0 #Just being sure
opt$LKTdose <- totaldose[ , c((validtaxlvls[!(validtaxlvls %in% "IS1")]), "NumBases", "PctFromCtg")]
opt$LKTdose <- opt$LKTdose[order(-opt$LKTdose$NumBases), ]
opt$LKTdose$CumSum <- cumsum(opt$LKTdose$NumBases)
opt$LKTdose$PctCumSum <- round(opt$LKTdose$CumSum / sum(opt$LKTdose$NumBases) * 100, 2)
#Data should be non-empty and non-redundant, but just make sure
opt$LKTdose[is.na(opt$LKTdose)] <- 0
#If there is assemblydata available, add that to LKTdose dataframe
if ("assemblystats" %in% names(opt)){
LKTassemblies <- subset(opt$assemblystats, TaxLevel == "LKT")
LKTassemblies$TaxLevel <- NULL
colnames(LKTassemblies)[which(colnames(LKTassemblies) == "Taxon")] <- "LKT"
#join this information onto LKTdose
opt$LKTdose <- left_join(opt$LKTdose, LKTassemblies, by = "LKT")
opt$LKTdose[is.na(opt$LKTdose)] <- 0
}
#Do MetaBAT if applicable
if (opt$bin_contigs_with_metabat){
setwd(opt$covdir)
flog.info("Binning contigs from assembly with metaBAT2")
system(paste("jgi_summarize_bam_contig_depths --outputDepth depth.txt", sortedbamfile))
system("metabat2 -i contigscov.fa -a depth.txt -o bins_dir/MAGbin")
flog.info("Integrating binning information into JAMS")
MAGbin_files <- list.files("bins_dir", pattern = "fa$")
get_contigs_in_bin <- function(bin_file = NULL){
bin_contigs <- NULL
if (file.exists(file.path("bins_dir", bin_file))){
bin_contigs <- system2("grep", args = c("\'^>\'", file.path("bins_dir", bin_file)), stdout = TRUE, stderr = FALSE)
bin_contigs <- gsub("^>", "", bin_contigs)
}
if (length(bin_contigs) < 1){
flog.warn(paste("MetaBAT2 bin file", bin_file, "not found, or empty."))
}
return(bin_contigs)
}
bin_contigs_list <- lapply(MAGbin_files, function(x) { get_contigs_in_bin(bin_file = x) } )
names(bin_contigs_list) <- MAGbin_files
opt$contigsdata$MetaBATbin <- 0
for (mbl in 1:length(bin_contigs_list)){
bin_number <- as.numeric(unlist(strsplit(names(bin_contigs_list)[mbl], split = "\\."))[2])
curr_contigs <- bin_contigs_list[[mbl]]
opt$contigsdata[which(opt$contigsdata$Contig %in% curr_contigs), "MetaBATbin"] <- bin_number
}
#Standardise number of charaters
opt$contigsdata$MetaBATbin <- formatC(opt$contigsdata$MetaBATbin, width = nchar(max(opt$contigsdata$MetaBATbin)), flag = "0")
opt$contigsdata$MetaBATbin[which(as.numeric(opt$contigsdata$MetaBATbin) == 0)] <- "none"
opt$contigsdata$MetaBATbin[which(opt$contigsdata$MetaBATbin != "none")] <- paste(opt$prefix, "MAGbin", opt$contigsdata$MetaBATbin[which(opt$contigsdata$MetaBATbin != "none")], sep = "_")
opt$MAGdose <- aggregate(NumBases ~ MetaBATbin, FUN = sum, data = opt$contigsdata)
#Get assemblystats for MAGs
if (length(unique(opt$contigsdata$MetaBATbin[which(opt$contigsdata$MetaBATbin != "none")])) > 0){
opt <- evaluate_metaBAT_bin(opt = opt)
}
}
return(opt)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.