#' Data processing - paired ends
#'
#' \code{data.proc.paired} is a first attempt to develop a function to process
#' (quality checking, error and chimeras filtering) paired end reads from an NGS
#' run after these have been deconvoluted.
#'
#' \code{data.proc.paired} locates the .fastq files (can be compressed) in the
#' directories indicated in \code{dir.in}. Because the locations of both the fwd
#' and rev reads need to be provided, it will fail if this argument is missing.
#'
#' This is the first implementation to deal with paired-end reads and assumes
#' that adapters, primers
#' and indexes have been already removed and that each file represents a sample.
#' Because of this, this function cannot be run in the wrapper \code{raw2data.proc}.
#'
#' Also, it is assumed that the file names follow the convention where "R1" and
#' "R2" identify forward and reverse reads (with the root of the file being identical).
#'
#' The \code{data.proc.paired} pipeline is as follows: fastq files are read in. A
#' filter is applied to each pair that truncates reads at the first instance of
#' a quality score
#' less than \code{truncQ}, removes reads that are of low quality (currently the
#' threshold is hard-coded and reads are discarded if the expected errors are
#' higher than 3 - from documentation in the R package \code{dada2}, the
#' expected errors are calculated from the nominal definition of the quality
#' score: EE = sum(10^(-Q/10)) - and removes reads that (after truncation) do not
#' match the target length. A quality report can be (optionally) generated with the
#' R package \code{ShortRead} to verify the quality of the reads retained
#' after this step. Reads are then dereplicated. Optionally, the dada (Callahan
#' et al 2015) algorithm is applied and bimeras are searched and removed with
#' default settings of the relative functions in the package \code{dada2}. The
#' sequences that were retained at completion of \code{data.proc.paired} are saved in
#' fasta files in the subfolder "Final_seqs" and a .csv with a summary of the
#' number of reads that have been retained in each step is also written. These
#' two outputs are also returned at the end of the function.
#'
#' The sequence data handling is done by using functionalities from the packages
#' \code{dada2} and \code{ShortRead}, so make sure to cite them (in addition to
#' \code{amplicR} of course!) if you report your results in a paper or report.
#'
#' @param dir.in A character vector of length 2 with the directories where the
#'   forward (first element) and reverse (second element) fastq files are located
#' @param dir.out The directory where to save the results. If NULL (default)
#' then \code{dir.out <- dir.in}
#' @param bp An integer indicating the expected length (base-pairs) of the
#' reads. If zero (default) no truncation is applied
#' @param truncQ Truncate reads at the first instance of a quality score less
#' than or equal to truncQ when conducting quality filtering. See
#' \code{\link[dada2]{fastqFilter}} for details
#' @param qrep Logical. Should the quality report be generated? (default
#' \code{FALSE})
#' @param dada Logical. Should the dada analysis be conducted? (default
#' \code{TRUE})
#' @param pool Logical. Should samples be pooled together prior to sample
#' inference? (default \code{FALSE}). See \code{\link[dada2]{dada}} for
#' details
#' @param plot.err Logical. Whether error rates obtained from \code{dada} should
#' be plotted
#' @param chim Logical. Should the bimera search and removal be performed?
#' (default \code{TRUE})
#' @param minOverlap When merging paired ends, the minimum number of bases that
#'   should overlap
#' @param maxMismatch When merging paired ends, the maximum number of allowed mismatches
#' @param trimOverhang When merging paired ends, whether the overhanging
#'   extreme of the sequence should be trimmed off
#' @param orderBy Character vector specifying how the returned sequence table
#' should be sorted. Default "abundance". See
#' \code{\link[dada2]{makeSequenceTable}} for details
#' @param paired Logical. Kept for consistency with related functions; this
#'   function always processes paired end reads (default \code{TRUE})
#' @param verbose Logical. Whether information on progress should be outputted
#' (default: TRUE)
#' @return Return a list with several elements:
#'
#' \itemize{ \item $luniseqsFinal: A list with unique sequences (names) that
#' were retained at completion of \code{data.proc} and their abundance
#' (values). \item $lsummary: A list where each element is a summary of the
#' number of reads that were retained in each step. This can be converted in a
#' \code{data.frame} by running the following
#'
#' \code{summary <- plyr::join_all(lsummary, by="Sample", type="left")} \item
#' $stable: The sequence table \item $seq_list: The sequences and matching
#' sequence IDs \item $call: The function call}
#'
#' Several files are also returned, these include: \itemize{ \item
#' Seq_table.csv Sequence table \item Seq_list.csv List of sequences and their
#' matching IDs \item data.proc.summary.csv A summary of the number of reads
#' that were retained in each step \item data.proc.rda R data file containing
#' the list returned by data.proc (see above) }
#'
#' @references Benjamin J Callahan, Paul J McMurdie, Michael J Rosen, Andrew W
#' Han, Amy J Johnson, Susan P Holmes (2015). DADA2: High resolution sample
#' inference from amplicon data.
#' @import ggplot2
#' @import data.table
#' @seealso \code{\link[dada2]{dada}}, \code{\link[dada2]{makeSequenceTable}}
#' @export
#' @examples
#' \dontrun{
#' # Select the directories where the paired fastq files are stored (the same
#' # example folder is used here for both forward and reverse reads)
#' example.data <- system.file("extdata", "HTJ", package="amplicR")
#' # Select a temporary directory where to store the outputs
#' out <- tempdir()
#'
#' HTJ.test <- data.proc.paired(c(example.data, example.data), out, bp=140)
#'
#' # To clean up the temp directory
#' unlink(out, recursive=TRUE)
#' }
data.proc.paired <- function(dir.in=NULL, dir.out=NULL, bp=0, truncQ=2, qrep=FALSE,
                             dada=TRUE, pool=FALSE, plot.err=FALSE, chim=TRUE,
                             minOverlap = 12,
                             maxMismatch = 0,
                             trimOverhang = FALSE,
                             orderBy="abundance", paired = TRUE, verbose=TRUE) {
  # NOTE: 'paired' is accepted for interface compatibility with related
  # functions but is not used: this function always processes paired end reads.
  #--------------------------------------------------------------------------#
  # Optional dependencies: dada2 performs filtering/denoising/merging,
  # ShortRead streams reads and (optionally) builds the quality report.
  #--------------------------------------------------------------------------#
  if (!requireNamespace("dada2", quietly = TRUE)) {
    stop("Package 'dada2' needed for this function to work. Please install it
       either manually or using the function amplicR::setup().",
         call. = FALSE)
  }
  if (!requireNamespace("ShortRead", quietly = TRUE)) {
    stop("Package 'ShortRead' needed for this function to work. Please install it
       either manually or using the function amplicR::setup().",
         call. = FALSE)
  }
  #--------------------------------------------------------------------------#
  # Helper functions
  #--------------------------------------------------------------------------#
  # Extract the unique sequences for the i-th sample with dada2::getUniques()
  # and return only those not flagged as bimeras.
  rm.chim <- function(i, dada_el, chim_el) {
    no_chim <- dada2::getUniques(dada_el[[i]])[!chim_el[[i]]]
    return(no_chim)
  }
  # Number of sequences flagged as bimeras in one sample.
  nChim <- function(chim_el) {
    nSeqs <- sum(chim_el)
    return(nSeqs)
  }
  # Count the reads in the first chunk (up to 50 records) of each fastq file.
  # Used to drop samples whose filtered file is empty, which would otherwise
  # make dada2::derepFastq() fail. (The original version preallocated using a
  # grep subset of the files but looped over all of them; it now simply counts
  # every file it is given.)
  checkNreads <- function(fastqFiles) {
    nReads <- integer(length(fastqFiles))
    for (i in seq_along(fastqFiles)) {
      fs <- ShortRead::FastqStreamer(fastqFiles[i], n = 50)
      nReads[i] <- length(ShortRead::yield(fs))
      close(fs)
    }
    return(nReads)
  }
  #--------------------------------------------------------------------------#
  call <- sys.call(1)
  #### Input checks ####
  if (length(dir.in) != 2)
    stop("It seems you are trying to process paired end reads but you didn't provide two paths for dir.in")
  if (is.null(dir.out)) {
    stop("dir.out must be provided when processing paired end reads")
  }
  filt_fold <- "Filtered_seqs"
  dir.create(file.path(dir.out, filt_fold), showWarnings=FALSE, recursive=TRUE)
  #### Locate the fastq files ####
  fns <- vector("list", length = length(dir.in))
  fastqs <- vector("list", length = length(dir.in))
  filtRs <- vector("list", length = length(dir.in))
  for (d in seq_along(dir.in)) {
    fns[[d]] <- list.files(path=dir.in[d])
    fastqs[[d]] <- fns[[d]][grepl(".fastq.{,3}$", fns[[d]])]
    if (length(fastqs[[d]]) == 0)
      stop(paste("There are no files in", dir.in[d],
                 "with either fastq or fastq.gz extension"))
  }
  if (length(fastqs[[1]]) == length(fastqs[[2]])) {
    n <- length(fastqs[[1]])
    message(paste("\nFound", n, "fastq files in each input dir.in folders:"))
    message(paste("\n", dir.in, collapse="\n"))
  } else {
    # warning() (not warnings(), which only re-prints past warnings) so the
    # user is actually notified of the mismatch.
    warning(paste("Different number of fastq files in the two folders:",
                  paste(dir.in, collapse=" and ")))
    warning("Retaining only the files with the same names in the two folders")
    its <- intersect(fastqs[[1]], fastqs[[2]])
    fastqs[[1]] <- its
    fastqs[[2]] <- its
    n <- length(fastqs[[1]])
  }
  # Build the paths of the filtered files; the appended "R1"/"R2" suffixes
  # identify forward and reverse reads in the shared output folder.
  for (d in seq_along(dir.in)) {
    filtRs[[d]] <- paste(dir.out,
                         filt_fold,
                         sapply(fastqs[[d]],
                                sub,
                                pattern="\\.fastq.{,3}$",
                                replacement=paste0("R", d, "_filt.fastq.gz")),
                         sep="/")
  }
  #### Filter ####
  # Sample names are derived from the fastq file names (not from every file in
  # the folder) so the summary table has one row per sample.
  sample_names <- sub("_Ind_primerRm\\.fastq.{,3}$", "", fastqs[[1]])
  fastqsPairs <- mapply(c, file.path(dir.in[1], fastqs[[1]]),
                        file.path(dir.in[2], fastqs[[2]]))
  filtRsPairs <- mapply(c, filtRs[[1]], filtRs[[2]])
  for (i in seq_len(n)) {
    suppressWarnings(dada2::fastqPairedFilter(fastqsPairs[, i], matchIDs = TRUE,
                                              filtRsPairs[, i],
                                              maxN=0,
                                              maxEE=3,  # hard-coded threshold (see docs)
                                              truncQ=truncQ,
                                              truncLen=bp,
                                              compress=TRUE,
                                              verbose=FALSE))
  }
  filtRs <- list.files(path=file.path(dir.out, filt_fold), full.names=TRUE)
  if (qrep == TRUE)
    utils::browseURL(ShortRead::report(ShortRead::qa(filtRs)))
  filteredReadsF <- filtRs[grep("R1", x = filtRs)]
  filteredReadsR <- filtRs[grep("R2", x = filtRs)]
  # Drop samples whose filtered files contain no reads
  filteredReadsF <- filteredReadsF[checkNreads(filteredReadsF) > 0]
  filteredReadsR <- filteredReadsR[checkNreads(filteredReadsR) > 0]
  sample_names_filR1 <- unique(sub("_Ind_primerRmR1_filt.fastq.gz", "",
                                   sapply(filteredReadsF, basename, USE.NAMES=FALSE)))
  sample_names_filR2 <- unique(sub("_Ind_primerRmR2_filt.fastq.gz", "",
                                   sapply(filteredReadsR, basename, USE.NAMES=FALSE)))
  if (!identical(sample_names_filR1, sample_names_filR2)) {
    # Keep only samples for which both directions survived the filter
    sample_names_fil <- intersect(sample_names_filR1, sample_names_filR2)
    filteredReadsF <- filteredReadsF[match(sample_names_fil, sample_names_filR1)]
    filteredReadsR <- filteredReadsR[match(sample_names_fil, sample_names_filR2)]
  } else {
    sample_names_fil <- sample_names_filR1
  }
  #### Dereplicate ####
  # Fwd
  derepReadsF <- lapply(filteredReadsF, dada2::derepFastq, verbose=FALSE)
  names(derepReadsF) <- sample_names_fil
  fnSeqsF <- unlist(lapply(derepReadsF, getnFiltered))
  if (length(fnSeqsF) == 0)
    stop(paste("There are no sequences that passed the filter in",
               file.path(dir.out, filt_fold)))
  unSeqsF <- unlist(lapply(derepReadsF, getnUniques))
  # Rev
  derepReadsR <- lapply(filteredReadsR, dada2::derepFastq, verbose=FALSE)
  names(derepReadsR) <- sample_names_fil
  fnSeqsR <- unlist(lapply(derepReadsR, getnFiltered))
  if (length(fnSeqsR) == 0)
    stop(paste("There are no sequences that passed the filter in",
               file.path(dir.out, filt_fold)))
  unSeqsR <- unlist(lapply(derepReadsR, getnUniques))
  #### Summary ####
  lsummary <- list()
  lsummary[[1]] <- merge(data.table(Sample=sample_names),
                         data.table(Sample=sample_names_fil,
                                    nFilteredF=fnSeqsF,
                                    nFilteredR=fnSeqsR),
                         all.x=TRUE,
                         by="Sample")
  derepReads <- list(derepReadsF, derepReadsR)
  lsummary[[2]] <- data.frame(Sample=sample_names_fil, nDerepF=unSeqsF, nDerepR=unSeqsR)
  el <- 2
  #### dada ####
  if (dada == TRUE) {
    dadaReads <- vector("list", length = 2)
    nDenoised <- vector("list", length = 2)
    for (d in 1:2) {
      dadaReads[[d]] <- dada2::dada(derepReads[[d]],
                                    err=dada2::inflateErr(dada2::tperr1, 3),
                                    errorEstimationFunction=dada2::loessErrfun,
                                    selfConsist=TRUE, pool=pool)
      if (plot.err == TRUE) {
        pdf(file = file.path(dir.out, paste0("R", d, "_Plot_ErrorRates.pdf")))
        # dada() returns a single dada object for one sample, a list otherwise
        if (length(derepReads[[d]]) > 1) {
          for (i in seq_along(dadaReads[[d]])) {
            p <- dada2::plotErrors(dadaReads[[d]][[i]], nominalQ=TRUE)
            print(p + ggtitle(names(dadaReads[[d]][i])))
          }
        } else {
          p <- dada2::plotErrors(dadaReads[[d]], nominalQ=TRUE)
          print(p + ggtitle(names(derepReads[[d]])))
        }
        dev.off()
      }
      if (length(derepReads[[d]]) > 1) {
        nDenoised[[d]] <- sapply(dadaReads[[d]], ndada)
      } else {
        nDenoised[[d]] <- length(dada2::getUniques(dadaReads[[d]]))
        names(nDenoised[[d]]) <- names(derepReads[[d]])
      }
    }
    el <- el + 1
    lsummary[[el]] <- data.frame(Sample=names(nDenoised[[1]]),
                                 nDenoisedF=nDenoised[[1]],
                                 nDenoisedR=nDenoised[[2]])
    # This message is only meaningful (and nDenoised only exists) when dada
    # was run, so it lives inside this block (it previously errored when
    # dada == FALSE and verbose == TRUE).
    if (verbose) {
      message("Number of sequences retained after sample inference with dada2
          (Callahan et al 2015):")
      message(paste(unlist(nDenoised), collapse=" "))
    }
    #### Merge ####
    # Pass the user-supplied merging parameters through (the first
    # implementation hard-coded minOverlap = 12, maxMismatch = 0 and
    # trimOverhang = FALSE, silently ignoring the function arguments).
    mergePairedEnds <- dada2::mergePairs(dadaF = dadaReads[[1]], derepF = derepReads[[1]],
                                         dadaR = dadaReads[[2]], derepR = derepReads[[2]],
                                         minOverlap = minOverlap,
                                         maxMismatch = maxMismatch,
                                         returnRejects = FALSE,
                                         propagateCol = character(0),
                                         justConcatenate = FALSE,
                                         trimOverhang = trimOverhang,
                                         verbose = verbose)
    # With a single sample mergePairs() returns a bare data.frame; normalise
    # to a named list so all downstream code is uniform (length() on a
    # data.frame counts columns, which broke the original single/multi checks).
    if (is.data.frame(mergePairedEnds)) {
      mergePairedEnds <- stats::setNames(list(mergePairedEnds), sample_names_fil[1])
    }
    el <- el + 1
    lsummary[[el]] <- data.frame(Sample=names(mergePairedEnds),
                                 nMerged=sapply(mergePairedEnds, nrow))
  }
  #### Bimera ####
  if (chim == TRUE) {
    if (dada == TRUE) {
      # Search for bimeras in the merged sequences
      bimReads <- lapply(mergePairedEnds, dada2::isBimeraDenovo, verbose=FALSE)
      chimSrc <- mergePairedEnds
    } else {
      warning("The data.proc.paired function is not fully developed to summarise results
        when dada == FALSE and results are reported only for the Forward reads")
      bimReads <- lapply(derepReadsF, dada2::isBimeraDenovo, verbose=FALSE)
      names(bimReads) <- names(derepReadsF)
      # When dada == FALSE there are no merged reads; remove bimeras from the
      # forward dereplicated reads (the original referenced mergePairedEnds
      # here, which only exists when dada == TRUE).
      chimSrc <- derepReadsF
    }
    if (verbose) {
      message("Proportion of bimeras found in each sample")
      message(paste(round(sapply(bimReads, mean), digits=2), collapse=" "))
    }
    nChimeras <- sapply(bimReads, nChim)
    if (verbose) {
      message("Number of bimeras found in each sample:")
      message(paste(nChimeras, collapse=" "))
    }
    el <- el + 1
    lsummary[[el]] <- data.frame(Sample=names(nChimeras), nChimeras)
    no_chim <- lapply(seq_along(chimSrc), rm.chim, chimSrc, bimReads)
    names(no_chim) <- names(bimReads)
  }
  #### Reporting ####
  if (chim == TRUE) {
    luniseqsFinal <- no_chim
  } else {
    if (dada == TRUE) {
      # No chimera removal: report the unique merged sequences per sample
      luniseqsFinal <- lapply(mergePairedEnds, dada2::getUniques)
    } else {
      warning("The data.proc.paired function is not fully developed to summarise results
        when dada == FALSE and results are reported only for the Forward reads")
      luniseqsFinal <- lapply(derepReadsF, dada2::getUniques)
    }
  }
  nSeq <- sapply(luniseqsFinal, length)
  zeros <- which(nSeq == 0)
  # length(zeros) > 0 (the original tested length(zeros > 0), which is TRUE
  # whenever zeros is non-NULL regardless of its contents)
  if (length(zeros) > 0) {
    warning(paste("No sequences passed the analysis for sample(s):",
                  names(luniseqsFinal)[zeros], "in", dir.out, "\n", sep = "\n"))
    luniseqsFinal <- luniseqsFinal[-zeros]
  }
  stable <- dada2::makeSequenceTable(luniseqsFinal, orderBy=orderBy)
  # Replace the (long) sequences with short IDs in the table and keep the
  # ID-to-sequence mapping in seq_list
  seqs <- colnames(stable)
  seq_names <- paste0("seq", seq_len(ncol(stable)))
  colnames(stable) <- seq_names
  seq_list <- data.frame(seq_names, sequence=seqs)
  write.csv(stable, file=file.path(dir.out, "Seq_table.csv"))
  write.csv(seq_list, file=file.path(dir.out, "Seq_list.csv"), row.names=FALSE)
  el <- el + 1
  lsummary[[el]] <- data.frame(Sample=names(nSeq), nSeq)
  summary <- plyr::join_all(lsummary, by="Sample", type="left")
  write.csv(summary, file=file.path(dir.out, "data.proc.summary.csv"),
            row.names=FALSE)
  fasta_fold <- "Final_seqs"
  fasta_dir <- file.path(dir.out, fasta_fold)
  table2fasta(stable, seq.list=seq_list, dir.out=fasta_dir, verbose)
  dproc <- list(luniseqsFinal=luniseqsFinal,
                lsummary=lsummary,
                stable=stable,
                seq_list=seq_list,
                call=call)
  save(dproc, file=file.path(dir.out, "data.proc.rda"))
  return(dproc)
}