#' Runs a custom DADA2 workflow
#' @param primerless_reads_directory The directory containing paired end reads with no primers
#' @param reference_taxonomy_database_path The path of the desired reference taxonomy database
#' @param overlap_mismatch_proportion A proportion for the number of allowed mismatches based on overlap length between paired reads
#' @param taxonomy_confidence_threshold A percentage for the confidence threshold (default 0) for taxonomic assignment
#' @export
run_dada2 <- function(primerless_reads_directory, reference_taxonomy_database_path,
overlap_mismatch_proportion = 0.015, taxonomy_confidence_threshold = 0) {
path <- primerless_reads_directory
db_path <- reference_taxonomy_database_path
list.files(path)
# Extracting directory + filenames of primerless sequences
cutFs <- sort(list.files(path, pattern = "_R1_001.fastq", full.names = TRUE))
cutRs <- sort(list.files(path, pattern = "_R2_001.fastq", full.names = TRUE))
# Extract sample names (remove path & end of sequence name, "L001_R1_001.fastq")
get.sample.name <- function(fname) strsplit(basename(fname), "_L001")[[1]][1]
sample.names <- unname(sapply(cutFs, get.sample.name))
# Plot forward and reverse quality profiles (get an idea of how our reads look)
#jpeg('cutFs_quality_profile.jpg') ; plotQualityProfile(cutFs[1:2]) ; dev.off()
#jpeg('cutRs_quality_profile.jpg') ; plotQualityProfile(cutRs[1:2]) ; dev.off() #Quality of reverse reads crashes quickly
# Create a vector of file paths + names to where the filtered sequences will be stored
filtFs <- file.path(path, "filtered", basename(cutFs))
filtRs <- file.path(path, "filtered", basename(cutRs))
saveRDS(filtFs, "filtFs.rds") ; saveRDS(filtRs, "filtRs.rds")
filtFs <- readRDS("filtFs.rds") ; filtRs <- readRDS("filtRs.rds")
# Conduct quality filtering
# maxN: Set maximum number of ambiguous bases to 0 (should already be 0 from primer removal step)
# maxEE: vector containing the maximum expected errors for the forward and reverse read of the sample
# rm.phix: check for PhiX sequences and remove if found
# compress: store filtered sequences in compressed format to save space
# truncQ: truncate the remainder of a sequence at the first occurrence of quality score 2 or lower
# minLen: reads should be at least 200 base pairs
# 1e4: the number of reads to filter at a time, lowered so it can run on a laptop
out <- filterAndTrim(cutFs, filtFs, cutRs, filtRs, maxN=0, maxEE=c(2,5),
rm.phix=T, compress=T, multithread=T, truncQ=2,minLen=100)
saveRDS(out, "out.rds")
out <- readRDS("out.rds")
# Construct error models (mostly defaults)
# MAX_CONSIST: the number of rounds in self-consistency loop
# to reach error convergence, increased from 10 to 15
errF <- learnErrors(filtFs, multithread = TRUE, MAX_CONSIST=15)
errR <- learnErrors(filtRs, multithread = TRUE, MAX_CONSIST=15)
saveRDS(errF, "errF.rds") ; saveRDS(errR, "errR.rds")
errF <- readRDS("errF.rds") ; errR <- readRDS("errR.rds")
# Plot error models
# Estimated error rates (black line) should fit with observed error
# rates (points), and error rates should decrease with higher quality score
#jpeg("error-f.jpg") ; plotErrors(errF, nominalQ = TRUE) ; dev.off()
#jpeg("error-r.jpg") ; plotErrors(errR, nominalQ = TRUE) ; dev.off()
# Dereplicate filtered sequences (all defaults)
derepFs <- derepFastq(filtFs, verbose = TRUE)
derepRs <- derepFastq(filtRs, verbose = TRUE)
names(derepFs) <- sample.names ; names(derepRs) <- sample.names #change names to sample names
saveRDS(derepFs, "derepFs.rds") ; saveRDS(derepRs, "derepRs.rds")
derepFs <- readRDS("derepFs.rds") ; derepRs <- readRDS("derepRs.rds")
# Increase band size for banded alignment heuristic
setDadaOpt(BAND_SIZE=32)
# Almost all defaults
dadaFs <- dada(derepFs, err = errF, multithread = TRUE)
dadaRs <- dada(derepRs, err = errR, multithread = TRUE)
saveRDS(dadaFs, "dadaFs.rds") ; saveRDS(dadaRs, "dadaRs.rds")
dadaFs <- readRDS("dadaFs.rds") ; dadaRs <- readRDS("dadaRs.rds")
# Initial merge to compute overlap lengths (set maxMismatch to infinity)
mergers_w_rejects <- mergePairs(dadaFs, derepFs, dadaRs, derepRs, verbose=TRUE, maxMismatch = Inf)
# Total overlap length is calculated as: mismatches in overlap region + matches in overlap region
merge_mismatches <- lapply(mergers_w_rejects, "[[", 6) # Extract count for all mismatches to compute total overlap length
merge_matches <- lapply(mergers_w_rejects, "[[", 5) # Extract count for all mismatches
# Finding a percentage cutoff
percentage <- function(x,y) {list(x/y*100)}
percent_mismatches <- mapply(percentage, merge_mismatches, merge_matches)
total_overlap_lengths <- Map("+", merge_mismatches, merge_matches) # Compute overlap length
mergers <- vector("list", length(mergers_w_rejects)) # Create empty vector to contain merged reads by sample
names(mergers) <- names(mergers_w_rejects) # Name samples
# Populate the empty vector with samples containing merged reads
for (i in 1:length(mergers)){
mergers[[i]] <- mergePairs(dadaFs[[i]], derepFs[[i]], dadaRs[[i]], derepRs[[i]], verbose=T,
maxMismatch= overlap_mismatch_proportion*total_overlap_lengths[[i]])}
saveRDS(mergers, "mergers.rds")
mergers <- readRDS("mergers.rds")
seqtab <- makeSequenceTable(mergers) ; dim(seqtab)
# Remove chimeras using consensus method
seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus",
multithread=TRUE, verbose=TRUE)
saveRDS(seqtab.nochim, "seqtab_nochim.rds")
seqtab.nochim <- readRDS("seqtab_nochim.rds")
# Create a table of ASV length distributions
table(nchar(getSequences(seqtab.nochim)))
# Create a table showing how many sequences were retained after
# each step of the pipeline
getN <- function(x) sum(getUniques(x))
track <- cbind(out, sapply(dadaFs, getN), sapply(dadaRs, getN), sapply(mergers,
getN), rowSums(seqtab.nochim))
colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR", "merged",
"nonchim")
rownames(track) <- sample.names
track
write.table(track, "trackSequences.tsv", quote=F, sep="\t")
# Taxonomy using assignTaxonomy (RDP algorithm) with pre-specified database
# and confidence threshold from above
assignTax <- assignTaxonomy(seqtab.nochim, db_path, minBoot = taxonomy_confidence_threshold,
multithread=TRUE, tryRC=TRUE, outputBootstraps=TRUE)
# Creating fasta file, + merging count and taxonomy tables
asv_seqs <- colnames(seqtab.nochim)
asv_headers <- vector(dim(seqtab.nochim)[2], mode="character")
# Loop to create headers for ASV fasta file (>ASV1, >ASV2, ...)
for (i in 1:dim(seqtab.nochim)[2])
{
asv_headers[i] <- paste(">ASV", i, sep="_")
}
# Making and writing out a fasta of our final ASV seqs:
asv_fasta <- c(rbind(asv_headers, asv_seqs))
write(asv_fasta, "ASVs.fa")
# Count table: number of times ASV is observed/sample
asv_tab <- t(seqtab.nochim)
row.names(asv_tab) <- sub(">", "", asv_headers)
#write.table(asv_tab, "ASVs_counts.txt", sep="\t", quote=F)@
# Taxonomy tables: ASV # and corresponding taxonomy
asv_tax <- assignTax$tax
asv_tax_boot <- assignTax$boot
colnames(asv_tax) <- c("domain", "kingdom", "phylum", "class", "order", "family", "genus", "species")
colnames(asv_tax_boot) <- c("domain.conf", "kingdom.conf", "phylum.conf", "class.conf", "order.conf", "family.conf", "genus.conf", "species.conf")
rownames(asv_tax) <- sub(">", "", asv_headers)
rownames(asv_tax_boot) <- sub(">", "", asv_headers)
# Merging taxonomy assignments and bootstrap values
tax_merged <- cbind(asv_tax, asv_tax_boot)
# Rearrange for viewing convenience
tax_merged_rearranged <-data.frame(tax_merged[,c(1,9,2,10,3,11,4,12,5,13,6,14,7,15,8,16)])
# Merging ASV counts and new taxonomy table
final_table <- data.frame(cbind(asv_tab, tax_merged_rearranged))
head(final_table)
write.table(final_table, "asv_table.tsv", quote=F, sep="\t")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.