R/runDADA2.r

#' Runs a custom DADA2 workflow
#' @param primerless_reads_directory The directory containing paired-end reads with primers already removed
#' @param reference_taxonomy_database_path The path to the desired reference taxonomy database
#' @param overlap_mismatch_proportion The proportion of the paired-read overlap length allowed to be mismatches when merging (default 0.015)
#' @param taxonomy_confidence_threshold The minimum bootstrap confidence (0-100) required to assign taxonomy (default 0; passed to assignTaxonomy's minBoot)
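#' @examples
#' \dontrun{
#' # Hypothetical paths for illustration; point these at your own trimmed reads
#' # and a DADA2-formatted training database
#' run_dada2(
#'   primerless_reads_directory = "data/trimmed",
#'   reference_taxonomy_database_path = "db/silva_nr_v132_train_set.fa.gz",
#'   overlap_mismatch_proportion = 0.015,
#'   taxonomy_confidence_threshold = 50
#' )
#' }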
#' @import dada2
#' @export
run_dada2 <- function(primerless_reads_directory, reference_taxonomy_database_path,
                      overlap_mismatch_proportion = 0.015, taxonomy_confidence_threshold = 0) {
  path <- primerless_reads_directory
  db_path <- reference_taxonomy_database_path

  # list.files(path)  # Uncomment to inspect input files interactively

  # Extracting directory + filenames of primerless sequences
  cutFs <- sort(list.files(path, pattern = "_R1_001.fastq", full.names = TRUE))
  cutRs <- sort(list.files(path, pattern = "_R2_001.fastq", full.names = TRUE))

  # Extract sample names (strip the path and everything from "_L001" onward, e.g. "_L001_R1_001.fastq")
  get.sample.name <- function(fname) strsplit(basename(fname), "_L001")[[1]][1]
  sample.names <- unname(sapply(cutFs, get.sample.name))

  # Plot forward and reverse quality profiles (get an idea of how our reads look)
  #jpeg('cutFs_quality_profile.jpg') ; plotQualityProfile(cutFs[1:2]) ; dev.off()
  #jpeg('cutRs_quality_profile.jpg') ; plotQualityProfile(cutRs[1:2]) ; dev.off() # Quality of reverse reads drops off quickly

  # Create a vector of file paths + names to where the filtered sequences will be stored
  filtFs <- file.path(path, "filtered", basename(cutFs))
  filtRs <- file.path(path, "filtered", basename(cutRs))
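  # Checkpointing pattern used throughout: each intermediate object is written to
  #   an .rds file and immediately re-read, so a partial run can be resumed from disk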
  saveRDS(filtFs, "filtFs.rds") ; saveRDS(filtRs, "filtRs.rds")
  filtFs <- readRDS("filtFs.rds") ; filtRs <- readRDS("filtRs.rds")

  # Conduct quality filtering
  # maxN: Set maximum number of ambiguous bases to 0 (should already be 0 from primer removal step)
  # maxEE: vector containing the maximum expected errors for the forward and reverse read of the sample
  # rm.phix: check for PhiX sequences and remove if found
  # compress: store filtered sequences in compressed format to save space
  # truncQ: truncate the remainder of a sequence at the first occurrence of quality score 2 or lower
  # minLen: remove reads shorter than 100 base pairs
  # n: the number of reads to filter at a time, lowered from the default 1e5 to 1e4
  #   so it can run on a laptop
  out <- filterAndTrim(cutFs, filtFs, cutRs, filtRs, maxN = 0, maxEE = c(2, 5),
                       rm.phix = TRUE, compress = TRUE, multithread = TRUE,
                       truncQ = 2, minLen = 100, n = 1e4)
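  # 'out' is a matrix of per-file read counts before and after filtering
  #   (reads.in, reads.out); reused below when tracking reads through the pipeline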
  saveRDS(out, "out.rds")
  out <- readRDS("out.rds")

  # Construct error models (mostly defaults)
  # MAX_CONSIST: the number of rounds in self-consistency loop
  #   to reach error convergence, increased from 10 to 15
  errF <- learnErrors(filtFs, multithread = TRUE, MAX_CONSIST=15)
  errR <- learnErrors(filtRs, multithread = TRUE, MAX_CONSIST=15)
  saveRDS(errF, "errF.rds") ; saveRDS(errR, "errR.rds")
  errF <- readRDS("errF.rds") ; errR <- readRDS("errR.rds")

  # Plot error models
  # Estimated error rates (black line) should fit with observed error
  #   rates (points), and error rates should decrease with higher quality score
  #jpeg("error-f.jpg") ; plotErrors(errF, nominalQ = TRUE) ; dev.off()
  #jpeg("error-r.jpg") ; plotErrors(errR, nominalQ = TRUE) ; dev.off()

  # Dereplicate filtered sequences (all defaults)
  derepFs <- derepFastq(filtFs, verbose = TRUE)
  derepRs <- derepFastq(filtRs, verbose = TRUE)
  names(derepFs) <- sample.names ; names(derepRs) <- sample.names #change names to sample names
  saveRDS(derepFs, "derepFs.rds") ; saveRDS(derepRs, "derepRs.rds")
  derepFs <- readRDS("derepFs.rds") ; derepRs <- readRDS("derepRs.rds")
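  # Note: recent dada2 releases can take filtered file paths directly in dada(),
  #   but explicit dereplication keeps per-sample objects available for mergePairs() below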

  # Increase the band size for the banded alignment heuristic (dada2's default BAND_SIZE is 16)
  setDadaOpt(BAND_SIZE=32)
  # Almost all defaults
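  # dada() performs per-sample denoising (sample inference) using the fitted error models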
  dadaFs <- dada(derepFs, err = errF, multithread = TRUE)
  dadaRs <- dada(derepRs, err = errR, multithread = TRUE)
  saveRDS(dadaFs, "dadaFs.rds") ; saveRDS(dadaRs, "dadaRs.rds")
  dadaFs <- readRDS("dadaFs.rds") ; dadaRs <- readRDS("dadaRs.rds")

  # Initial merge to compute overlap lengths (set maxMismatch to infinity)
  mergers_w_rejects <- mergePairs(dadaFs, derepFs, dadaRs, derepRs, verbose=TRUE, maxMismatch = Inf)
  # Total overlap length per merged pair = matches + mismatches in the overlap region
  merge_mismatches <- lapply(mergers_w_rejects, "[[", "nmismatch") # Mismatch counts (column 6)
  merge_matches <- lapply(mergers_w_rejects, "[[", "nmatch") # Match counts (column 5)

  # Diagnostic only (not used downstream): mismatches as a percentage of matches per overlap
  percentage <- function(x, y) {x / y * 100}
  percent_mismatches <- mapply(percentage, merge_mismatches, merge_matches)

  total_overlap_lengths <- Map("+", merge_mismatches, merge_matches) # Compute overlap length
  mergers <- vector("list", length(mergers_w_rejects)) # Empty list to hold merged reads by sample
  names(mergers) <- names(mergers_w_rejects) # Name samples
  # Re-merge each sample, allowing mismatches in proportion to each pair's overlap length
  for (i in seq_along(mergers)) {
    mergers[[i]] <- mergePairs(dadaFs[[i]], derepFs[[i]], dadaRs[[i]], derepRs[[i]], verbose = TRUE,
                               maxMismatch = overlap_mismatch_proportion * total_overlap_lengths[[i]])
  }
  saveRDS(mergers, "mergers.rds")
  mergers <- readRDS("mergers.rds")

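  # Build the sample-by-ASV count matrix (rows = samples, columns = sequence variants)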
  seqtab <- makeSequenceTable(mergers)
  print(dim(seqtab))

  # Remove chimeras: sequences are flagged per-sample, then removed by consensus across samples
  seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus",
                                      multithread=TRUE, verbose=TRUE)

  saveRDS(seqtab.nochim, "seqtab_nochim.rds")
  seqtab.nochim <- readRDS("seqtab_nochim.rds")

  # Inspect the distribution of ASV sequence lengths
  print(table(nchar(getSequences(seqtab.nochim))))

  # Create a table showing how many sequences were retained after
  # each step of the pipeline
  getN <- function(x) sum(getUniques(x)) # Total reads represented in a dada/merger object
  track <- cbind(out, sapply(dadaFs, getN), sapply(dadaRs, getN),
                 sapply(mergers, getN), rowSums(seqtab.nochim))
  colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR", "merged",
                       "nonchim")
  rownames(track) <- sample.names
  print(track)
  write.table(track, "trackSequences.tsv", quote = FALSE, sep = "\t")

  # Taxonomy using assignTaxonomy (RDP algorithm) with pre-specified database
  # and confidence threshold from above
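  # Note: minBoot = 0 keeps every assignment regardless of bootstrap support;
  #   dada2's own default is minBoot = 50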
  assignTax <- assignTaxonomy(seqtab.nochim, db_path, minBoot = taxonomy_confidence_threshold,
                              multithread=TRUE, tryRC=TRUE, outputBootstraps=TRUE)

  # Create the ASV fasta file, then merge the count and taxonomy tables
  asv_seqs <- colnames(seqtab.nochim)
  asv_headers <- vector(mode = "character", length = ncol(seqtab.nochim))

  # Create headers for the ASV fasta file (>ASV_1, >ASV_2, ...)
  for (i in seq_len(ncol(seqtab.nochim))) {
    asv_headers[i] <- paste(">ASV", i, sep = "_")
  }

  # Make and write out a fasta of the final ASV sequences:
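  # c(rbind(headers, seqs)) interleaves the two vectors: >ASV_1, seq1, >ASV_2, seq2, ...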
  asv_fasta <- c(rbind(asv_headers, asv_seqs))
  write(asv_fasta, "ASVs.fa")

  # Count table: number of times each ASV is observed per sample
  asv_tab <- t(seqtab.nochim)
  row.names(asv_tab) <- sub(">", "", asv_headers)
  #write.table(asv_tab, "ASVs_counts.txt", sep="\t", quote=F)

  # Taxonomy tables: ASV IDs with their assigned taxonomy and bootstrap confidences
  asv_tax <- assignTax$tax
  asv_tax_boot <- assignTax$boot
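  # The column names below assume the reference database reports eight ranks
  #   (domain through species); adjust if your database provides fewer levels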
  colnames(asv_tax) <- c("domain", "kingdom", "phylum", "class", "order", "family", "genus", "species")
  colnames(asv_tax_boot) <- c("domain.conf", "kingdom.conf", "phylum.conf", "class.conf", "order.conf", "family.conf", "genus.conf", "species.conf")

  rownames(asv_tax) <- sub(">", "", asv_headers)
  rownames(asv_tax_boot) <- sub(">", "", asv_headers)

  # Merging taxonomy assignments and bootstrap values
  tax_merged <- cbind(asv_tax, asv_tax_boot)

  # Interleave each rank with its confidence column for viewing convenience
  tax_merged_rearranged <- data.frame(tax_merged[, c(1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16)])

  # Merging ASV counts and new taxonomy table
  final_table <- data.frame(cbind(asv_tab, tax_merged_rearranged))

  print(head(final_table))

  write.table(final_table, "asv_table.tsv", quote=F, sep="\t")
}