equinizeR: Manage Equine Marker Gene Data

#' Remove primers using cutadapt as per the DADA2 ITS workflow
#' @param sequence_directory directory containing paired-end reads
#' @param cutadapt path of cutadapt on your machine
#' @export
cutadaptPrimers <- function(sequence_directory, cutadapt){
  path <- sequence_directory

  list.files(path)

  fnFs <- sort(list.files(path, pattern = "_R1_001.fastq", full.names = TRUE)) #Create character vector of locations + names of forward seqs
  fnRs <- sort(list.files(path, pattern = "_R2_001.fastq", full.names = TRUE)) #and same for reverse

  # NC1 and NC2 primers
  FWD <- "ACGTCTGGTTCAGGGTTGTT"
  REV <- "TTAGTTTCTTTTCCTCCGCT"

  # Function that returns all orientations (forward, reverse, their complements and reverse complements) of primers
  allOrients <- function(primer) {
    require(Biostrings)
    dna <- DNAString(primer)
    orients <- c(Forward = dna, Complement = complement(dna),
                 Reverse = reverse(dna), RevComp = reverseComplement(dna))
    return(sapply(orients, toString))
  }

  # Get all of our orientation
  FWD.orients <- allOrients(FWD)
  REV.orients <- allOrients(REV)

  # Specify location + name of reads filtered for ambiguous bases (Ns)
  fnFs.filtN <- file.path(path, "filtN", basename(fnFs))
  fnRs.filtN <- file.path(path, "filtN", basename(fnRs))

  # Filter out all sequences containing Ns
  filterAndTrim(fnFs, fnFs.filtN, fnRs, fnRs.filtN, maxN=0, compress = FALSE, multithread = TRUE)

  #Function to count the number of primer hits
  primerHits <- function(primer, fn) {
    nhits <- vcountPattern(primer, sread(readFastq(fn)), fixed = FALSE)
    return(sum(nhits > 0))
  }


  rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs.filtN[[1]]),
        FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs.filtN[[1]]),
        REV.ForwardReads = sapply(REV.orients, primerHits, fn = fnFs.filtN[[1]]),
        REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs.filtN[[1]]))
  #For some reason cutadapt is not detecting our reverse primer...


  system(paste(cutadapt, "--version")) #  ensure that cutadapt is working

  path.cut <- file.path(path, "cutadapt") # This will be the directory containing reads with primers removed
  if(!dir.exists(path.cut)) dir.create(path.cut) #  Make the directory
  fnFs.cut <- file.path(path.cut, basename(fnFs)) # Specify directory + names of reads with primers removed
  fnRs.cut <- file.path(path.cut, basename(fnRs))

  FWD.RC <- dada2:::rc(FWD)
  REV.RC <- dada2:::rc(REV)

  # Specify cutadapt parameters
  # g stands for front of the read,
  # a stands for the adaptor (primer) to remove
  # capitalized parameters correspond to the R2 read
  R1.flags <- paste("-g", FWD, "-a", REV.RC)
  R2.flags <- paste("-G", REV, "-A", FWD.RC)

  # Run cutadapt from r
  # n is the number of rounds to search and remove a primer,
  #   set to 5 in case primers appear more than once
  # o refers to output name
  # and p indicates that outputs consist of paired-end reads
  for (i in seq_along(fnFs)) {system(paste(cutadapt, R1.flags, R2.flags, "-n", 5,
                                           "-o", fnFs.cut[i], "-p", fnRs.cut[i],
                                           fnFs.filtN[i], fnRs.filtN[i]))}

  #Now count primer hits in our sequences, should all be 0
  rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs.cut[[1]]),
        FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs.cut[[1]]),
        REV.ForwardReads = sapply(REV.orients, primerHits, fn = fnFs.cut[[1]]),
        REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs.cut[[1]]))
}