MButils: Utility functions for DNA metabarcoding

Documented in assignSpecies_mod

#' @title Return long-form species assignments with DADA2's assignSpecies
#'
#' @description This function is a modification of DADA2's \code{assignSpecies},
#'   which uses exact matching against a reference fasta to identify the
#'   genus-species binomial classification of the input sequences. Where the
#'   original function run with \code{allowMultiple = TRUE} returns a
#'   concatenated string of all exactly matched species, results are returned
#'   here as distinct rows, one per match.
#'
#' @usage assignSpecies_mod(seqs, refFasta, tryRC = FALSE, n = 2000)
#'
#' @param seqs A character vector of the sequences to be assigned
#' @param refFasta The path to the reference fasta file, or an R connection. Can
#'   be compressed. This reference fasta file should be formatted so that the ID
#'   lines correspond to the genus-species of the associated sequence:
#'
#'   >SeqID genus species ACGAATGTGAAGTAA......
#' @param tryRC (Optional). Default FALSE. If TRUE, the reverse-complement of
#'   each sequences will also be tested for exact matching to the reference
#'   sequences.
#' @param n (Optional). Default 2000. The number of sequences to perform
#'   assignment on at one time. This controls the peak memory requirement so
#'   that large numbers of sequences are supported.
#'
#' @import dada2
#'
#' @return A two-column data frame matching ASVs to their exact match in the
#'   reference, with multiple matches indicated by the presence of more than one
#'   row per ASV.
#'
#' @export
#'
#'

assignSpecies_mod <- function(seqs, refFasta, tryRC=FALSE, n=2000) {
  # Get character vector of sequences
  seqs <- getSequences(seqs)
  # Read in the reference fasta
  refsr <- ShortRead::readFasta(refFasta)
  species <- as(ShortRead::id(refsr), "character")
  # Crude format check
  if(!length(unlist(strsplit(species[[1]], "\\s"))) >= 3) {
    if(length(unlist(gregexpr(";", species[[1]]))) >= 3) {
      stop("Incorrect reference file format for assignSpecies (this looks like a file formatted for assignTaxonomy).")
    } else {
      stop("Incorrect reference file format for assignSpecies.")
    }
  }

  # Identify the exact hits
  hits <- vector("list", length(seqs))
  lens <- nchar(seqs)
  for(len in unique(lens)) { # Requires all same length sequences
    i.len <- which(lens==len) # Find indexes of ASVs equal to that length
    n.len <- length(i.len) # Number of ASVs equal to that length

    # This throttles the number of sequences that can be analyzed at any one time
    j.lo <- 1
    j.hi <- min(n,n.len)

    while(j.lo <= n.len) {
      i.loop <- i.len[j.lo:n.len] # Creates a list of indexes to loop over
      seqdict <- Biostrings::PDict(seqs[i.loop])
      vhit <- (Biostrings::vcountPDict(seqdict, ShortRead::sread(refsr))>0)
      if(tryRC) vhit <- vhit | (Biostrings::vcountPDict(seqdict, Biostrings::reverseComplement(ShortRead::sread(refsr)))>0)
      hits[i.loop] <- lapply(seq(nrow(vhit)), function(x) vhit[x,]) # A logical vector of which reference sequences were "hit"
      j.lo <- j.lo + n; j.hi <- min(j.hi+n, n.len)
      rm(seqdict)
      gc()
    }
  }

  # Get genus species return strings
  rval <- cbind(unlist(sapply(hits, mapHits_mod, refs=species)))
  colnames(rval) <- c("Species")
  rownames(rval) <- seqs
  # Reformat rval dataframe
  rval <-
    rval %>%
    as.data.frame() %>%
    rownames_to_column(var = 'asv') %>%
    tidyr::separate_rows(Species, sep = '\\/') # "Lengthen" data-- split /-separated column entries out to multiple rows

  gc()
  rval
}

# Helper function for assignSpecies_mod
mapHits_mod <- function(x, refs, sep="/") {
  hits <- refs[x]
  if(length(unique(hits))<=Inf) {
    rval <- do.call(paste, c(as.list(sort(unique(hits))), sep=sep))
  } else { rval <- NA_character_ }
  if(length(rval)==0) rval <- NA_character_
  rval
}