R/misc.R

Defines functions bs1ham samdf_to_qiime2 seqtab_to_mothur seqtab_to_qiime is.list.of pfasta checkConvergence rc strdiff nwextract nwalign getNseq getAbund getSequences getUniques

Documented in getSequences getUniques nwalign rc

################################################################################
#' Get the uniques-vector from the input object.
#' 
#' This function extracts the \code{\link{uniques-vector}} from several different data objects, 
#'  including \code{\link{dada-class}} and \code{\link{derep-class}} objects, as well as 
#'  \code{data.frame} objects that have both $sequence and $abundance columns.
#'  The return value is an integer vector named by sequence and valued by abundance. If the input is
#'  already in \code{\link{uniques-vector}} format, that same vector will be returned.
#' 
#' @param object (Required). The object from which to extract the \code{\link{uniques-vector}}.
#' 
#' @param collapse (Optional). Default TRUE.
#'  Should duplicate sequences detected in \code{object} be collapsed together, thereby
#'   imposing uniqueness on non-unique input.
#'  
#' @param silence (Optional). Default FALSE.
#'  Suppress reporting of the detection and merger of duplicated input sequences.
#' 
#' @return \code{integer}.
#'  An integer vector named by unique sequence and valued by abundance.
#' 
#' @importFrom methods is
#' 
#' @export
#' 
#' @examples
#' derep1 = derepFastq(system.file("extdata", "sam1F.fastq.gz", package="dada2"))
#' dada1 <- dada(derep1, err=tperr1)
#' getUniques(derep1)[1:3]
#' getUniques(dada1)[1:3]
#' getUniques(dada1$clustering)[1:3]
#' 
getUniques <- function(object, collapse=TRUE, silence=FALSE) {
  if(is.character(object) && length(object)==1 && file.exists(object)) {
    unqs <- derepFastq(object)$uniques
  } else if(is.vector(object, "integer") && length(names(object)) != 0 && !any(is.na(names(object)))) { # Named integer vector already
    unqs <- object
  } else if(is(object, "dada")) {  # dada return 
    unqs <- object$denoised
  } else if(is(object, "derep")) {
    unqs <- object$uniques
  } else if(is.data.frame(object) && all(c("sequence", "abundance") %in% colnames(object))) {
    unqs <- as.integer(object$abundance)
    names(unqs) <- object$sequence
  } else if(is.matrix(object) && is.numeric(object) && !any(is.na(colnames(object)))) { # Tabled sequences
    unqs <- as.integer(colSums(object))
    names(unqs) <- colnames(object)
  }
  else {
    stop("Unrecognized format: Requires named integer vector, fastq filename, dada-class, derep-class, sequence matrix, or a data.frame with $sequence and $abundance columns.")
  }
  #### ENFORCE UNIQUENESS HERE!!!
  if(any(duplicated(names(unqs)))) {
    if(collapse) {
      unqs <- tapply(unqs, names(unqs), sum)
      if(!silence) message("Duplicate sequences detected and merged.")
    } else if(!silence) {
      message("Duplicate sequences detected.")
    }
  }
  return(unqs)
}

################################################################################
#' Get vector of sequences from input object.
#' 
#' This function extracts the sequences from several different data objects, including
#'  including \code{\link{dada-class}} and \code{\link{derep-class}} objects, as well as 
#'  \code{data.frame} objects that have both $sequence and $abundance columns. This function 
#'  wraps the \code{\link{getUniques}} function, but return only the names (i.e. the sequences).
#'  Can also be provided the file path to a fasta or fastq file, a taxonomy table, or a
#'  DNAStringSet object. Sequences are coerced to upper-case characters.
#' 
#' @param object (Required). The object from which to extract the sequences.
#' 
#' @param collapse (Optional). Default FALSE.
#'  Should duplicate sequences detected in \code{object} be collapsed together, thereby
#'  imposing uniqueness on non-unique input.
#'  
#' @param silence (Optional). Default TRUE.
#'  Suppress reporting of the detection and merger of duplicated input sequences.
#' 
#' @return \code{character}. A character vector of the sequences.
#' 
#' @importFrom methods is
#' @importFrom methods as
#' @importFrom ShortRead readFasta
#' @importFrom ShortRead readFastq
#' @importFrom ShortRead sread
#' @importFrom ShortRead id
#' 
#' @export
#' 
#' @examples
#' derep1 = derepFastq(system.file("extdata", "sam1F.fastq.gz", package="dada2"))
#' dada1 <- dada(derep1, err=tperr1)
#' getSequences(derep1)[1:5]
#' getSequences(dada1)[1:5]
#' getSequences(dada1$clustering)[1:5]
#' 
getSequences <- function(object, collapse=FALSE, silence=TRUE) {
  if(is(object, "character")) {
    if(length(object)==1 && file.exists(object)) {
      sr <- tryCatch(readFasta(object), error=function(err) { readFastq(object) })
      seqs <- toupper(as.character(sread(sr)))
      names(seqs) <- id(sr)
      rval <- seqs
    } else if(collapse) {
      if(any(duplicated(object)) && !silence) message("Duplicate sequences detected and merged.")
      rval <- unique(object)
    } else {
      rval <- object
    }
  } else if(is(object, "DNAStringSet")) {
    rval <- as.character(object)
  } else if(is.matrix(object) && is.character(object) && !any(is.na(rownames(object)))) { # Taxonomy table
    seqs <- rownames(object)
    if(any(duplicated(seqs))) {
      if(collapse) seqs <- unique(seqs)
      if(collapse && !silence) message("Duplicate sequences detected and merged.")
      if(!collapse && !silence) message("Duplicate sequences detected.")
    }
    rval <- seqs
  } else {
    rval <- names(getUniques(object, collapse=collapse, silence=silence))
  }
  return(toupper(rval))
}

getAbund <- function(object) {
  return(sum(getUniques(object)))
}

getNseq <- function(object) {
  return(length(getUniques(object)))
}

################################################################################
#' Needleman-Wunsch alignment.
#' 
#' This function performs a Needleman-Wunsch alignment between two sequences.
#' 
#' @param s1 (Required). \code{character(1)}. The first sequence to align. A/C/G/T only.
#' 
#' @param s2 (Required). \code{character(1)}. The second sequence to align. A/C/G/T only.
#' 
#' @param match (Optional). \code{numeric(1)}. Default is getDadaOpt("MATCH").
#'  The score of a match in the alignment.
#' 
#' @param mismatch (Optional). \code{numeric(1)}. Default is getDadaOpt("MISMATCH").
#'  The score of a mismatch in the alignment.
#' 
#' @param gap (Optional). \code{numeric(1)}. Default is getDadaOpt("GAP_PENALTY").
#'  The alignment gap penalty. Should be negative.
#'  
#' @param homo_gap (Optional). \code{numeric(1)}. Default NULL (no special homopolymer penalty).
#'  The alignment gap penalty within homopolymer regions. Should be negative.
#'  
#' @param band (Optional). \code{numeric(1)}.  Default -1 (no banding).
#'  The Needleman-Wunsch alignment can be banded. This value specifies the radius of that band.
#'  Set \code{band = -1} to turn off banding.
#'  
#' @param endsfree (Optional). \code{logical(1)}. Default TRUE.
#'  Allow unpenalized gaps at the ends of the alignment.
#'  
#' @param vec (Optional). \code{logical(1)}. Default FALSE.
#'  Use DADA2's vectorized aligner instead of standard DP matrix. Not intended for long sequences (>1kb).
#'  
#' @return \code{character(2)}. The aligned sequences.
#' 
#' @export
#' 
#' @examples
#'  sq1 <- "CTAATACATGCAAGTCGAGCGAGTCTGCCTTGAAGATCGGAGTGCTTGCACTCTGTGAAACAAGATA"
#'  sq2 <- "TTAACACATGCAAGTCGAACGGAAAGGCCAGTGCTTGCACTGGTACTCGAGTGGCGAACGGGTGAGT"
#'  nwalign(sq1, sq2)
#'  nwalign(sq1, sq2, band=16)
#' 
nwalign <- function(s1, s2, match=getDadaOpt("MATCH"), mismatch=getDadaOpt("MISMATCH"), gap=getDadaOpt("GAP_PENALTY"), homo_gap=NULL, band=-1, endsfree=TRUE, vec=FALSE) {
  if(!is.character(s1) || !is.character(s2)) stop("Can only align character sequences.")
  if(is.null(homo_gap)) { homo_gap <- gap }
  if(vec) {
    if(homo_gap != gap) stop("Homopolymer gap penalties are not implemented in the vectorized aligner.")
    return(C_nwvec(s1, s2, match, mismatch, gap, band, endsfree))
  } else {
    if(!C_isACGT(s1) || !C_isACGT(s2)) {
      stop("Sequences must contain only A/C/G/T characters.")
    }
    return(C_nwalign(s1, s2, match, mismatch, gap, homo_gap, band, endsfree))
  }
}

################################################################################
#' Hamming distance after Needlman-Wunsch alignment.
#' 
#' This function performs a Needleman-Wunsch alignment between two sequences, and then counts
#' the number of mismatches and indels in that alignment. End gaps are not included in this count.
#' 
#' @param s1 (Required). \code{character(1)}. The first sequence to align. A/C/G/T only.
#' 
#' @param s2 (Required). \code{character(1)}. The second sequence to align. A/C/G/T only.
#' 
#' @param ... (Optional). Further arguments to pass on to \code{\link{nwalign}}.
#' 
#' @return \code{integer(1)}. The total number of mismatches and gaps, excluding gaps at the beginning
#'  and end of the alignment.
#' 
#' @export
#' 
#' @examples
#'  sq1 <- "CTAATACATGCAAGTCGAGCGAGTCTGCCTTGAAGATCGGAGTGCTTGCACTCTGTGAAACAAGATA"
#'  sq2 <- "TTAACACATGCAAGTCGAACGGAAAGGCCAGTGCTTGCACTGGTACTCGAGTGGCGAACGGGTGAGT"
#' nwhamming(sq1, sq2)
#' nwhamming(sq1, sq2, band=16)
#' 
nwhamming <- Vectorize(function(s1, s2, ...) {
  al <- nwalign(s1, s2, ...)
  out <- C_eval_pair(al[1], al[2])
  return(unname(out["mismatch"]+out["indel"]))
}, USE.NAMES=FALSE)

nweval <- Vectorize(function(s1, s2, ...) {
  al <- nwalign(s1, s2, ...)
  C_eval_pair(al[1], al[2])
}, USE.NAMES=FALSE)

nwextract <- function(query, ref, ...) {
  al <- nwalign(query, ref, ...)
  ntq <- gregexpr("[ACGT]", al[[1]])
  rval <- substr(al[[2]], min(ntq[[1]]), max(ntq[[1]]))
  rval <- gsub("-", "", rval)
  rval
}

strdiff <- function(s1, s2) {
  xx = unlist(strsplit(s1,""))
  yy = unlist(strsplit(s2,""))
  dd <- which(xx != yy)
  data.frame(pos=dd,nt0=xx[dd],nt1=yy[dd])
}

################################################################################
#' Reverse complement DNA sequences.
#' 
#' This function reverse complements DNA sequence(s) provided.
#' This function is nothing more than a concisely-named convenience wrapper for 
#' \code{\link[Biostrings]{reverseComplement}} that handles the \code{character} vector
#' DNA sequences generated in the the dada2 package.
#' 
#' @param sq (Required). \code{character}. The DNA sequence(s) to reverse-complement.
#'  \code{\link[Biostrings]{DNAString}}, or \code{\link[Biostrings]{DNAStringSet}} formats
#'  are also accepted.
#'  
#' @return \code{character}. The reverse-complemented DNA sequence(s).
#' 
#' @seealso 
#'  \code{\link[Biostrings]{reverseComplement}}
#'  
#' @importFrom Biostrings DNAString
#' @importFrom Biostrings DNAStringSet
#' @importFrom Biostrings reverseComplement
#' @importFrom methods as
#' 
#' @export
#' 
#' @examples
#' R1492 <- "RGYTACCTTGTTACGACTT"
#' rc(R1492)
#' sqs <- getSequences(system.file("extdata", "example_seqs.fa", package="dada2"))
#' rc(sqs)
#' 
rc <- function(sq) {
  if(length(sq) < 1) {
    return(character(0))
  } else if(length(sq) == 1) {
    as(reverseComplement(DNAString(sq)), "character")
  } else {
    as(reverseComplement(DNAStringSet(sq)), "character")
  }
}

checkConvergence <- function(dadaO) {
  sapply(dadaO$err_in, function(x) sum(abs(dadaO$err_out-x)))
}

pfasta <- function(seqs, ids=seq(length(seqs))) {
  seqs <- getSequences(seqs, collapse=FALSE)
  cat(paste(">", ids, "\n", seqs, sep="", collapse="\n"))
}

#' @importFrom methods is
#' @keywords internal
is.list.of <- function(x, ctype) {
  if(!is.list(x)) return(FALSE)
  else return(all(sapply(x, is, ctype)))
}

#' @importFrom utils write.table
#' @keywords internal
seqtab_to_qiime <- function(st, fout) {
  st <- t(st) # QIIME has OTUs as rows
  col.names <- colnames(st)
  col.names[[1]] <- paste0("#OTU ID\t", col.names[[1]])
  write.table(st, fout, sep="\t",
              row.names=TRUE, col.names=col.names, quote=FALSE)
}

#' @importFrom utils write.table
#' @keywords internal
seqtab_to_mothur <- function(st, fout) {
  # mothur has OTUs as columns, and a couple required columns
  df.shared <- data.frame(label=rep("DADA2", nrow(st)), Group=rownames(st), numOtus=ncol(st))
  df.shared <- cbind(df.shared, st)
  write.table(df.shared, fout, row.names=FALSE, col.names=TRUE, quote=FALSE)
}

#' @importFrom utils write.table
#' @keywords internal
samdf_to_qiime2 <- function(df, fout) {
  col.names <- colnames(df)
  col.names[[1]] <- paste0("#SampleID\t", col.names[[1]])
  write.table(df, fout, sep="\t",
              row.names=TRUE, col.names=col.names, quote=FALSE)
}

#' @keywords internal
bs1ham <- function(dd, ham=1) {
  is.1ham <- which(dd$clustering$birth_ham %in% ham)
  dd$birth_subs[dd$birth_subs$clust %in% is.1ham,]
}

#' @keywords internal
getSRR <- Vectorize(function(run, outdir="sra", verbose=TRUE, ...) {
  if(!grepl("^SRR[0-9]{6,}$", run)) stop("Requires SRA Run accessions in format: SRR1234567")
  if(!dir.exists(outdir)) dir.create(outdir)
  loc <- paste0("ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/SRR/",
                substr(run, 1, 6), "/", run, "/")
  loc <- paste0(loc, run, ".sra")
  download.file(loc, file.path(outdir, paste0(run, ".sra")), ...)
  if(verbose) cat(run, "\n")
})

Try the dada2 package in your browser

Any scripts or data that you put into this service are public.

dada2 documentation built on Nov. 8, 2020, 6:48 p.m.