amplicR: An R package to process amplicon data

Documented in getRow getSeqFromDerep make.alleles ndada putna putzero replaceSNP sel.match subsetDerep where2trim write.nexus

#' Replace null values with zero or x with x - 1
#' 
#' Replace null values with zero or x with x - 1
#' 
#' This function is used internally by amplicR
#' 
#' @param x the element to evaluate
#' @return An obj of same class and length as the input
#' @export
putzero <- function(x){
  if(is.null(x)) x <- 0 else x <- x - 1
  return(x)
}

#' Replace null values with NA
#' 
#' Replace null values with NA
#' 
#' This function is used internally by amplicR
#' 
#' @param x the element to evaluate
#' @return An obj of same class and length as the input
#' @export
putna <- function(x){
  if(is.null(x)) {
    x <- NA 
  } else {
    if(length(x) == 0) x <- NA
  }
  return(x)
}

#' Select position where a pattern has been found
#'
#' Takes a set of sequences and the output from
#' \code{Biostrings::vmatchPattern()} and identify the first positions after the
#' match with a pattern
#' 
#' This function is used internally by amplicR
#' 
#' @param mismatch The number of allowed mismatches
#' @param subjectSet The set of sequences that were used for the search
#' @param patt_hits The output from Biostrings::vmatchPattern()
#' @param patt The patter used for the search
#' @param type The type of match, either forward "F", or reverse "R"
#' @return A list with start positions
#' @export
where2trim <- function(mismatch, subjectSet, patt_hits, patt, type="F") {
  if(length(patt_hits) == 0) return(list())
  
  if(mismatch == 0) {
    if(type == "F") {
      starts <- Biostrings::endIndex(patt_hits)
      starts <- lapply(starts, "[", 1)
      starts <- lapply(starts, putna)
    } else {
      starts <- Biostrings::startIndex(patt_hits)
      starts <- lapply(starts, putna)
      starts <- lapply(starts, max)
    }
    
  } else {
    
    sel_hits <- lapply(1:length(subjectSet), sel.match, 
                       subjectSet=subjectSet, 
                       patt_hits=patt_hits, 
                       patt=patt,
                       type=type)
    
    if(type == "F") {
      starts <- lapply(sel_hits, BiocGenerics::end)
    } else {
      starts <- lapply(sel_hits, BiocGenerics::start)
    }
    
    starts <- lapply(starts, putna)
  }
  
  return(starts) 
}

#' Select appropriate hits when searching a patter
#' 
#' This function is used internally by amplicR
#' 
#' @param i The index to subset patt_hitts
#' @inheritParams where2trim
#' @export
sel.match <- function(i, subjectSet, patt_hits, patt, type="F") {
  if (!requireNamespace("ShortRead", quietly = TRUE)) {
    stop("Package 'ShortRead' needed for this function to work. Please install it 
         either manually or using the function amplicR::setup().",
         call. = FALSE)
  }
  if(length(patt_hits[[i]]) > 1) {
    if(type == "F") {
      patt <- Biostrings::DNAString(patt)
    } else {
      patt <- Biostrings::reverseComplement(Biostrings::DNAString(patt))
    }
    
    v <- IRanges::Views(subjectSet[[i]], patt_hits[[i]])
    hits <- suppressWarnings(as.character(v, use.names=FALSE, check.limits=FALSE))
    sr <- ShortRead::srdistance(Biostrings::DNAStringSet(hits), patt)
    nDiff2 <- unlist(sr)
    nDiff <- unlist(sr, use.names=FALSE)
    if(type == "F") {
      m <- patt_hits[[i]][which.min(nDiff)]
    } else {
      if(length(nDiff) == 0) {
        m <- patt_hits[[i]][integer()]
      } else {
        m <- patt_hits[[i]][max(which(nDiff == min(nDiff)))]
      }
    }
  } else {
    return(patt_hits[[i]])
  }
  
  return(m)
}


#' Return the number of filtered sequences from a derep obj
#' 
#' Return the number of filtered sequences from a derep obj
#' 
#' This function is used internally by amplicR
#' 
#' @param derep 
#' @return A vector
#' @export
getnFiltered <- function (derep) {
  nFiltered <- length(derep$map)
  return(nFiltered)
}

#' Return the number of unique sequences in a derep obj
#' 
#' Return the number of unique sequences in a derep obj
#' 
#' This function is used internally by amplicR
#' 
#' @param derep 
#' @return A vector
#' @export
getnUniques <- function (derep) {
  nUniques <- length(derep$uniques)
  return(nUniques)
}

#' Return the number of unique sequences in a dada obj
#' 
#' Return the number of unique sequences in a dada obj
#' 
#' This function is used internally by amplicR
#' 
#' @param dada_el the element of a dada object
#' @return A vector
#' @export
ndada <- function(dada_el) {
  nSeqs <- length(dada2::getUniques(dada_el))
  return(nSeqs)
}

#' Extracts rows from a matrix into a vector
#' 
#' The row rn becomes an element of a vector. This function is used internally by amplicR
#' @param m A matrix
#' @param rn The row number to extract
#' @return A vector of same type as the matrix
#' @export
#' 
getRow <- function(m, rn) {
  r <- as.integer(round(m[rn,], 0))
  qual <- Biostrings::quality(NumericQuality(r))
  qual <- IlluminaQuality(qual)
  return(qual)
}

#' Write a nexus file from a DNAStringSet object
#'
#' Takes a \code{character} vector or a \code{DNAStringSet} and write a nexus
#' file to disk.
#'
#' If \code{x} is a \code{character} vector, it will convert it to a
#' \code{DNAStringSet}. if \code{aln=FALSE}, \code{write.nexus} will align the
#' sequences first and then write the nexus file. If \code{aln=TRUE}, it will
#' assume that all sequences have the same length and are aligned.
#'
#' If \code{charset=TRUE}, \code{write.nexus} will append a character block at
#' the end of the file. This is sometimes used to partition the alignment when
#' it is imported in software like BEAST2
#'
#' @param x Either a \code{character} vector or a \code{DNAStringSet}
#' @param aln Logical. Whether x is an alignment
#' @param gapOpening Integer (negative). Penalty for opening a gap in the
#'   alignment
#' @param gapExtension Integer (negative). Penalty for extending a gap in the
#'   alignment
#' @param dir.out The path where to save the output
#' @param fn The output file name
#' @param charset Whether a charset block should be added at the end of the
#'   nexus file
#' @param locusIDs \code{character} vector with the names of the loci (if
#'   \code{charset=TRUE}) in the same order as they appear in the sequences
#' @param locusLength Integer vector with the length of each locus (if
#'   \code{charset=TRUE}) in the same order as they appear in the sequences
#'   @param verbose Whether print info on progress
#' @return Writes a nexus file to disk
#' @export
#' @examples seqs <- c("AAATTTT", "GAATTCT")
#'         names(seqs) <- c("seq1", "seq2")
#'         x <- Biostrings::DNAStringSet(seqs)
#'         locusNames <- c("locus1", "locus2")
#'         locusLen <- c(3, 4)
#'         tmpDir <- tempdir(check = TRUE)
#'         write.nexus(x, aln=TRUE, dir.out=tmpDir, fn="aln.nex", charset=TRUE, 
#'                 locusIDs=locusNames, locusLength=locusLen)

write.nexus <- function(x, aln=FALSE, gapOpening=c(-18, -16), gapExtension=c(-2, -1), 
                        dir.out, fn, charset=FALSE, locusIDs, locusLength, verbose=FALSE) {
  #----------------------------------------------------------------------------#
  if (!requireNamespace("DECIPHER", quietly = TRUE)) {
    stop("Package 'DECIPHER' needed for this function to work. Please install it 
         either manually or using the function amplicR::setup().",
         call. = FALSE)
  }  
  #----------------------------------------------------------------------------#
  
  whatsx <- class(x)
  if(whatsx == "character") {
    x <- Biostrings::DNAStringSet(x)
    names(x) <- paste0("seq", seq_along(x))
    }
  if(whatsx == "DNAStringSet" & aln == FALSE) {
    x <- DECIPHER::AlignSeqs(x, gapOpening=gapOpening, gapExtension=gapExtension) 
  } else {
    if(whatsx == "DNAStringSet" & aln == TRUE & verbose == TRUE) 
      message("'aln' is set to TRUE, assuming 'x' is an alignment")
  }
  
  nex <- c("#NEXUS", 
           "begin taxa;",
           paste0("dimensions ntax=", length(x), ";"),
           "taxlabels", make.names(names(x), unique = TRUE), ";",
           "End;",
           "begin characters;",
           paste0("dimensions nchar=", IRanges::width(x)[1], ";"),
           "Format datatype=dna missing=? gap=- matchchar=.;",
           "Matrix",
           paste(make.names(names(x), unique = TRUE), x),
           ";",
           "End;")
 
  charsets <- NULL
  if(charset == TRUE) {
    pos <- 0
    charsets <- character(length(locusIDs))
    for(i in seq_along(charsets)) {
      charsets[i] <- paste("charset", locusIDs[i], "=", 1 + pos, "-", 
                           pos + locusLength[i], ";")
      pos <- pos + locusLength[i]
      charblock <- c("begin assumptions;", charsets, "end;")
    }
  }
  writeLines(c(nex, if(charset == TRUE) charblock), con=file.path(dir.out, fn))
}

#' Extract sequences from a derep object
#' 
#' This function is used internally to batch-extract sequences from a derep object
#' 
#' @param derepObj The derep-element (that is one element from the list that constitutes the derep list)
#' @return A character vector with the sequences
#' @export
#' 
getSeqFromDerep <- function(derepObj) {
  reads <- list(sequence=names(derepObj$uniques))
  return(reads)
}

#' Subset a derep-class based on abundance of the reads
#'
#' This function retains the reads that have their abundance >= \code{minAbund}.
#' It returns an object similar to a derep-class in its structure. Indeed it has
#' a \code{$uniques} and \code{quals} elemnt for the retained sequences, but,
#' rather than having \code{map} as third element in the list, it has
#' \code{rmed}, which is a vector with the index of the positions of the
#' \code{uniques} from the original object that were removed.
#' 
#' @param derep The derep object to subset
#' @param minAbund The threshold below which reads are removed
#' @return A list where the first two elements are as a derep-class, and the third is rmed. See details.
#' @export
subsetDerep <- function(derep, minAbund=2){
  sel <- derep$uniques >= minAbund
  uniques <- derep$uniques[sel]
  quals <- derep$quals[sel, ] 
  rmed <- seq_along(sel)[sel]
  return(list(uniques=uniques, quals=quals, rmed=rmed))
}

#' Build (possible) allele sequences based on genotypes
#'
#' Given the sequence of the reference allele, the genotypes, the SNPs (as the
#' substitute nucleotide) and their positions, this function builds the possible
#' allele sequences.
#'
#' The genotypes are to be codified as 0 for the reference allele, 2 for the
#' alternative allele, and 1 for the hetorozygous. The genotype vector has to be
#' a named integer vector where the names follow the convention.
#' "something-p-nb/na" something is usually the name of the locus, p is the
#' position of the SNP (as integer), nb is the nucleotide in the base allele and
#' na is the nucleotide of the alternative allele. For examples:
#' "100614668-2-A/C". Of all these, the critical elements are that nb and na has
#' to be in the second to last and penultimate position in the string.
#' @param baseAllele (Character) The sequence of the reference allele
#' @param genotypes (named integer) The genotypes (0,1,2) named with SNPs (see
#'   Details)
#' @param SNPpositions (integer) The position where the SNPs occur
#' @param lenAllele (integer) The length of the allele (as number of
#'   nucleotides)
#' @return A character vector with all the possible allele sequences
#' @export
#' @examples
#' SNPpositions <- list(
#' c(0,2,7),
#' c(0,1,2),
#' c(0,1,3),
#' c(1,2,7),
#' c(1,2,3),
#' c(2,4,6),
#' c(5,6,7)
#' )
#'
#'
#' baseAllele <- "AAAAAAAA"
#' genotypes <- c(2,1,1)
#' names(genotypes) <- paste0("something-p-", c("A/G", "A/C", "A/T"))
#' seqAlleles <- lapply(SNPpositions, make.alleles, baseAllele=baseAllele,
#'                      genotypes=genotypes,lenAllele=8)
#' seqAlleles                      
make.alleles <- function(baseAllele, genotypes, SNPpositions, lenAllele) {
  IUPAC <- c("AC", "AG", "AT", "CG", "CT", "GT", "CA", "GA", "TA", "GC", "TC", "TG")
  names(IUPAC) <- c("M", "R", "W", "S", "Y", "K", "M", "R", "W", "S", "Y", "K")
  # remember that SNP position are one behind because position 1 is indexed as 0
  breaks <- c(SNPpositions, 
              # if the last SNP position is at the end of the allele sequence, 
              # this needs to be dropped as it is already being taken care of by the next line
              if((max(SNPpositions) + 1) == lenAllele) {
                head(SNPpositions, -1) + 1
              } else {
                SNPpositions + 1  
              }
              , lenAllele)
  breaks <- sort(breaks)
  breaks <- breaks[!duplicated(breaks)] # if SNP are consecutive
  SNP1stPos <- which(SNPpositions == 0) # Check if one SNP is at the start
  if(length(SNP1stPos) == 1) breaks <- breaks[-which(breaks == 0)] # if so rm 0
  nsections <- length(breaks) 
  sections <- vector("list", length=nsections)
  seqAlleles <- ""
  s <- 1
  #section <- 1
  for(section in (seq_len(nsections))) {
    sections[[section]] <- substr(baseAllele, start=s, stop=breaks[section])
    if(breaks[section] %in% (SNPpositions + 1)) {
      whichGen <- which(SNPpositions + 1 == breaks[section])
      baseSNP <- substr(names(genotypes)[whichGen], 
                        start=nchar(names(genotypes)[whichGen]) - 2, 
                        stop=nchar(names(genotypes)[whichGen]) - 2)
      altSNP <- substr(names(genotypes)[whichGen], 
                       start=nchar(names(genotypes)[whichGen]), 
                       stop=nchar(names(genotypes)[whichGen]))
      if(is.na(genotypes[whichGen])) {
        sections[[section]] <- names(which(IUPAC == 
                                             paste(c(baseSNP, altSNP), collapse = "")))
      } else {
        if(genotypes[whichGen] == 2) {
          sections[[section]] <- altSNP
        } else {
          if(genotypes[whichGen] == 0) {
            sections[[section]] <- baseSNP
          } else {
            sections[[section]] <- c(baseSNP, altSNP)
          }
        }
      }
    }
    seqAlleles <- unlist(lapply(seqAlleles, paste0, sections[[section]]))
    s <- breaks[section] + 1
    #section <- section + 1
  }
  return(seqAlleles)
}


#' Replace SNPs with IUPAC ambiguities
#' 
#' Replace SNPs with IUPAC ambiguities
#' 
#' See \code{make.alleles} for details on how \code{genotypes} should be formatted
#' 
#' @param seqAlleles possible sequences of the alleles or baseAllele (see \code{make.alleles} for details)
#' @inheritParams make.alleles
# Move function at the top
replaceSNP <- function(genotypes, seqAlleles, SNPpositions) {
  IUPAC <- c("AC", "AG", "AT", "CG", "CT", "GT", "CA", "GA", "TA", "GC", "TC", "TG")
  names(IUPAC) <- c("M", "R", "W", "S", "Y", "K", "M", "R", "W", "S", "Y", "K")
  seq <- seqAlleles[1]
  for(whichGen in seq_along(genotypes)) {
    baseSNP <- substr(names(genotypes)[whichGen], 
                      start=nchar(names(genotypes)[whichGen]) - 2, 
                      stop=nchar(names(genotypes)[whichGen]) - 2)
    altSNP <- substr(names(genotypes)[whichGen], 
                     start=nchar(names(genotypes)[whichGen]), 
                     stop=nchar(names(genotypes)[whichGen]))
    substr(seq, start=SNPpositions[whichGen] + 1, stop=SNPpositions[whichGen] + 1) <-
      names(which(IUPAC == paste(c(baseSNP, altSNP), collapse = "")))
  }
  return(seq)
}
carlopacioni/amplicR documentation built on Aug. 19, 2023, 7:59 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
carlopacioni/amplicR
An R package to process amplicon data

R/helper.functions.R
In carlopacioni/amplicR: An R package to process amplicon data

Defines functions replaceSNP make.alleles subsetDerep getSeqFromDerep write.nexus getRow ndada sel.match where2trim putna putzero

Documented in getRow getSeqFromDerep make.alleles ndada putna putzero replaceSNP sel.match subsetDerep where2trim write.nexus

R Package Documentation

Browse R Packages

We want your feedback!

carlopacioni/amplicR An R package to process amplicon data

R/helper.functions.R In carlopacioni/amplicR: An R package to process amplicon data

Defines functions replaceSNP make.alleles subsetDerep getSeqFromDerep write.nexus getRow ndada sel.match where2trim putna putzero

Documented in getRow getSeqFromDerep make.alleles ndada putna putzero replaceSNP sel.match subsetDerep where2trim write.nexus

R Package Documentation

Browse R Packages

We want your feedback!

carlopacioni/amplicR
An R package to process amplicon data

R/helper.functions.R
In carlopacioni/amplicR: An R package to process amplicon data