R/helper_functions.R

Defines functions subset_model pad_alignment get_subalignment

Documented in get_subalignment pad_alignment subset_model

# Alignment utilities -----------------------------------------------------

#' Get primer binding position
#'
#' @param primer A character string, DNAbin object or an object coercible to DNAbin
#' @param model A profile hidden Markov model (a "PHMM" object) generated by the aphid R package to align the sequences to.
#' @param tryrc Whether the reverse complement should also be aligned. The highest scoring complement is chosen.
#' @param quiet Whether progress should be printed to the console.
#' @param min_score Minimum score for the viterbi alignment.
#' @param ... aditional arguments to be passed to "Viterbi"
#'
#' @return
#' @export
#' @import stringr
#' @importFrom ape complement
#' @importFrom aphid Viterbi
#'
get_binding_position <- function (primer, model, tryrc = TRUE, quiet = FALSE, min_score = 10, ...) {

  input <- primer
  if (!inherits(model, "PHMM")) { stop("Error: model must be a PHMM object")}

  if (!is.null(primer)) {
    if (!inherits(primer, "DNAbin")) {
      if (mode(primer) == "character") {
        if (nchar(primer[1]) == 1) {primer <- paste0(primer, collapse = "")}
        if(stringr::str_detect(primer, "I")) {message(paste0("Warning: Inosine (I) bases detected in primer ", input," these will be converted to N!"))}
        primer <- char2DNAbin(primer)
      }
      else {
        if (!inherits(primer, "PHMM"))
          stop("Invalid primer(s)\n")
      }
    }
  }

  up <- primer[!primer %in% as.raw(c(2, 4))]
  vitF <- aphid::Viterbi(model, up, odds = TRUE, type = "semiglobal", cpp = TRUE, residues = "DNA", ...=...)

  # Calculate longest match length
  rle_path <- rle(vitF$path)
  rle_matches <- rle_path$lengths
  names(rle_matches) <- rle_path$values
  all_matches <- rle_matches[names(rle_matches) == "1"]

  # Check for primers split into multiple matches
  if(length(all_matches) > 1){
    longest_match <- max(all_matches)

    if(which.max(all_matches) == 1){
      up[[1]] <- up[[1]][1:max(all_matches)]
    } else {
      up[[1]] <- up[[1]][all_matches[which.max(all_matches)-1]:all_matches[which.max(all_matches)]]
    }
    # Redo the viterbi alignment with just the subset primer
    vitF <- aphid::Viterbi(model, up, odds = TRUE, type = "semiglobal", cpp = TRUE, residues = "DNA", ...=...)
  }

  if (tryrc == TRUE) {
    down <- ape::complement(primer)
    down <- down[!down %in% as.raw(c(2, 4))]
    vitR <- aphid::Viterbi(model, down, odds = TRUE, type = "semiglobal", cpp = TRUE, residues = "DNA", ...=...)
    match_type <- NA_character_

    # Calculate longest match length
    rle_path <- rle(vitR$path)
    rle_matches <- rle_path$lengths
    names(rle_matches) <- rle_path$values
    all_matches <- rle_matches[names(rle_matches) == "1"]

    # Check for primers split into multiple matches
    if(length(all_matches) > 1){
      longest_match <- max(all_matches)

      if(which.max(all_matches) == 1){
        down[[1]] <- down[[1]][1:max(all_matches)]
      } else {
        down[[1]] <- down[[1]][all_matches[which.max(all_matches)-1]:all_matches[which.max(all_matches)]]
      }
      # Redo the viterbi alignment with just the subset primer
      vitR <- aphid::Viterbi(model, down, odds = TRUE, type = "semiglobal", cpp = TRUE, residues = "DNA", ...=...)
    }
    if (vitF$score > vitR$score & vitF$score > min_score) {
      match_type <- "forward"
      path <- vitF$path
      score <- vitF$score
    } else if (vitF$score < vitR$score & vitR$score > min_score) {
      match_type <- "reverse"
      path <- vitR$path
      score <- vitR$score
    } else if (vitF$score < min_score & vitR$score < min_score) {
      score <- max(vitF$score, vitR$score)
      out <- data.frame(primer = input, start = NA, end = NA, score=score)
      warning("Both complements of primer were below min_score")
      return(out)
    }
  } else if(tryrc == FALSE & vitF$score > min_score) {
    path <- vitF$path
    score <- vitF$score
    match_type <- "forward"
  } else {
    out <- data.frame(primer = input, start = NA, end = NA, score=vitF$score)
    warning("Forward complement of primer was below min_score")
    return(out)
  }

  matchF <- match(1, path)
  matchR <- (length(path) - (match(1, rev(path)) - 1))
  if (!quiet) {
    if(match_type == "forward"){
      message(paste0("Forward complement matched alignment at positions ", matchF, ":",matchR))
    } else if(match_type == "reverse"){
      message(paste0("Reverse complement matched alignment at positions ", matchF, ":",matchR))
    } else {
      message("Both complements did not match alignment")
    }
  }

  if ((matchR - (matchF - 1)) == length(primer[[1]])) {
    out <- data.frame(primer = input, start = matchF, end = matchR, score=score)
  }  else if ((matchR - (matchF - 1)) > length(primer[[1]])) {
    warning("Binding positions are larger than the primer length")
    out <- data.frame(primer = input, start = matchF, end = matchR, score=score)
  }  else if ((matchR - (matchF - 1)) < length(primer[[1]])) {
    warning("Binding positions are less than the primer length")
    out <- data.frame(primer = input, start = matchF, end = matchR, score=score)
  }
  return(out)
}

#' Get subalignment
#'
#' @description Aligns a DNABin to a reference PHMM model, and returns the optimal path
#' @param x A DNAbin object or an object coercible to DNAbin
#' @param model A profile hidden Markov model (a "PHMM" object) generated by the aphid R package to align the sequences to.
#' @param tryrc Whether the reverse complement should also be aligned
#' @param quiet Whether progress should be printed to the console.
#' @param check_indels Check that indels are multiples of 3, recommended for coding sequences such as COI
#' @param min_score Minimum score for the viterbi alignment
#' @param omit_endgaps Should gap characters at each end of the sequences be ignored when deriving the transition probabilities of the model? Defaults to FALSE. Set to TRUE if x is not a strict global alignment (i.e. if the alignment contains partial sequences with missing sections represented with gap characters).
#' @param multithread Whether multithreading should be used, if TRUE the number of cores will be automatically detected, or provided a numeric vector to manually set the number of cores to use
#' @param ... aditional arguments to be passed to "Viterbi"
#'
#' @return
#' @export
#' @import future
#' @importFrom ape as.DNAbin
#' @importFrom ape base.freq
#' @importFrom ape complement
#' @importFrom aphid derivePHMM
#' @importFrom aphid Viterbi
#'
#'
get_subalignment <- function(x, model, tryrc=FALSE, quiet=FALSE, check_indels=TRUE, min_score=10, omit_endgaps	= FALSE, multithread=FALSE, ...) {

  # Ensure x is a DNAbin
  if (!inherits(x, "DNAbin")) {
    x <- ape::as.DNAbin(x)
    if (all(is.na(ape::base.freq(x)))) {
      stop("Error: Object is not coercible to DNAbin \n")
    }
  }

  # setup multithreading
  setup_multithread(multithread = multithread, quiet=quiet)

  up <- aphid::derivePHMM(x, cores = cores, omit.endgaps = omit_endgaps, ...=...)
  vitF <- aphid::Viterbi(model, up, odds = TRUE, type = "semiglobal", cpp = TRUE, residues = "DNA")

  # Derive PHMM
  if(tryrc == TRUE) {
    if(!quiet){message("Deriving PHMM for reverse complement")}
    down <- aphid::derivePHMM(ape::complement(x), cores = cores, omit.endgaps = omit_endgaps, ...=...)
    vitR <- aphid::Viterbi(model, down, odds = TRUE, type = "semiglobal", cpp = TRUE, residues = "DNA")

    if (vitF$score > vitR$score && vitF$score > min_score) {
      if(!quiet){message("Forward complement matched alignment")}
      path <- vitF$path

    } else if (vitF$score < vitR$score && vitR$score > min_score) {
      if(!quiet){message("Reverse complement matched alignment")}
      path <- vitR$path

    } else if( vitF$score && vitR$score < min_score ){
      return(NULL)
      stop("Both complements of primer were below min_score")

    }
  } else if(tryrc == FALSE && vitF$score > min_score) {
    path <- vitF$path
  } else {Stop("Forward complement of primer was below min_score")}
  return(path)
}


#' Pad alignment
#'
#' @description Aligns a DNABin to a reference PHMM model, and pads any gaps between the query and reference
#' @param x A DNAbin object or an object coercible to DNAbin
#' @param model A profile hidden Markov model (a "PHMM" object) generated by the aphid R package to align the sequences to.
#' @param pad The character used to pad the gaps
#' @param tryrc Whether the reverse complement should also be aligned
#' @param quiet Whether progress should be printed to the console.
#' @param check_indels Check that indels are multiples of 3, recommended for coding sequences such as COI
#' @param min_score Minimum score for the viterbi alignment
#' @param omit_endgaps Should gap characters at each end of the sequences be ignored when deriving the transition probabilities of the model? Defaults to FALSE. Set to TRUE if x is not a strict global alignment (i.e. if the alignment contains partial sequences with missing sections represented with gap characters).
#' @param multithread Whether multithreading should be used, if TRUE the number of cores will be automatically detected, or provided a numeric vector to manually set the number of cores to use
#' @param ... aditional arguments to be passed to "Viterbi"
#'
#' @return
#' @export
#' @import stringr
#'
#'
pad_alignment <- function(x, model, pad = "-", tryrc = FALSE, check_indels = TRUE, min_score = 10, omit_endgaps	= FALSE, multithread = FALSE,  quiet = FALSE, ...){

  path <- get_subalignment(x = x, model = model, tryrc = tryrc, check_indels = check_indels,
                           min_score = min_score, omit_endgaps=omit_endgaps, multithread = multithread, quiet = quiet, ...=...)

  # Find start, stop, and indels
  matchF <- match(2, path)
  matchR <- (length(path) - (match(2, rev(path))-1))
  indels <- which(path == 0, arr.ind=FALSE)
  # potentially could have indels as 1's as well, due to potential for indels to be recorded in PhMM?
  # Do another check for which(path ==1, arr.ind=FALSE), and make sure they are more than matchF and less than matchR to be recorded as indels

  # Pad Left
  if (matchF > 1){
    left_pad <- which(path == 1, arr.ind=FALSE)
    left_pad <- left_pad[which(left_pad < matchF)]
  } else(left_pad <- NULL)

  # Detect indels
  if (length(indels) > 1) {
    # Detect multiple indels
    splitAt <- function(x, pos) unname(split(x, cumsum(seq_along(x) %in% pos)))
    split <- splitAt(indels, which(diff(indels) > 1, arr.ind=FALSE)+1)

    # Confirm all indels are 1 codon deletions
    if (check_indels == TRUE && !all(sapply(split, length) %in% seq(3,12,3))) {
      stop("ERROR: Indels are not in multiples of 3!")
    }
  } else (indels <- NULL)

  # Pad right
  if (matchR < length(path)){
    right_pad <- which(path == 1, arr.ind = FALSE)
    right_pad <- right_pad[which(right_pad > matchR)]
  } else (right_pad <- NULL)

  insert <- c(left_pad, indels+length(left_pad), right_pad+length(left_pad)+length(indels))

  x <- as.character(x)
  insert_at <- function(x, index) {
    x <-  paste0(x, collapse = "")
    for(i in 1:length(index)) {
      stringr::str_sub(x, start = index[i], end = index[i]-1) <- pad
    }
    x <- stringr::str_to_upper(x)
    return(x)
  }
  out <- sapply(x, insert_at, insert)
  out <- char2DNAbin(out)
  return(out)
}



#' Subset PHMM
#' Modified from Cameron Nugent's COIL package https://github.com/CNuge/coil/blob/master/R/subset_PHMM.r
#' @param x A profile hidden Markov model (a "PHMM" object) generated by the aphid R package.
#' @param primers A character vector of length 2 contaning the forward and reverse primer sequences.
#' @param positions A numeric vector of length 2 containing the start and end position for model trimming. Used only if primers = NULL.
#' @param trimprimers logical indicating whether the primer-binding sites should be removed from the sequences in the returned model. Used only if positions = NULL.
#' @param min_score Minimum score for the viterbi alignment.
#' @param quiet Whether to print progress to console.
#'
#' @return
#' @export
#'
#' @examples
subset_model <- function(x, primers=NULL, positions=NULL, trimprimers=FALSE, min_score=10, quiet=FALSE){

  # Check inputs
  if (is.numeric(positions) & length(positions) == 2){
    start_bind <- sort(positions)[1]
    end_bind <- sort(positions)[2]
  } else if (is.null(positions) & (is.character(primers) & length(primers)== 2)){
    Fprimer <- primers[1]
    Rprimer <- primers[2]

    Fbind <-  get_binding_position(Fprimer, model=x, tryRC=TRUE, quiet=quiet, min_score = min_score)
    Rbind <-  get_binding_position(Rprimer, model=x, tryRC=TRUE, quiet=quiet, min_score = min_score)
    if(isTRUE(trimprimers)){
      start_bind <- Fbind$end + 1
      end_bind <- Rbind$start - 1
      if(!quiet){message("Primer binding positions will be trimmed from model")}
    } else if(isFALSE(trimprimers)){
      start_bind <- Fbind$start
      end_bind <- Rbind$end
    } else{
      stop("trimprimers must be either TRUE or FALSE")
    }

  } else if (is.null(positions) & is.null(primers)){
    stop("A value must be provided to either positions or primers")
  } else if(is.null(primers) & (!is.numeric(positions) | !length(positions) == 2) ){
    stop("positions must be a numeric vector of length 2 containing the start and end position for model trimming")
  } else if(is.null(positions) & (!is.character(primers) | !length(primers) == 2) ){
    stop("primers must be a character vector of length 2, with the forward primer as the first element, and reverse primer as the second element")
  } else if(!is.null(primers) & !is.null(positions)){
    stop("Only one of primers or positions can be entered at a time")
  } else {
    stop("A value must be provided to either positions or primers")
  }
  if(is.na(start_bind) & is.numeric(end_bind)){
    stop(paste0("Could not find binding position for Forward primer: ", Fprimer))
  } else if(is.na(end_bind) & is.numeric(start_bind)){
    stop(paste0("Could not find binding position for Reverse primer: ", Rprimer))
  } else if(is.na(start_bind) & is.na(end_bind)){
    stop(paste0("Could not find binding position for Forward primer: ", Fprimer, " and Reverse primer: ", Rprimer))
  }
  # Check that positions are right
  if(end_bind < start_bind){
    stop("Index error, end position must match or exceed the start position.")
  }
  if(end_bind > x$size){
    stop(paste0("Index error, end position out of bounds. Input PHMM only has a length of: ", x$size))
  }

  #get the map positions for subsetting the alignment-based fields
  map_start <- x$map[[start_bind]]
  map_end <- x$map[[end_bind]]

  #subset to get the new inserts
  new_inserts <- x$inserts[map_start:map_end]

  #subset to get the new insert lengths
  new_insert_lengths <- x$insertlengths[map_start:map_end]
  #reindex the insert length labels
  if(!is.null(new_insert_lengths)){
    names(new_insert_lengths) <- 0:(length(new_insert_lengths)-1)
  }
  #subset the mask
  new_mask <- x$mask[map_start:map_end]
  #subset the map
  new_map <- x$map[start_bind:end_bind]

  #set the first remaining map entry equal to one,
  #make the necessary relative adjustment to all other positions
  new_map <- new_map-(new_map[1]-1)
  #reindex the labels
  if(!is.null(new_map)){
    names(new_map) <- 1:(length(new_map))
  }

  #Subset the A and E matrixies, reindex the positions
  #A is 0 indexed, E is 1 indexed
  new_A <- x$A[,start_bind:end_bind]
  colnames(new_A) <- 0:(length(colnames(new_A))-1)
  new_E <- x$E[,start_bind:end_bind]
  colnames(new_E) <- 1:length(colnames(new_A))

  #Create the new PHMM object
  newPHMM = structure(list(name = x$name,
                           description = x$description,
                           size = (end_bind - start_bind)+1,
                           alphabet = x$alphabet,
                           A = new_A,
                           E = new_E,
                           qa = x$qa,
                           qe = x$qe,
                           inserts = new_inserts,
                           insertlengths = new_insert_lengths,
                           map = new_map,
                           date = x$date,
                           nseq = x$nseq,
                           weights = x$weights,
                           reference = x$reference,
                           mask = new_mask
  ), class =  "PHMM")
  message(paste0("PHMM model subset to ", newPHMM$size, " base pairs"))
  return(newPHMM)
}

# Alignment entropy -------------------------------------------------------

#' Alignment entropy
#'
#' @param x A DNAbin or AAbin object
#' @param mask_gaps The threshold of gaps allowed before a position in the alignment is masked
#' @param count_gaps Whether gaps should be counted within entropy calculations. Default is FALSE.
#' @param method 	the method employed by `entropy::entropy` to estimate alignment entropy. Accepts:
#' "ML" : maximum likelihood
#' "MM" : bias-corrected maximum likelihood,
#' "Jeffreys" : Dirichlet with a=1/2
#' "Laplace" : Dirichlet with a=1
#' "SG" : Dirichlet with a=a=1/length(y)
#' "minimax" : Dirichlet with a=sqrt(sum(y))/length(y)
#' "CS" : ChaoShen
#' "NSB": Nemenman, Shafee and Biale (2002)
#' "shrink" : Shrinkage estimator
#' See the help page of `entropy::entropy` for more information
#' @param unit the unit in which entropy is measured. The default is "nats" (natural units). For computing entropy in "bits" set unit="log2".
#' @param return_extra Whether to return a dataframe including extra columns including individual base counts, gap proportions and number of bases at each position
#'
#' @return
#' @export
#' @import purrr
#' @importFrom entropy entropy
#'
#'
#'
alignment_entropy <- function (x, mask_gaps=0.2, count_gaps = FALSE, method="ML", unit="log", return_extra=FALSE) {
  if ((mask_gaps < 0) | (mask_gaps > 1)) {
    stop("mask_gaps should be a percentage (within the [0,1] range).")
  }

  if(class(x)=="DNAbin"){
    x <- as.character(x)
    x <- lapply(x, toupper)
    names <- c("A", "C", "G", "T", "-")
  } else if(class(x)=="AAbin"){
    x <- as.character(x)
    x <- lapply(x, toupper)
    names <- c("A", "C", "D", "E", "F",
               "G", "H", "I", "K", "L",
               "M", "N", "P", "Q", "R",
               "S", "T", "V", "W", "Y", "-")
  }

  if(count_gaps){
    counter <- names
  } else if(!count_gaps){
    counter <- names[!names=="-"]
  }

  #Create matrix
  suppressWarnings(MSA <- matrix(as.vector(unlist(x)), ncol = length(x[[1]]), byrow = TRUE))

  n_seq <- length(MSA[,1])
  n_pos <- length(MSA[1,])

  #Summarise each position in alignment
  i=1
  df <- data.frame(matrix(ncol = length(names), nrow = n_pos))
  colnames(df) <- names

  for(i in 1:n_pos){
    df[i,names] <- sapply(names, function(x){ sum(MSA[, i] == x)})
  }
  ent <- df %>%
    dplyr::mutate(pos = rownames(.),
                  bases = n_seq - `-`,
                  prop_gaps = `-` / n_seq,
                  together = pmap(unname(.[counter]), c)) %>%
    dplyr::rowwise() %>%
    dplyr::mutate(ent = entropy::entropy(together, method=method, unit=unit)) %>%
    dplyr::mutate(ent = dplyr::case_when(
      prop_gaps > mask_gaps ~ as.numeric(NA),
      prop_gaps <= mask_gaps ~ ent
    )) %>%
    dplyr::select(-together)

  if(return_extra){
    out <- ent
  } else if(!return_extra){
    out <- ent$ent
    names(out) <- ent$pos
  }
  message("Masked ", sum(is.na(ent)), " alignment positions with over ", (mask_gaps*100), "% gaps")
  return(out)
}


# Summarise fasta ---------------------------------------------------------

#' summarise_fasta
#'
#' @param x The location of a fasta file or gzipped fasta file.
#' @param label optional, Add an extra column with a label
#' @param origin optional, a table with sequence id numbers and their database origins
#'
#' @return
#' @export
#' @import stringr
#' @import dplyr
#' @importFrom Biostrings fasta.index
#' @importFrom stats quantile
#'
#'
summarise_fasta <- function(x, label=NULL, origin=NULL) {
  if(is.null(origin)){
    out <- Biostrings::fasta.index(x) %>%
      dplyr::mutate(taxid = desc %>%
                      stringr::str_remove(pattern="(;)(.*?)(?=$)")  %>%
                      stringr::str_remove(pattern="(^)(.*?)(?<=\\|)")) %>%
      dplyr::summarise(nseqs = n(),
                       nspecies=n_distinct(taxid),
                       mean_length = mean(seqlength),
                       q0 = stats::quantile(seqlength, probs=0),
                       q25 = stats::quantile(seqlength, probs=0.25),
                       q50 = stats::quantile(seqlength, probs=0.5), # Q50 is median
                       q75 = stats::quantile(seqlength, probs=0.75),
                       q100 = stats::quantile(seqlength, probs=1)
      )

  } else if(is.data.frame(origin) | is_tibble(origin)){

    if(any(duplicated(origin$seqid))){
      stop("Origin table has duplicated seqids")
    }
    out <- Biostrings::fasta.index(x) %>%
      dplyr::mutate(taxid = desc %>%
                      stringr::str_remove(pattern="(;)(.*?)(?=$)")  %>%
                      stringr::str_remove(pattern="(^)(.*?)(?<=\\|)")) %>%
      dplyr::mutate(seqid = desc %>%
                      stringr::str_remove(pattern="(\\|)(.*?)(?=$)"))  %>%
      dplyr::left_join(origin, by="seqid") %>%
      dplyr::group_by(taxid) %>%
      dplyr::mutate(origin = paste(sort(unique(origin)), collapse="/"))%>% #Combine origins to make sure they arent duplicated
      dplyr::ungroup() %>%
      dplyr::group_by(origin) %>%
      dplyr::summarise(nseqs = n(),
                       nspecies=n_distinct(taxid),
                       mean_length = mean(seqlength),
                       q0 = stats::quantile(seqlength, probs=0),
                       q25 = stats::quantile(seqlength, probs=0.25),
                       q50 = stats::quantile(seqlength, probs=0.5), # Q50 is median
                       q75 = stats::quantile(seqlength, probs=0.75),
                       q100 = stats::quantile(seqlength, probs=1)
      )
  }
  if(is.character(label)) {
    out <- out %>%
      dplyr::mutate(label  = label)
  }
  return(out)
}
alexpiper/taxreturn documentation built on Sept. 14, 2024, 7:56 p.m.