# Alignment utilities -----------------------------------------------------
#' Get primer binding position
#'
#' @param primer A character string, DNAbin object or an object coercible to DNAbin
#' @param model A profile hidden Markov model (a "PHMM" object) generated by the aphid R package to align the sequences to.
#' @param tryrc Whether the reverse complement should also be aligned. The highest scoring complement is chosen.
#' @param quiet Whether progress should be printed to the console.
#' @param min_score Minimum score for the viterbi alignment.
#' @param ... aditional arguments to be passed to "Viterbi"
#'
#' @return
#' @export
#' @import stringr
#' @importFrom ape complement
#' @importFrom aphid Viterbi
#'
get_binding_position <- function (primer, model, tryrc = TRUE, quiet = FALSE, min_score = 10, ...) {
input <- primer
if (!inherits(model, "PHMM")) { stop("Error: model must be a PHMM object")}
if (!is.null(primer)) {
if (!inherits(primer, "DNAbin")) {
if (mode(primer) == "character") {
if (nchar(primer[1]) == 1) {primer <- paste0(primer, collapse = "")}
if(stringr::str_detect(primer, "I")) {message(paste0("Warning: Inosine (I) bases detected in primer ", input," these will be converted to N!"))}
primer <- char2DNAbin(primer)
}
else {
if (!inherits(primer, "PHMM"))
stop("Invalid primer(s)\n")
}
}
}
up <- primer[!primer %in% as.raw(c(2, 4))]
vitF <- aphid::Viterbi(model, up, odds = TRUE, type = "semiglobal", cpp = TRUE, residues = "DNA", ...=...)
# Calculate longest match length
rle_path <- rle(vitF$path)
rle_matches <- rle_path$lengths
names(rle_matches) <- rle_path$values
all_matches <- rle_matches[names(rle_matches) == "1"]
# Check for primers split into multiple matches
if(length(all_matches) > 1){
longest_match <- max(all_matches)
if(which.max(all_matches) == 1){
up[[1]] <- up[[1]][1:max(all_matches)]
} else {
up[[1]] <- up[[1]][all_matches[which.max(all_matches)-1]:all_matches[which.max(all_matches)]]
}
# Redo the viterbi alignment with just the subset primer
vitF <- aphid::Viterbi(model, up, odds = TRUE, type = "semiglobal", cpp = TRUE, residues = "DNA", ...=...)
}
if (tryrc == TRUE) {
down <- ape::complement(primer)
down <- down[!down %in% as.raw(c(2, 4))]
vitR <- aphid::Viterbi(model, down, odds = TRUE, type = "semiglobal", cpp = TRUE, residues = "DNA", ...=...)
match_type <- NA_character_
# Calculate longest match length
rle_path <- rle(vitR$path)
rle_matches <- rle_path$lengths
names(rle_matches) <- rle_path$values
all_matches <- rle_matches[names(rle_matches) == "1"]
# Check for primers split into multiple matches
if(length(all_matches) > 1){
longest_match <- max(all_matches)
if(which.max(all_matches) == 1){
down[[1]] <- down[[1]][1:max(all_matches)]
} else {
down[[1]] <- down[[1]][all_matches[which.max(all_matches)-1]:all_matches[which.max(all_matches)]]
}
# Redo the viterbi alignment with just the subset primer
vitR <- aphid::Viterbi(model, down, odds = TRUE, type = "semiglobal", cpp = TRUE, residues = "DNA", ...=...)
}
if (vitF$score > vitR$score & vitF$score > min_score) {
match_type <- "forward"
path <- vitF$path
score <- vitF$score
} else if (vitF$score < vitR$score & vitR$score > min_score) {
match_type <- "reverse"
path <- vitR$path
score <- vitR$score
} else if (vitF$score < min_score & vitR$score < min_score) {
score <- max(vitF$score, vitR$score)
out <- data.frame(primer = input, start = NA, end = NA, score=score)
warning("Both complements of primer were below min_score")
return(out)
}
} else if(tryrc == FALSE & vitF$score > min_score) {
path <- vitF$path
score <- vitF$score
match_type <- "forward"
} else {
out <- data.frame(primer = input, start = NA, end = NA, score=vitF$score)
warning("Forward complement of primer was below min_score")
return(out)
}
matchF <- match(1, path)
matchR <- (length(path) - (match(1, rev(path)) - 1))
if (!quiet) {
if(match_type == "forward"){
message(paste0("Forward complement matched alignment at positions ", matchF, ":",matchR))
} else if(match_type == "reverse"){
message(paste0("Reverse complement matched alignment at positions ", matchF, ":",matchR))
} else {
message("Both complements did not match alignment")
}
}
if ((matchR - (matchF - 1)) == length(primer[[1]])) {
out <- data.frame(primer = input, start = matchF, end = matchR, score=score)
} else if ((matchR - (matchF - 1)) > length(primer[[1]])) {
warning("Binding positions are larger than the primer length")
out <- data.frame(primer = input, start = matchF, end = matchR, score=score)
} else if ((matchR - (matchF - 1)) < length(primer[[1]])) {
warning("Binding positions are less than the primer length")
out <- data.frame(primer = input, start = matchF, end = matchR, score=score)
}
return(out)
}
#' Get subalignment
#'
#' @description Aligns a DNABin to a reference PHMM model, and returns the optimal path
#' @param x A DNAbin object or an object coercible to DNAbin
#' @param model A profile hidden Markov model (a "PHMM" object) generated by the aphid R package to align the sequences to.
#' @param tryrc Whether the reverse complement should also be aligned
#' @param quiet Whether progress should be printed to the console.
#' @param check_indels Check that indels are multiples of 3, recommended for coding sequences such as COI
#' @param min_score Minimum score for the viterbi alignment
#' @param omit_endgaps Should gap characters at each end of the sequences be ignored when deriving the transition probabilities of the model? Defaults to FALSE. Set to TRUE if x is not a strict global alignment (i.e. if the alignment contains partial sequences with missing sections represented with gap characters).
#' @param multithread Whether multithreading should be used, if TRUE the number of cores will be automatically detected, or provided a numeric vector to manually set the number of cores to use
#' @param ... aditional arguments to be passed to "Viterbi"
#'
#' @return
#' @export
#' @import future
#' @importFrom ape as.DNAbin
#' @importFrom ape base.freq
#' @importFrom ape complement
#' @importFrom aphid derivePHMM
#' @importFrom aphid Viterbi
#'
#'
get_subalignment <- function(x, model, tryrc=FALSE, quiet=FALSE, check_indels=TRUE, min_score=10, omit_endgaps = FALSE, multithread=FALSE, ...) {
# Ensure x is a DNAbin
if (!inherits(x, "DNAbin")) {
x <- ape::as.DNAbin(x)
if (all(is.na(ape::base.freq(x)))) {
stop("Error: Object is not coercible to DNAbin \n")
}
}
# setup multithreading
setup_multithread(multithread = multithread, quiet=quiet)
up <- aphid::derivePHMM(x, cores = cores, omit.endgaps = omit_endgaps, ...=...)
vitF <- aphid::Viterbi(model, up, odds = TRUE, type = "semiglobal", cpp = TRUE, residues = "DNA")
# Derive PHMM
if(tryrc == TRUE) {
if(!quiet){message("Deriving PHMM for reverse complement")}
down <- aphid::derivePHMM(ape::complement(x), cores = cores, omit.endgaps = omit_endgaps, ...=...)
vitR <- aphid::Viterbi(model, down, odds = TRUE, type = "semiglobal", cpp = TRUE, residues = "DNA")
if (vitF$score > vitR$score && vitF$score > min_score) {
if(!quiet){message("Forward complement matched alignment")}
path <- vitF$path
} else if (vitF$score < vitR$score && vitR$score > min_score) {
if(!quiet){message("Reverse complement matched alignment")}
path <- vitR$path
} else if( vitF$score && vitR$score < min_score ){
return(NULL)
stop("Both complements of primer were below min_score")
}
} else if(tryrc == FALSE && vitF$score > min_score) {
path <- vitF$path
} else {Stop("Forward complement of primer was below min_score")}
return(path)
}
#' Pad alignment
#'
#' @description Aligns a DNABin to a reference PHMM model, and pads any gaps between the query and reference
#' @param x A DNAbin object or an object coercible to DNAbin
#' @param model A profile hidden Markov model (a "PHMM" object) generated by the aphid R package to align the sequences to.
#' @param pad The character used to pad the gaps
#' @param tryrc Whether the reverse complement should also be aligned
#' @param quiet Whether progress should be printed to the console.
#' @param check_indels Check that indels are multiples of 3, recommended for coding sequences such as COI
#' @param min_score Minimum score for the viterbi alignment
#' @param omit_endgaps Should gap characters at each end of the sequences be ignored when deriving the transition probabilities of the model? Defaults to FALSE. Set to TRUE if x is not a strict global alignment (i.e. if the alignment contains partial sequences with missing sections represented with gap characters).
#' @param multithread Whether multithreading should be used, if TRUE the number of cores will be automatically detected, or provided a numeric vector to manually set the number of cores to use
#' @param ... aditional arguments to be passed to "Viterbi"
#'
#' @return
#' @export
#' @import stringr
#'
#'
pad_alignment <- function(x, model, pad = "-", tryrc = FALSE, check_indels = TRUE, min_score = 10, omit_endgaps = FALSE, multithread = FALSE, quiet = FALSE, ...){
path <- get_subalignment(x = x, model = model, tryrc = tryrc, check_indels = check_indels,
min_score = min_score, omit_endgaps=omit_endgaps, multithread = multithread, quiet = quiet, ...=...)
# Find start, stop, and indels
matchF <- match(2, path)
matchR <- (length(path) - (match(2, rev(path))-1))
indels <- which(path == 0, arr.ind=FALSE)
# potentially could have indels as 1's as well, due to potential for indels to be recorded in PhMM?
# Do another check for which(path ==1, arr.ind=FALSE), and make sure they are more than matchF and less than matchR to be recorded as indels
# Pad Left
if (matchF > 1){
left_pad <- which(path == 1, arr.ind=FALSE)
left_pad <- left_pad[which(left_pad < matchF)]
} else(left_pad <- NULL)
# Detect indels
if (length(indels) > 1) {
# Detect multiple indels
splitAt <- function(x, pos) unname(split(x, cumsum(seq_along(x) %in% pos)))
split <- splitAt(indels, which(diff(indels) > 1, arr.ind=FALSE)+1)
# Confirm all indels are 1 codon deletions
if (check_indels == TRUE && !all(sapply(split, length) %in% seq(3,12,3))) {
stop("ERROR: Indels are not in multiples of 3!")
}
} else (indels <- NULL)
# Pad right
if (matchR < length(path)){
right_pad <- which(path == 1, arr.ind = FALSE)
right_pad <- right_pad[which(right_pad > matchR)]
} else (right_pad <- NULL)
insert <- c(left_pad, indels+length(left_pad), right_pad+length(left_pad)+length(indels))
x <- as.character(x)
insert_at <- function(x, index) {
x <- paste0(x, collapse = "")
for(i in 1:length(index)) {
stringr::str_sub(x, start = index[i], end = index[i]-1) <- pad
}
x <- stringr::str_to_upper(x)
return(x)
}
out <- sapply(x, insert_at, insert)
out <- char2DNAbin(out)
return(out)
}
#' Subset PHMM
#' Modified from Cameron Nugent's COIL package https://github.com/CNuge/coil/blob/master/R/subset_PHMM.r
#' @param x A profile hidden Markov model (a "PHMM" object) generated by the aphid R package.
#' @param primers A character vector of length 2 contaning the forward and reverse primer sequences.
#' @param positions A numeric vector of length 2 containing the start and end position for model trimming. Used only if primers = NULL.
#' @param trimprimers logical indicating whether the primer-binding sites should be removed from the sequences in the returned model. Used only if positions = NULL.
#' @param min_score Minimum score for the viterbi alignment.
#' @param quiet Whether to print progress to console.
#'
#' @return
#' @export
#'
#' @examples
subset_model <- function(x, primers=NULL, positions=NULL, trimprimers=FALSE, min_score=10, quiet=FALSE){
# Check inputs
if (is.numeric(positions) & length(positions) == 2){
start_bind <- sort(positions)[1]
end_bind <- sort(positions)[2]
} else if (is.null(positions) & (is.character(primers) & length(primers)== 2)){
Fprimer <- primers[1]
Rprimer <- primers[2]
Fbind <- get_binding_position(Fprimer, model=x, tryRC=TRUE, quiet=quiet, min_score = min_score)
Rbind <- get_binding_position(Rprimer, model=x, tryRC=TRUE, quiet=quiet, min_score = min_score)
if(isTRUE(trimprimers)){
start_bind <- Fbind$end + 1
end_bind <- Rbind$start - 1
if(!quiet){message("Primer binding positions will be trimmed from model")}
} else if(isFALSE(trimprimers)){
start_bind <- Fbind$start
end_bind <- Rbind$end
} else{
stop("trimprimers must be either TRUE or FALSE")
}
} else if (is.null(positions) & is.null(primers)){
stop("A value must be provided to either positions or primers")
} else if(is.null(primers) & (!is.numeric(positions) | !length(positions) == 2) ){
stop("positions must be a numeric vector of length 2 containing the start and end position for model trimming")
} else if(is.null(positions) & (!is.character(primers) | !length(primers) == 2) ){
stop("primers must be a character vector of length 2, with the forward primer as the first element, and reverse primer as the second element")
} else if(!is.null(primers) & !is.null(positions)){
stop("Only one of primers or positions can be entered at a time")
} else {
stop("A value must be provided to either positions or primers")
}
if(is.na(start_bind) & is.numeric(end_bind)){
stop(paste0("Could not find binding position for Forward primer: ", Fprimer))
} else if(is.na(end_bind) & is.numeric(start_bind)){
stop(paste0("Could not find binding position for Reverse primer: ", Rprimer))
} else if(is.na(start_bind) & is.na(end_bind)){
stop(paste0("Could not find binding position for Forward primer: ", Fprimer, " and Reverse primer: ", Rprimer))
}
# Check that positions are right
if(end_bind < start_bind){
stop("Index error, end position must match or exceed the start position.")
}
if(end_bind > x$size){
stop(paste0("Index error, end position out of bounds. Input PHMM only has a length of: ", x$size))
}
#get the map positions for subsetting the alignment-based fields
map_start <- x$map[[start_bind]]
map_end <- x$map[[end_bind]]
#subset to get the new inserts
new_inserts <- x$inserts[map_start:map_end]
#subset to get the new insert lengths
new_insert_lengths <- x$insertlengths[map_start:map_end]
#reindex the insert length labels
if(!is.null(new_insert_lengths)){
names(new_insert_lengths) <- 0:(length(new_insert_lengths)-1)
}
#subset the mask
new_mask <- x$mask[map_start:map_end]
#subset the map
new_map <- x$map[start_bind:end_bind]
#set the first remaining map entry equal to one,
#make the necessary relative adjustment to all other positions
new_map <- new_map-(new_map[1]-1)
#reindex the labels
if(!is.null(new_map)){
names(new_map) <- 1:(length(new_map))
}
#Subset the A and E matrixies, reindex the positions
#A is 0 indexed, E is 1 indexed
new_A <- x$A[,start_bind:end_bind]
colnames(new_A) <- 0:(length(colnames(new_A))-1)
new_E <- x$E[,start_bind:end_bind]
colnames(new_E) <- 1:length(colnames(new_A))
#Create the new PHMM object
newPHMM = structure(list(name = x$name,
description = x$description,
size = (end_bind - start_bind)+1,
alphabet = x$alphabet,
A = new_A,
E = new_E,
qa = x$qa,
qe = x$qe,
inserts = new_inserts,
insertlengths = new_insert_lengths,
map = new_map,
date = x$date,
nseq = x$nseq,
weights = x$weights,
reference = x$reference,
mask = new_mask
), class = "PHMM")
message(paste0("PHMM model subset to ", newPHMM$size, " base pairs"))
return(newPHMM)
}
# Alignment entropy -------------------------------------------------------
#' Alignment entropy
#'
#' @param x A DNAbin or AAbin object
#' @param mask_gaps The threshold of gaps allowed before a position in the alignment is masked
#' @param count_gaps Whether gaps should be counted within entropy calculations. Default is FALSE.
#' @param method the method employed by `entropy::entropy` to estimate alignment entropy. Accepts:
#' "ML" : maximum likelihood
#' "MM" : bias-corrected maximum likelihood,
#' "Jeffreys" : Dirichlet with a=1/2
#' "Laplace" : Dirichlet with a=1
#' "SG" : Dirichlet with a=a=1/length(y)
#' "minimax" : Dirichlet with a=sqrt(sum(y))/length(y)
#' "CS" : ChaoShen
#' "NSB": Nemenman, Shafee and Biale (2002)
#' "shrink" : Shrinkage estimator
#' See the help page of `entropy::entropy` for more information
#' @param unit the unit in which entropy is measured. The default is "nats" (natural units). For computing entropy in "bits" set unit="log2".
#' @param return_extra Whether to return a dataframe including extra columns including individual base counts, gap proportions and number of bases at each position
#'
#' @return
#' @export
#' @import purrr
#' @importFrom entropy entropy
#'
#'
#'
alignment_entropy <- function (x, mask_gaps=0.2, count_gaps = FALSE, method="ML", unit="log", return_extra=FALSE) {
if ((mask_gaps < 0) | (mask_gaps > 1)) {
stop("mask_gaps should be a percentage (within the [0,1] range).")
}
if(class(x)=="DNAbin"){
x <- as.character(x)
x <- lapply(x, toupper)
names <- c("A", "C", "G", "T", "-")
} else if(class(x)=="AAbin"){
x <- as.character(x)
x <- lapply(x, toupper)
names <- c("A", "C", "D", "E", "F",
"G", "H", "I", "K", "L",
"M", "N", "P", "Q", "R",
"S", "T", "V", "W", "Y", "-")
}
if(count_gaps){
counter <- names
} else if(!count_gaps){
counter <- names[!names=="-"]
}
#Create matrix
suppressWarnings(MSA <- matrix(as.vector(unlist(x)), ncol = length(x[[1]]), byrow = TRUE))
n_seq <- length(MSA[,1])
n_pos <- length(MSA[1,])
#Summarise each position in alignment
i=1
df <- data.frame(matrix(ncol = length(names), nrow = n_pos))
colnames(df) <- names
for(i in 1:n_pos){
df[i,names] <- sapply(names, function(x){ sum(MSA[, i] == x)})
}
ent <- df %>%
dplyr::mutate(pos = rownames(.),
bases = n_seq - `-`,
prop_gaps = `-` / n_seq,
together = pmap(unname(.[counter]), c)) %>%
dplyr::rowwise() %>%
dplyr::mutate(ent = entropy::entropy(together, method=method, unit=unit)) %>%
dplyr::mutate(ent = dplyr::case_when(
prop_gaps > mask_gaps ~ as.numeric(NA),
prop_gaps <= mask_gaps ~ ent
)) %>%
dplyr::select(-together)
if(return_extra){
out <- ent
} else if(!return_extra){
out <- ent$ent
names(out) <- ent$pos
}
message("Masked ", sum(is.na(ent)), " alignment positions with over ", (mask_gaps*100), "% gaps")
return(out)
}
# Summarise fasta ---------------------------------------------------------
#' summarise_fasta
#'
#' @param x The location of a fasta file or gzipped fasta file.
#' @param label optional, Add an extra column with a label
#' @param origin optional, a table with sequence id numbers and their database origins
#'
#' @return
#' @export
#' @import stringr
#' @import dplyr
#' @importFrom Biostrings fasta.index
#' @importFrom stats quantile
#'
#'
summarise_fasta <- function(x, label=NULL, origin=NULL) {
if(is.null(origin)){
out <- Biostrings::fasta.index(x) %>%
dplyr::mutate(taxid = desc %>%
stringr::str_remove(pattern="(;)(.*?)(?=$)") %>%
stringr::str_remove(pattern="(^)(.*?)(?<=\\|)")) %>%
dplyr::summarise(nseqs = n(),
nspecies=n_distinct(taxid),
mean_length = mean(seqlength),
q0 = stats::quantile(seqlength, probs=0),
q25 = stats::quantile(seqlength, probs=0.25),
q50 = stats::quantile(seqlength, probs=0.5), # Q50 is median
q75 = stats::quantile(seqlength, probs=0.75),
q100 = stats::quantile(seqlength, probs=1)
)
} else if(is.data.frame(origin) | is_tibble(origin)){
if(any(duplicated(origin$seqid))){
stop("Origin table has duplicated seqids")
}
out <- Biostrings::fasta.index(x) %>%
dplyr::mutate(taxid = desc %>%
stringr::str_remove(pattern="(;)(.*?)(?=$)") %>%
stringr::str_remove(pattern="(^)(.*?)(?<=\\|)")) %>%
dplyr::mutate(seqid = desc %>%
stringr::str_remove(pattern="(\\|)(.*?)(?=$)")) %>%
dplyr::left_join(origin, by="seqid") %>%
dplyr::group_by(taxid) %>%
dplyr::mutate(origin = paste(sort(unique(origin)), collapse="/"))%>% #Combine origins to make sure they arent duplicated
dplyr::ungroup() %>%
dplyr::group_by(origin) %>%
dplyr::summarise(nseqs = n(),
nspecies=n_distinct(taxid),
mean_length = mean(seqlength),
q0 = stats::quantile(seqlength, probs=0),
q25 = stats::quantile(seqlength, probs=0.25),
q50 = stats::quantile(seqlength, probs=0.5), # Q50 is median
q75 = stats::quantile(seqlength, probs=0.75),
q100 = stats::quantile(seqlength, probs=1)
)
}
if(is.character(label)) {
out <- out %>%
dplyr::mutate(label = label)
}
return(out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.