library(Biostrings)
#library(muscle)
library(tidyverse)
library(ape)
library(pegas)
library(future.apply)
library(data.table)
#' function to align sequences with MUSCLE
#' @export
align_sequence <- function(sequences, MinimumNumberOfSequencesBySpecies) {
seqStringSet <- Biostrings::DNAStringSet(as.vector(sequences))
## check if at least 2 sequences
if(length(seqStringSet@ranges) < MinimumNumberOfSequencesBySpecies) {
return(NA)
} else {
#alignement <- muscle::muscle(seqStringSet, quiet=TRUE)
seqStringSetNoGap <- DECIPHER::RemoveGaps(seqStringSet,
removeGaps = "all",
processors = 1)
alignement <- DECIPHER::AlignSeqs(seqStringSetNoGap, verbose= FALSE)
#alignement <- msa::msaMuscle(seqStringSet)
return(alignement)
}
}
#' function to calculate div nuc
#' @export
nuc_diversity_alignment <- function(alignment) {
if(typeof(alignment) != "logical") {
return(pegas::nuc.div(ape::as.DNAbin(alignment)))
} else {
return(NA)
}
}
#' nucleotide diversity by species
#' arguments:
#' * infobold a dataframe with taxonomy+sequence dna information
#' * buffer.Within a dataframe with row as indiv seq and col as rls pts
#' with TRUE/FALSE presence/absence id of sequence infobold within a given buffer around a given rls pts
#' * rls.pts.index index of the RLS pts
#' it calculates
#' * alignment of sequences by species
#' * number of sequences by species
#' * nucleotide diversity
#' it returns
#' a dataframe with taxonomy and nucleotide diversity and number of sequences by species
#' species with not enough sequences to calculate nucleotide diversity are removed
#' @export
species_nucleotide_diversity_pts <- function(rls.pts.index, infobold, boldWithinRLS, MinimumNumberOfSequencesBySpecies) {
seqboldwithin <- infobold[which(boldWithinRLS[, rls.pts.index] == TRUE),]
seqboldwithinGroupSpecies <- seqboldwithin %>% dplyr::group_by(species_name)
seqboldwithinUniqSpecies <- seqboldwithinGroupSpecies %>% dplyr::slice(1) %>% dplyr::ungroup()
numberSequencesAli <- seqboldwithinGroupSpecies %>% dplyr::tally()
groupSeq <- seqboldwithinGroupSpecies %>% dplyr::group_map(~Biostrings::DNAStringSet(as.vector(.x$sequence)))
alignSeq <- purrr::map(groupSeq, ~ align_sequence(.x, MinimumNumberOfSequencesBySpecies))
nucDivAll <- alignSeq %>% purrr::map(nuc_diversity_alignment) %>% unlist()
seqboldwNucDiv <- data.table::data.table(species_name=seqboldwithinUniqSpecies$species_name,
fishbase_species_name=seqboldwithinUniqSpecies$fishbase_species_name,
genus_name=seqboldwithinUniqSpecies$genus_name,
family_name=seqboldwithinUniqSpecies$family_name,
order_name=seqboldwithinUniqSpecies$order_name,
class_name=seqboldwithinUniqSpecies$class_name,
nucDiv=nucDivAll,
number_individuals=numberSequencesAli$n,
rls.pts.index=rep(rls.pts.index,length(nucDivAll))
)
seqboldwNucDiv <- seqboldwNucDiv[which(!is.na(seqboldwNucDiv$nucDiv)),]
print("done")
## clear
print(rls.pts.index)
return(seqboldwNucDiv)
}
#' number of bold sequence into a buffer around a RLS point
#' @export
number_sequences_by_points <- function(sequenceWithinBuffer) {
numberOfSequencesByPoints <- data.frame(apply(sequenceWithinBuffer, 2, function(x) length(which(x))))
names(numberOfSequencesByPoints) <- "numberOfSequencesPoints"
return(numberOfSequencesByPoints)
}
#' nucleotide diversity by species applied to all RLS pts
#' @export
species_nucleotide_diversity <- function(infobold, sequenceWithinBuffer, MinimumNumberOfSequencesBySpecies) {
## keep only buffer with at least 3 sequences
infoboldSelectedCol <- data.table::data.table(species_name=infobold$species_name,
fishbase_species_name=infobold$fishbase_species_name,
genus_name=infobold$genus_name,
family_name=infobold$family_name,
order_name=infobold$order_name,
class_name=infobold$class_name,
sequence=infobold$sequence
)
numberOfSequencesByPoints <- number_sequences_by_points(sequenceWithinBuffer)
rls.pts.ids <- as.vector(which(numberOfSequencesByPoints > 2))
## calculate nucdiv for each points
nucDivPts <- lapply(rls.pts.ids,
FUN = function(i) {
species_nucleotide_diversity_pts(i,
infoboldSelectedCol,
sequenceWithinBuffer,
MinimumNumberOfSequencesBySpecies
)
})
nucDivPts.df <- dplyr::bind_rows(nucDivPts)
return(nucDivPts.df)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.