R/nucleotide_diversity.R

Defines functions species_nucleotide_diversity number_sequences_by_points species_nucleotide_diversity_pts nuc_diversity_alignment align_sequence

Documented in align_sequence nuc_diversity_alignment number_sequences_by_points species_nucleotide_diversity species_nucleotide_diversity_pts

library(Biostrings)
#library(muscle)
library(tidyverse)
library(ape)
library(pegas)
library(future.apply)
library(data.table)

#' function to align sequences with MUSCLE
#' @export
align_sequence <- function(sequences, MinimumNumberOfSequencesBySpecies) {
  seqStringSet <- Biostrings::DNAStringSet(as.vector(sequences))
  ## check if at least 2 sequences 
  if(length(seqStringSet@ranges) < MinimumNumberOfSequencesBySpecies) {
    return(NA)
  } else {
    #alignement <- muscle::muscle(seqStringSet, quiet=TRUE)
    seqStringSetNoGap <- DECIPHER::RemoveGaps(seqStringSet,
                                    removeGaps = "all",
                                    processors = 1)
    alignement <- DECIPHER::AlignSeqs(seqStringSetNoGap, verbose= FALSE)
    
    #alignement <- msa::msaMuscle(seqStringSet)
    return(alignement)
  }
}

#' function to calculate div nuc
#' @export
nuc_diversity_alignment <- function(alignment) {
  if(typeof(alignment) != "logical") {
    return(pegas::nuc.div(ape::as.DNAbin(alignment)))
  } else {
    return(NA)
  }
}


#' nucleotide diversity by species
#' arguments:
#' * infobold a dataframe with taxonomy+sequence dna information
#' * buffer.Within a dataframe with row as indiv seq and col as rls pts
#' with TRUE/FALSE presence/absence id of sequence infobold within a given buffer around a given rls pts
#' * rls.pts.index index of the RLS pts
#' it calculates 
#' * alignment of sequences by species
#' * number of sequences by species
#' * nucleotide diversity
#' it returns
#' a dataframe with taxonomy and nucleotide diversity and number of sequences by species
#' species with not enough sequences to calculate nucleotide diversity are removed
#' @export
species_nucleotide_diversity_pts <- function(rls.pts.index, infobold, boldWithinRLS, MinimumNumberOfSequencesBySpecies) {
  seqboldwithin <- infobold[which(boldWithinRLS[, rls.pts.index] == TRUE),]
  seqboldwithinGroupSpecies <- seqboldwithin %>% dplyr::group_by(species_name)
  seqboldwithinUniqSpecies <- seqboldwithinGroupSpecies %>% dplyr::slice(1) %>% dplyr::ungroup()
  numberSequencesAli <- seqboldwithinGroupSpecies %>% dplyr::tally()
  groupSeq <- seqboldwithinGroupSpecies %>% dplyr::group_map(~Biostrings::DNAStringSet(as.vector(.x$sequence)))
  alignSeq <- purrr::map(groupSeq, ~ align_sequence(.x, MinimumNumberOfSequencesBySpecies))
  nucDivAll <- alignSeq %>% purrr::map(nuc_diversity_alignment) %>% unlist()
  seqboldwNucDiv <- data.table::data.table(species_name=seqboldwithinUniqSpecies$species_name,
                               fishbase_species_name=seqboldwithinUniqSpecies$fishbase_species_name,
                               genus_name=seqboldwithinUniqSpecies$genus_name,
                               family_name=seqboldwithinUniqSpecies$family_name,
                               order_name=seqboldwithinUniqSpecies$order_name,
                               class_name=seqboldwithinUniqSpecies$class_name,
                               nucDiv=nucDivAll,
                               number_individuals=numberSequencesAli$n,
                               rls.pts.index=rep(rls.pts.index,length(nucDivAll))
                              )
  seqboldwNucDiv <- seqboldwNucDiv[which(!is.na(seqboldwNucDiv$nucDiv)),]
  print("done")
  ## clear
  
  print(rls.pts.index)
  return(seqboldwNucDiv)
}


#' number of bold sequence into a buffer around a RLS point
#' @export
number_sequences_by_points <- function(sequenceWithinBuffer) {
  numberOfSequencesByPoints <- data.frame(apply(sequenceWithinBuffer, 2, function(x) length(which(x))))
  names(numberOfSequencesByPoints) <- "numberOfSequencesPoints"
  return(numberOfSequencesByPoints)
}

#' nucleotide diversity by species applied to all RLS pts
#' @export
species_nucleotide_diversity <- function(infobold, sequenceWithinBuffer, MinimumNumberOfSequencesBySpecies) {
  ## keep only buffer with at least 3 sequences
  infoboldSelectedCol <- data.table::data.table(species_name=infobold$species_name,
                                                fishbase_species_name=infobold$fishbase_species_name,
                                                genus_name=infobold$genus_name,
                                                family_name=infobold$family_name,
                                                order_name=infobold$order_name,
                                                class_name=infobold$class_name,
                                                sequence=infobold$sequence
                                                )
  numberOfSequencesByPoints <- number_sequences_by_points(sequenceWithinBuffer)
  rls.pts.ids <- as.vector(which(numberOfSequencesByPoints > 2))
  ## calculate nucdiv for each points
  nucDivPts <- lapply(rls.pts.ids,
                                           FUN = function(i) {
                                             species_nucleotide_diversity_pts(i,
                                                                              infoboldSelectedCol,
                                                                              sequenceWithinBuffer,
                                                                              MinimumNumberOfSequencesBySpecies
                                             )
                                           })
  
  
  nucDivPts.df <- dplyr::bind_rows(nucDivPts)
  return(nucDivPts.df)
}
Grelot/geogendivr documentation built on Sept. 3, 2020, 6:25 p.m.