#' Removes duplicated accessions of the same species in DNA alignments
#'
#' @author Domingos Cardoso
#'
#' @description This function drops the smaller sequence(s) (with missing characters)
#' for any species duplicated with multiple accessions in the DNA alignment. You
#' can run dropSeq with multiple DNA alignments, but then we recommend to run
#' \code{\link{catmultGenes}} first.
#'
#' @usage
#' dropSeq(\dots,
#' shortaxlabel = FALSE)
#'
#' @param ... one or a list of NEXUS-formatted gene datasets as loaded by ape's
#' \code{\link{read.nexus.data}}, for example. You can also load the resulting list
#' of equally-sized dataframes containing the input gene datasets, as generated by
#' the function \code{\link{catmultGenes}}.
#'
#' @param shortaxlabel Logical, if \code{TRUE} the final individual gene dataset will delete
#' the accession numbers associated with each species or sequence.
#'
#' @return A list of dataframe(s) containing the input DNA alignment(s), where duplicated
#' accessions of the same species are removed.
#'
#' @seealso \code{\link{catmultGenes}}
#' @seealso \code{\link{writeNexus}}
#' @seealso \code{\link{writePhylip}}
#' @seealso \code{\link{nexusdframe}}
#' @seealso \code{\link{phylipdframe}}
#' @seealso \code{\link{fastadframe}}
#'
#' @examples \dontrun{
#' data(Ormosia)
#'
#' # Run dropSeq for one or more individual DNA alignment and then save each
#' # dataset with non-duplicated species using nexusdframe, phylipdframe or fastadframe
#' df <- dropSeq(Ormosia)
#' ITS <- df[[1]]
#' nexusdframe(ITS, file = "filename.nex")
#'
#'
#' # Run function catmultGenes first
#' catdf <- catmultGenes(Ormosia,
#' maxspp = TRUE,
#' shortaxlabel = FALSE,
#' missdata = TRUE)
#'
#' # Run dropSeq for the entire concatenated DNA alignments
#' catdf <- dropSeq(catdf,
#' shortaxlabel = FALSE)
#'
#' # Then save the concatenated DNA alignment
#' writeNexus(catdf,
#' file = "filename.nex",
#' bayesblock = TRUE,
#' interleave = TRUE)
#' }
#'
#' @importFrom dplyr select filter
#' @importFrom tibble add_column
#' @importFrom tidyr unite
#' @importFrom magrittr "%>%"
#' @importFrom stringr str_extract_all
#' @importFrom stats setNames
#'
#' @export
dropSeq <- function(...,
shortaxlabel = FALSE){
datset <- .namedlist(...)
# Removing sequences for a dataset already passed through the catmultGenes
if (names(datset[[1]][[1]])[1] == "species" &
names(datset[[1]][[1]])[2] == "sequence") {
datset <- datset[[1]]
datset_orig <- datset
datset_temp <- datset
for (i in seq_along(datset)) {
datset[[i]] <- tibble::add_column(datset[[i]], full_sp_names = datset[[i]][["species"]], .after = "species")
}
numberdatset <- length(datset)
cf <- lapply(datset, function(x) grepl("_cf_", x[[1]]))
aff <- lapply(datset, function(x) grepl("_aff_", x[[1]]))
spp_temp <- lapply(datset, function(x) gsub("_aff_|_cf_", " ", x[[1]]))
infraspp <- lapply(spp_temp, function(x) grepl("[[:upper:]][[:lower:]]+_[[:lower:]]+_[[:lower:]]+", x))
if (any(unlist(cf))|any(unlist(aff))|any(unlist(infraspp))) {
nr <- .namesTorename(datset,
cf = cf,
aff = aff,
infraspp = infraspp)
# Adjusting species labels when they have cf. or aff.
# Adjusting species names with infraspecific taxa
datset_temp <- .adjustnames(datset_temp,
cf = cf,
aff = aff,
infraspp = infraspp)
}
# Keeping the species column in just the first dataset.
for (i in 2:numberdatset) {
datset_temp[[i]] <- data.frame(sequences=datset_temp[[i]][,2])
}
ex_datset <- do.call("cbind", datset_temp)
names(ex_datset) <- paste("sequence", 1:length(ex_datset), sep = "")
ex_species <- dplyr::select(ex_datset, 1)
names(ex_species) <- "species"
ex_datset <- dplyr::select(ex_datset, -1)
# Uniting all columns of DNA sequence data.
ex_datset <- tidyr::unite(ex_datset, "sequence", sep = "")
ex_datset$sequence <- as.character(ex_datset$sequence)
datset_temp <- cbind(ex_species, ex_datset)
# Get number of missing data "N" and "?" in each sequence.
missdata <- vector()
missdataN <- vector()
misstotal <- vector()
for (i in seq_along(datset_temp$sequence)){
missdata[i] <- length(stringr::str_extract_all(datset_temp$sequence[i], "[?]", simplify = FALSE)[[1]])
missdataN[i] <- length(stringr::str_extract_all(datset_temp$sequence[i], "N", simplify = FALSE)[[1]])
misstotal[i] <- missdata[i] + missdataN[i]
}
# Adding the total number of missing char in each sequence into a newly
# created column in both the temp dataset and each individual gene dataset.
datset_temp <- tibble::add_column(datset_temp, missites = misstotal, .after = "species")
for (i in seq_along(datset)) {
datset[[i]] <- tibble::add_column(datset[[i]], missites = misstotal, .after = "species")
datset_orig[[i]] <- tibble::add_column(datset_orig[[i]], missites = misstotal, .after = "species")
}
datset_tempB <- datset_temp
datset_tempB$species <- gsub("(_[^_]+)_.*", "\\1", datset_tempB$species)
multispp <- c(duplicated(datset_tempB$species, fromLast = TRUE) |
duplicated(datset_tempB$species))
datset_temp <- tibble::add_column(datset_temp, duplicate = multispp, .after = "species")
# Inserting a temp column with species name without voucher.
datset_temp <- tibble::add_column(datset_temp, species_temp = datset_tempB$species, .after = "species")
for (i in seq_along(datset)) {
datset[[i]] <- tibble::add_column(datset[[i]], duplicate = multispp, .after = "species")
datset[[i]] <- tibble::add_column(datset[[i]], species_temp = datset_tempB$species, .after = "species")
datset_orig[[i]] <- tibble::add_column(datset_orig[[i]], duplicate = multispp, .after = "species")
datset_orig[[i]] <- tibble::add_column(datset_orig[[i]], species_temp = datset_tempB$species, .after = "species")
}
# For each duplicated species with seqs that have the same length, keep just one of them.
# Do this for both the temp gene dataset and then in a loop for each individual gene dataset.
datset_temp <- datset_temp[duplicated(datset_temp[,c("species_temp","missites")],
fromLast = FALSE) == FALSE,]
multispp <- c(duplicated(datset_temp[,"species_temp"], fromLast = TRUE) |
duplicated(datset_temp[,"species_temp"]))
datset_temp$duplicate <- multispp
for (i in seq_along(datset)) {
datset[[i]] <- datset[[i]][duplicated(datset[[i]][,c("species_temp","missites")],
fromLast = FALSE) == FALSE,]
datset[[i]][["duplicate"]] <- multispp
datset_orig[[i]] <- datset_orig[[i]][duplicated(datset_orig[[i]][,c("species_temp","missites")],
fromLast = FALSE) == FALSE,]
datset_orig[[i]][["duplicate"]] <- multispp
}
# Removing the smaller seqs in duplicated species, from both the temp dataset and
# each of the individual gene dataset
for (i in unique(datset_temp$species_temp)) {
tokeep <- vector()
tokeep <- datset_temp$species_temp == i & datset_temp$missites == min(datset_temp$missites[datset_temp$species_temp == i])
datset_temp$duplicate[tokeep] <- FALSE
}
datset_temp <- datset_temp %>% filter(duplicate == FALSE) %>% select(c("species", "sequence"))
cf <- lapply(datset, function(x) grepl("_cf_", x[[1]]))
aff <- lapply(datset, function(x) grepl("_aff_", x[[1]]))
spp_temp <- lapply(datset, function(x) gsub("_aff_|_cf_", " ", x[[1]]))
infraspp <- lapply(spp_temp, function(x) grepl("[[:upper:]][[:lower:]]+_[[:lower:]]+_[[:lower:]]+", x))
if (any(unlist(cf))|any(unlist(aff))|any(unlist(infraspp))) {
for (j in seq_along(datset)) {
datset[[j]] <- tibble::add_column(datset[[j]], cf_aff_infraspp = NA, .after = "species")
datset[[j]][["cf_aff_infraspp"]][cf[[j]]] <- TRUE
datset[[j]][["cf_aff_infraspp"]][aff[[j]]] <- TRUE
datset[[j]][["cf_aff_infraspp"]][infraspp[[j]]] <- TRUE
datset[[j]][["cf_aff_infraspp"]][is.na(datset[[j]][["cf_aff_infraspp"]])] <- FALSE
datset_orig[[j]] <- tibble::add_column(datset_orig[[j]], cf_aff_infraspp = unlist(datset[[j]][["cf_aff_infraspp"]]), .after = "species")
tokeep <- vector()
for (i in unique(datset[[j]][["species_temp"]])) {
tokeep <- datset[[j]][["species_temp"]] == i & datset[[j]][["missites"]] == min(datset[[j]][["missites"]][datset[[j]][["species_temp"]] == i])
datset[[j]][["duplicate"]][tokeep] <- FALSE
}
datset[[j]][["species"]][datset[[j]][["cf_aff_infraspp"]] == T] <- gsub("(_[^_]+_[^_]+)_.*", "\\1", datset[[j]][["species"]][datset[[j]][["cf_aff_infraspp"]] == T])
datset[[j]][["species"]][datset[[j]][["cf_aff_infraspp"]] == F] <- gsub("(_[^_]+)_.*", "\\1", datset[[j]][["species"]][datset[[j]][["cf_aff_infraspp"]] == F])
datset_orig[[j]][["species"]][datset_orig[[j]][["cf_aff_infraspp"]] == T] <- gsub("(_[^_]+_[^_]+)_.*", "\\1", datset_orig[[j]][["species"]][datset_orig[[j]][["cf_aff_infraspp"]] == T])
datset_orig[[j]][["species"]][datset_orig[[j]][["cf_aff_infraspp"]] == F] <- gsub("(_[^_]+)_.*", "\\1", datset_orig[[j]][["species"]][datset_orig[[j]][["cf_aff_infraspp"]] == F])
datset[[j]] <- datset[[j]] %>% filter(duplicate == FALSE) %>% select(c("species", "full_sp_names", "sequence"))
datset_orig[[j]] <- datset_orig[[j]] %>% select(c("species", "sequence"))
}
} else {
for (j in seq_along(datset)) {
tokeep <- vector()
for (i in unique(datset[[j]][["species_temp"]])) {
tokeep <- datset[[j]][["species_temp"]] == i & datset[[j]][["missites"]] == min(datset[[j]][["missites"]][datset[[j]][["species_temp"]] == i])
datset[[j]][["duplicate"]][tokeep] <- FALSE
}
datset[[j]] <- datset[[j]] %>% filter(duplicate == FALSE) %>% select(c("species", "full_sp_names", "sequence"))
datset[[j]][["species"]] <- gsub("(_[^_]+)_.*", "\\1", datset[[j]][["species"]])
datset_orig[[j]][["species"]] <- gsub("(_[^_]+)_.*", "\\1", datset_orig[[j]][["species"]])
}
}
############################################################################
# Replacing missing seqs in any gene
missdata <- list()
for (j in seq_along(datset)){
for (i in seq_along(datset[[j]][["sequence"]])){
missdata[[i]] <- length(stringr::str_extract_all(datset[[j]][["sequence"]][i], "[?]", simplify = FALSE)[[1]])
}
datset[[j]] <- tibble::add_column(datset[[j]], missites = unlist(missdata), .after = "species")
}
missdata <- list()
for (j in seq_along(datset_orig)) {
for (i in seq_along(datset_orig[[j]][["sequence"]])) {
missdata[[i]] <- length(stringr::str_extract_all(datset_orig[[j]][["sequence"]][i], "[?]", simplify = FALSE)[[1]])
}
datset_orig[[j]] <- tibble::add_column(datset_orig[[j]], missites = unlist(missdata), .after = "species")
}
# Droping smaller duplicated seqs and empty seqs
for (i in seq_along(datset_orig)) {
multispp <- list()
multispp[[i]] <- c(duplicated(datset_orig[[i]][,"species"], fromLast = TRUE) |
duplicated(datset_orig[[i]][,"species"]))
datset_orig[[i]] <- tibble::add_column(datset_orig[[i]], duplicate = unlist(multispp), .after = "species")
# Inserting a temp column with species name without voucher
datset_orig[[i]] <- tibble::add_column(datset_orig[[i]], species_temp = datset_orig[[i]][,"species"], .after = "species")
}
# Arranging by species name each dataframe inside the list
datset_orig <- lapply(datset_orig, function(x) x[order(x$species),])
## Deleting duplicated accessions
for (i in seq_along(datset_orig)) {
multispp <- vector()
datset_orig[[i]] <- datset_orig[[i]][duplicated(datset_orig[[i]][,c("species_temp","missites")],
fromLast = FALSE)==F,]
multispp <- c(duplicated(datset_orig[[i]][,"species_temp"], fromLast = TRUE) |
duplicated(datset_orig[[i]][,"species_temp"]))
datset_orig[[i]][["duplicate"]] <- multispp
}
for (j in seq_along(datset_orig)) {
tokeep <- vector()
for (i in unique(datset_orig[[j]][["species_temp"]])) {
tokeep <- datset_orig[[j]][["species_temp"]] == i & datset_orig[[j]][["missites"]] == min(datset_orig[[j]][["missites"]][datset_orig[[j]][["species_temp"]] == i])
datset_orig[[j]][["duplicate"]][tokeep] <- FALSE
}
datset_orig[[j]] <- datset_orig[[j]] %>% filter(duplicate == FALSE)
}
for (j in seq_along(datset_orig)) {
todel <- vector()
for (i in datset_orig[[j]][["species_temp"]]) {
todel <- datset_orig[[j]][["species_temp"]] == i & datset_orig[[j]][["missites"]] == max(datset_orig[[j]][["missites"]])
datset_orig[[j]][["duplicate"]][todel] <- TRUE
}
datset_orig[[j]] <- datset_orig[[j]] %>% filter(duplicate == FALSE)
}
# Inserting the missing seqs
misseqs <- list()
seqs <- list()
misspp <- list()
for (i in seq_along(datset)) {
misseqs[[i]] <- datset[[i]][["species"]][datset[[i]][["missites"]] == max(datset[[i]][["missites"]])]
seqs[[i]] <- datset[[i]][["sequence"]]
misspp[[i]] <- datset_orig[[i]][["species"]][datset_orig[[i]][["species"]] %in% misseqs[[i]]]
seqs[[i]][datset[[i]][["species"]] %in% misspp[[i]]] <-
datset_orig[[i]][["sequence"]][datset_orig[[i]][["species"]] %in% misseqs[[i]]]
datset[[i]][["sequence"]] <- seqs[[i]]
}
for (j in seq_along(datset)) {
for (i in seq_along(datset[[j]][["species"]])) {
if (datset[[j]][["missites"]][i] == nchar(datset[[j]][["sequence"]][1])) {
datset[[j]][["species"]][i] <- datset[[j]][["species"]][i]
} else {
datset[[j]][["species"]][i] <- datset[[j]][["full_sp_names"]][i]
}
}
datset[[j]] <- datset[[j]] %>% select(c("species", "sequence"))
if (shortaxlabel) {
g <- grepl("[[:lower:]]+_[[:lower:]]+_[[:lower:]]+|[[:upper:]]+_[[:lower:]]+_[[:lower:]]+", datset[[j]][["species"]])
datset[[j]][["species"]][g] <- gsub("(_[^_]+_[^_]+)_.*", "\\1", datset[[j]][["species"]][g])
datset[[j]][["species"]][!g] <- gsub("(_[^_]+)_.*", "\\1", datset[[j]][["species"]][!g])
}
}
} else {
# Droping duplicated sequences for one or a list of DNA alignments with difering
# number of sequences. This code below will work despite having run the function catmultGenes
numberdatset <- length(datset)
if (numberdatset == 0) {
stop("You must provide at least ONE gene dataset in the following format:
datset=gene1, gene2, gene3... or a list of genes in a single vector
Find help also at DBOSLab-UFBA
(Domingos Cardoso; cardosobot@gmail.com)")
}
if (numberdatset == 1 & !any(lapply(datset[[1]], class) == "list")) {
temp_name <- names(datset)
datset <- datset[[1]]
for (i in 1:length(datset)) {
datset[[i]] <- paste(datset[[i]], collapse = "")
datset[[i]] <- toupper(datset[[i]])
}
spp <- stats::setNames(data.frame(names(datset)), "species")
seqs <- stats::setNames(data.frame(unlist(datset, use.names = FALSE)), "sequence")
datset <- cbind(spp, seqs)
datset <- list(datset)
names(datset) <- temp_name
} else {
# Tranforming the original matrix as read by ape into list of dataframes
datset <- datset[[1]]
cf <- lapply(datset, function(x) grepl("_cf_", names(x)))
aff <- lapply(datset, function(x) grepl("_aff_", names(x)))
spp_temp <- lapply(datset, function(x) gsub("_aff_|_cf_", " ", names(x)))
infraspp <- lapply(spp_temp, function(x) grepl("[[:upper:]][[:lower:]]+_[[:lower:]]+_[[:lower:]]+",
x))
if (any(unlist(cf))|any(unlist(aff))|any(unlist(infraspp))) {
nr <- .namesTorename(datset,
cf = cf,
aff = aff,
infraspp = infraspp)
# Adjusting species labels when they have cf or aff
# Adjusting species names with infraspecific taxa
datset <- .adjustnames(datset,
cf = cf,
aff = aff,
infraspp = infraspp)
}
temp_name <- names(datset)
for (j in seq_along(datset)) {
for (i in 1:length(datset[[j]])) {
datset[[j]][[i]] <- paste(datset[[j]][[i]], collapse = "")
datset[[j]][[i]] <- toupper(datset[[j]][[i]])
}
spp <- stats::setNames(data.frame(names(datset[[j]])), "species")
seqs <- stats::setNames(data.frame(unlist(datset[[j]], use.names = FALSE)), "sequence")
datset[[j]] <- cbind(spp, seqs)
datset[[j]] <- list(datset[[j]])
}
# Extracting the dataframes inside a list of list
datset_temp <- list()
for (i in seq_along(datset)) {
datset_temp[[i]] <- datset[[i]][[1]]
}
datset <- datset_temp
names(datset) <- temp_name
}
# Get number of missing data "N" and "?" in each sequence
missdata <- list()
missdataN <- list()
#misstotal <- list()
for (j in seq_along(datset)) {
misstotal_temp <- list()
for (i in seq_along(datset[[j]][["sequence"]])) {
missdata[i] <- length(stringr::str_extract_all(datset[[j]][["sequence"]][i], "[?]", simplify = FALSE)[[1]])
missdataN[i] <- length(stringr::str_extract_all(datset[[j]][["sequence"]][i], "N", simplify = FALSE)[[1]])
misstotal_temp[i] <- missdata[[i]] + missdataN[[i]]
}
# Adding the total number of missing char in each sequence into a newly created column
# misstotal[[j]] <- unlist(misstotal_temp)
datset[[j]] <- tibble::add_column(datset[[j]], missites = unlist(misstotal_temp), .after = "species")
}
# Applying the function shortaxlabels to each dataframe inside the list
datset_temp <- lapply(datset, .shortaxlabels)
for (i in seq_along(datset)) {
multispp <- list()
multispp[[i]] <- c(duplicated(datset_temp[[i]][,"species"], fromLast = TRUE) |
duplicated(datset_temp[[i]][,"species"]))
datset[[i]] <- tibble::add_column(datset[[i]], duplicate = unlist(multispp), .after = "species")
# Inserting a temp column with species name without voucher
datset[[i]] <- tibble::add_column(datset[[i]], species_temp = datset_temp[[i]][,"species"], .after = "species")
}
# Arranging by species name each dataframe inside the list
datset <- lapply(datset, function(x) x[order(x$species),])
#datset <- lapply(datset, function(x) dplyr::arrange(x, x[,1]))
## Deleting duplicate accessions when there is only one gene dataset
# Get first how many sequences are there in each dataset
nseqs <- as.vector(sapply(datset, nrow))
if (length(datset) == 1 | length(datset) > 1 & !equalnumb(nseqs)) {
cat("For each species duplicated with multiple accessions...", sep = "\n")
cat("Smaller sequences (with more missing data) were dropped.", sep = "\n")
if (length(datset) > 1 & !equalnumb(nseqs)) {
cat("NOTE: You have entered more than one DNA alignment, each with differing number of sequences or taxa!", sep = "\n")
cat("Consider running the concatenating function catmultGenes first, before using dropSeq", sep = "\n")
}
for (i in seq_along(datset)) {
multispp <- vector()
datset[[i]] <- datset[[i]][duplicated(datset[[i]][,c("species_temp","missites")],
fromLast = FALSE)==F,]
multispp <- c(duplicated(datset[[i]][,"species_temp"], fromLast = TRUE) |
duplicated(datset[[i]][,"species_temp"]))
datset[[i]][["duplicate"]] <- multispp
}
for (j in seq_along(datset)) {
tokeep <- vector()
for (i in unique(datset[[j]][["species_temp"]])) {
tokeep <- datset[[j]][["species_temp"]] == i & datset[[j]][["missites"]] == min(datset[[j]][["missites"]][datset[[j]][["species_temp"]] == i])
datset[[j]][["duplicate"]][tokeep] <- FALSE
}
datset[[j]] <- datset[[j]] %>% filter(duplicate == FALSE) %>% select(c("species", "sequence"))
}
}
if (any(unlist(cf))|any(unlist(aff))|any(unlist(infraspp))) {
# Putting back the names under cf. and aff.
# Adjusting names with infraspecific taxa
datset <- .namesback(datset,
cf = cf,
aff = aff,
infraspp = infraspp,
rename_cf = nr[["rename_cf"]],
rename_aff = nr[["rename_aff"]],
rename_infraspp = nr[["rename_infraspp"]],
shortaxlabel = TRUE,
multispp = TRUE)
}
}
return(datset)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.