R/dropSeq.R

Defines functions dropSeq

Documented in dropSeq

#' Removes duplicated accessions of the same species in DNA alignments
#'
#' @author Domingos Cardoso
#'
#' @description This function drops the smaller sequence(s) (with missing characters)
#' for any species duplicated with multiple accessions in the DNA alignment. You
#' can run dropSeq with multiple DNA alignments, but then we recommend to run
#' \code{\link{catmultGenes}} first.
#'
#' @usage
#' dropSeq(\dots,
#'         shortaxlabel = FALSE)
#'
#' @param ... one or a list of NEXUS-formatted gene datasets as loaded by ape's
#' \code{\link{read.nexus.data}}, for example. You can also load the resulting list
#' of equally-sized dataframes containing the input gene datasets, as generated by
#' the function \code{\link{catmultGenes}}.
#'
#' @param shortaxlabel Logical, if \code{TRUE} the final individual gene dataset will delete
#' the accession numbers associated with each species or sequence.
#'
#' @return A list of dataframe(s) containing the input DNA alignment(s), where duplicated
#' accessions of the same species are removed.
#'
#' @seealso \code{\link{catmultGenes}}
#' @seealso \code{\link{writeNexus}}
#' @seealso \code{\link{writePhylip}}
#' @seealso \code{\link{nexusdframe}}
#' @seealso \code{\link{phylipdframe}}
#' @seealso \code{\link{fastadframe}}
#'
#' @examples \dontrun{
#' data(Ormosia)
#'
#' # Run dropSeq for one or more individual DNA alignment and then save each
#' # dataset with non-duplicated species using nexusdframe, phylipdframe or fastadframe
#' df <- dropSeq(Ormosia)
#' ITS <- df[[1]]
#' nexusdframe(ITS, file = "filename.nex")
#'
#'
#' # Run function catmultGenes first
#' catdf <- catmultGenes(Ormosia,
#'                       maxspp = TRUE,
#'                       shortaxlabel = FALSE,
#'                       missdata = TRUE)
#'
#' # Run dropSeq for the entire concatenated DNA alignments
#' catdf <- dropSeq(catdf,
#'                  shortaxlabel = FALSE)
#'
#' # Then save the concatenated DNA alignment
#' writeNexus(catdf,
#'            file = "filename.nex",
#'            bayesblock = TRUE,
#'            interleave = TRUE)
#' }
#'
#' @importFrom dplyr select filter
#' @importFrom tibble add_column
#' @importFrom tidyr unite
#' @importFrom magrittr "%>%"
#' @importFrom stringr str_extract_all
#' @importFrom stats setNames
#'
#' @export

dropSeq <- function(...,
                    shortaxlabel = FALSE){

  datset <- .namedlist(...)

  # Removing sequences for a dataset already passed through the catmultGenes
  if (names(datset[[1]][[1]])[1] == "species" &
      names(datset[[1]][[1]])[2] == "sequence") {

    datset <- datset[[1]]
    datset_orig <- datset
    datset_temp <- datset
    for (i in seq_along(datset)) {
      datset[[i]] <- tibble::add_column(datset[[i]], full_sp_names = datset[[i]][["species"]], .after = "species")
    }
    numberdatset <- length(datset)

    cf <- lapply(datset, function(x) grepl("_cf_", x[[1]]))
    aff <- lapply(datset, function(x) grepl("_aff_", x[[1]]))
    spp_temp <- lapply(datset, function(x) gsub("_aff_|_cf_", " ", x[[1]]))
    infraspp <- lapply(spp_temp, function(x) grepl("[[:upper:]][[:lower:]]+_[[:lower:]]+_[[:lower:]]+", x))

    if (any(unlist(cf))|any(unlist(aff))|any(unlist(infraspp))) {

      nr <- .namesTorename(datset,
                           cf = cf,
                           aff = aff,
                           infraspp = infraspp)

      # Adjusting species labels when they have cf. or aff.
      # Adjusting species names with infraspecific taxa
      datset_temp <- .adjustnames(datset_temp,
                                  cf = cf,
                                  aff = aff,
                                  infraspp = infraspp)
    }


    # Keeping the species column in just the first dataset.
    for (i in 2:numberdatset) {
      datset_temp[[i]] <- data.frame(sequences=datset_temp[[i]][,2])
    }

    ex_datset <- do.call("cbind", datset_temp)
    names(ex_datset) <- paste("sequence", 1:length(ex_datset), sep = "")
    ex_species <- dplyr::select(ex_datset, 1)
    names(ex_species) <- "species"

    ex_datset <- dplyr::select(ex_datset, -1)
    # Uniting all columns of DNA sequence data.
    ex_datset <- tidyr::unite(ex_datset, "sequence", sep = "")
    ex_datset$sequence <- as.character(ex_datset$sequence)

    datset_temp <- cbind(ex_species, ex_datset)

    # Get number of missing data "N" and "?" in each sequence.
    missdata <- vector()
    missdataN <- vector()
    misstotal <- vector()
    for (i in seq_along(datset_temp$sequence)){
      missdata[i] <- length(stringr::str_extract_all(datset_temp$sequence[i], "[?]", simplify = FALSE)[[1]])
      missdataN[i] <- length(stringr::str_extract_all(datset_temp$sequence[i], "N", simplify = FALSE)[[1]])
      misstotal[i] <- missdata[i] + missdataN[i]
    }
    # Adding the total number of missing char in each sequence into a newly
    # created column in both the temp dataset and each individual gene dataset.
    datset_temp <- tibble::add_column(datset_temp, missites = misstotal, .after = "species")
    for (i in seq_along(datset)) {
      datset[[i]] <- tibble::add_column(datset[[i]], missites = misstotal, .after = "species")
      datset_orig[[i]] <- tibble::add_column(datset_orig[[i]], missites = misstotal, .after = "species")
    }

    datset_tempB <- datset_temp
    datset_tempB$species <- gsub("(_[^_]+)_.*", "\\1", datset_tempB$species)

    multispp <- c(duplicated(datset_tempB$species, fromLast = TRUE) |
                    duplicated(datset_tempB$species))
    datset_temp <- tibble::add_column(datset_temp, duplicate = multispp, .after = "species")
    # Inserting a temp column with species name without voucher.
    datset_temp <- tibble::add_column(datset_temp, species_temp = datset_tempB$species, .after = "species")
    for (i in seq_along(datset)) {
      datset[[i]] <- tibble::add_column(datset[[i]], duplicate = multispp, .after = "species")
      datset[[i]] <- tibble::add_column(datset[[i]], species_temp = datset_tempB$species, .after = "species")
      datset_orig[[i]] <- tibble::add_column(datset_orig[[i]], duplicate = multispp, .after = "species")
      datset_orig[[i]] <- tibble::add_column(datset_orig[[i]], species_temp = datset_tempB$species, .after = "species")
    }

    # For each duplicated species with seqs that have the same length, keep just one of them.
    # Do this for both the temp gene dataset and then in a loop for each individual gene dataset.
    datset_temp <- datset_temp[duplicated(datset_temp[,c("species_temp","missites")],
                                          fromLast = FALSE) == FALSE,]
    multispp <- c(duplicated(datset_temp[,"species_temp"], fromLast = TRUE) |
                    duplicated(datset_temp[,"species_temp"]))
    datset_temp$duplicate <- multispp

    for (i in seq_along(datset)) {
      datset[[i]] <- datset[[i]][duplicated(datset[[i]][,c("species_temp","missites")],
                                            fromLast = FALSE) == FALSE,]
      datset[[i]][["duplicate"]] <- multispp
      datset_orig[[i]] <- datset_orig[[i]][duplicated(datset_orig[[i]][,c("species_temp","missites")],
                                            fromLast = FALSE) == FALSE,]
      datset_orig[[i]][["duplicate"]] <- multispp
    }

    # Removing the smaller seqs in duplicated species, from both the temp dataset and
    # each of the individual gene dataset
    for (i in unique(datset_temp$species_temp)) {
      tokeep <- vector()
      tokeep <- datset_temp$species_temp == i & datset_temp$missites == min(datset_temp$missites[datset_temp$species_temp == i])
      datset_temp$duplicate[tokeep] <- FALSE
    }
    datset_temp <- datset_temp %>% filter(duplicate == FALSE) %>% select(c("species", "sequence"))


    cf <- lapply(datset, function(x) grepl("_cf_", x[[1]]))
    aff <- lapply(datset, function(x) grepl("_aff_", x[[1]]))
    spp_temp <- lapply(datset, function(x) gsub("_aff_|_cf_", " ", x[[1]]))
    infraspp <- lapply(spp_temp, function(x) grepl("[[:upper:]][[:lower:]]+_[[:lower:]]+_[[:lower:]]+", x))

    if (any(unlist(cf))|any(unlist(aff))|any(unlist(infraspp))) {
      for (j in seq_along(datset)) {

        datset[[j]] <- tibble::add_column(datset[[j]], cf_aff_infraspp = NA, .after = "species")
        datset[[j]][["cf_aff_infraspp"]][cf[[j]]] <- TRUE
        datset[[j]][["cf_aff_infraspp"]][aff[[j]]] <- TRUE
        datset[[j]][["cf_aff_infraspp"]][infraspp[[j]]] <- TRUE
        datset[[j]][["cf_aff_infraspp"]][is.na(datset[[j]][["cf_aff_infraspp"]])] <- FALSE
        datset_orig[[j]] <- tibble::add_column(datset_orig[[j]], cf_aff_infraspp = unlist(datset[[j]][["cf_aff_infraspp"]]), .after = "species")
        tokeep <- vector()
        for (i in unique(datset[[j]][["species_temp"]])) {

          tokeep <- datset[[j]][["species_temp"]] == i & datset[[j]][["missites"]] == min(datset[[j]][["missites"]][datset[[j]][["species_temp"]] == i])
          datset[[j]][["duplicate"]][tokeep] <- FALSE
        }
        datset[[j]][["species"]][datset[[j]][["cf_aff_infraspp"]] == T] <- gsub("(_[^_]+_[^_]+)_.*", "\\1", datset[[j]][["species"]][datset[[j]][["cf_aff_infraspp"]] == T])
        datset[[j]][["species"]][datset[[j]][["cf_aff_infraspp"]] == F] <- gsub("(_[^_]+)_.*", "\\1", datset[[j]][["species"]][datset[[j]][["cf_aff_infraspp"]] == F])
        datset_orig[[j]][["species"]][datset_orig[[j]][["cf_aff_infraspp"]] == T] <- gsub("(_[^_]+_[^_]+)_.*", "\\1", datset_orig[[j]][["species"]][datset_orig[[j]][["cf_aff_infraspp"]] == T])
        datset_orig[[j]][["species"]][datset_orig[[j]][["cf_aff_infraspp"]] == F] <- gsub("(_[^_]+)_.*", "\\1", datset_orig[[j]][["species"]][datset_orig[[j]][["cf_aff_infraspp"]] == F])

        datset[[j]] <- datset[[j]] %>% filter(duplicate == FALSE) %>% select(c("species", "full_sp_names", "sequence"))
        datset_orig[[j]] <- datset_orig[[j]] %>% select(c("species", "sequence"))
      }

    } else {

      for (j in seq_along(datset)) {
        tokeep <- vector()
        for (i in unique(datset[[j]][["species_temp"]])) {
          tokeep <- datset[[j]][["species_temp"]] == i & datset[[j]][["missites"]] == min(datset[[j]][["missites"]][datset[[j]][["species_temp"]] == i])
          datset[[j]][["duplicate"]][tokeep] <- FALSE
        }
        datset[[j]] <- datset[[j]] %>% filter(duplicate == FALSE) %>% select(c("species", "full_sp_names", "sequence"))


        datset[[j]][["species"]] <- gsub("(_[^_]+)_.*", "\\1", datset[[j]][["species"]])

        datset_orig[[j]][["species"]] <- gsub("(_[^_]+)_.*", "\\1", datset_orig[[j]][["species"]])
      }
    }


    ############################################################################
    # Replacing missing seqs in any gene
    missdata <- list()
    for (j in seq_along(datset)){
      for (i in seq_along(datset[[j]][["sequence"]])){
        missdata[[i]] <- length(stringr::str_extract_all(datset[[j]][["sequence"]][i], "[?]", simplify = FALSE)[[1]])
      }
      datset[[j]] <- tibble::add_column(datset[[j]], missites = unlist(missdata), .after = "species")
    }

    missdata <- list()
    for (j in seq_along(datset_orig)) {
      for (i in seq_along(datset_orig[[j]][["sequence"]])) {
        missdata[[i]] <- length(stringr::str_extract_all(datset_orig[[j]][["sequence"]][i], "[?]", simplify = FALSE)[[1]])
      }
      datset_orig[[j]] <- tibble::add_column(datset_orig[[j]], missites = unlist(missdata), .after = "species")
    }

    # Droping smaller duplicated seqs and empty seqs
    for (i in seq_along(datset_orig)) {
      multispp <- list()
      multispp[[i]] <- c(duplicated(datset_orig[[i]][,"species"], fromLast = TRUE) |
                           duplicated(datset_orig[[i]][,"species"]))
      datset_orig[[i]] <- tibble::add_column(datset_orig[[i]], duplicate = unlist(multispp), .after = "species")
      # Inserting a temp column with species name without voucher
      datset_orig[[i]] <- tibble::add_column(datset_orig[[i]], species_temp = datset_orig[[i]][,"species"], .after = "species")
    }

    # Arranging by species name each dataframe inside the list
    datset_orig <- lapply(datset_orig, function(x) x[order(x$species),])


    ## Deleting duplicated accessions
    for (i in seq_along(datset_orig)) {
      multispp <- vector()
      datset_orig[[i]] <- datset_orig[[i]][duplicated(datset_orig[[i]][,c("species_temp","missites")],
                                                      fromLast = FALSE)==F,]
      multispp <- c(duplicated(datset_orig[[i]][,"species_temp"], fromLast = TRUE) |
                      duplicated(datset_orig[[i]][,"species_temp"]))
      datset_orig[[i]][["duplicate"]] <- multispp
    }

    for (j in seq_along(datset_orig)) {
      tokeep <- vector()
      for (i in unique(datset_orig[[j]][["species_temp"]])) {
        tokeep <- datset_orig[[j]][["species_temp"]] == i & datset_orig[[j]][["missites"]] == min(datset_orig[[j]][["missites"]][datset_orig[[j]][["species_temp"]] == i])
        datset_orig[[j]][["duplicate"]][tokeep] <- FALSE
      }
      datset_orig[[j]] <- datset_orig[[j]] %>% filter(duplicate == FALSE)
    }
    for (j in seq_along(datset_orig)) {
      todel <- vector()
      for (i in datset_orig[[j]][["species_temp"]]) {
        todel <- datset_orig[[j]][["species_temp"]] == i & datset_orig[[j]][["missites"]] == max(datset_orig[[j]][["missites"]])
        datset_orig[[j]][["duplicate"]][todel] <- TRUE
      }

      datset_orig[[j]] <- datset_orig[[j]] %>% filter(duplicate == FALSE)
    }

    # Inserting the missing seqs
    misseqs <- list()
    seqs <- list()
    misspp <- list()
    for (i in seq_along(datset)) {
      misseqs[[i]] <- datset[[i]][["species"]][datset[[i]][["missites"]] == max(datset[[i]][["missites"]])]
      seqs[[i]]  <- datset[[i]][["sequence"]]
      misspp[[i]] <- datset_orig[[i]][["species"]][datset_orig[[i]][["species"]] %in% misseqs[[i]]]

      seqs[[i]][datset[[i]][["species"]] %in% misspp[[i]]] <-
        datset_orig[[i]][["sequence"]][datset_orig[[i]][["species"]] %in% misseqs[[i]]]

      datset[[i]][["sequence"]] <- seqs[[i]]

    }

    for (j in seq_along(datset)) {
      for (i in seq_along(datset[[j]][["species"]])) {
        if (datset[[j]][["missites"]][i] == nchar(datset[[j]][["sequence"]][1])) {
          datset[[j]][["species"]][i] <- datset[[j]][["species"]][i]
        } else {
          datset[[j]][["species"]][i] <- datset[[j]][["full_sp_names"]][i]
        }
      }
      datset[[j]] <- datset[[j]] %>% select(c("species", "sequence"))

      if (shortaxlabel) {
        g <- grepl("[[:lower:]]+_[[:lower:]]+_[[:lower:]]+|[[:upper:]]+_[[:lower:]]+_[[:lower:]]+", datset[[j]][["species"]])
        datset[[j]][["species"]][g] <- gsub("(_[^_]+_[^_]+)_.*", "\\1", datset[[j]][["species"]][g])
        datset[[j]][["species"]][!g] <- gsub("(_[^_]+)_.*", "\\1", datset[[j]][["species"]][!g])
      }

    }


  } else {

    # Droping duplicated sequences for one or a list of DNA alignments with difering
    # number of sequences. This code below will work despite having run the function catmultGenes

    numberdatset <- length(datset)

    if (numberdatset == 0) {
      stop("You must provide at least ONE gene dataset in the following format:
         datset=gene1, gene2, gene3... or a list of genes in a single vector
         Find help also at DBOSLab-UFBA
         (Domingos Cardoso; cardosobot@gmail.com)")
    }

    if (numberdatset == 1 & !any(lapply(datset[[1]], class) == "list")) {
      temp_name <- names(datset)
      datset <- datset[[1]]
      for (i in 1:length(datset)) {
        datset[[i]] <- paste(datset[[i]], collapse = "")
        datset[[i]] <- toupper(datset[[i]])
      }
      spp <- stats::setNames(data.frame(names(datset)), "species")
      seqs <- stats::setNames(data.frame(unlist(datset, use.names = FALSE)), "sequence")
      datset <- cbind(spp, seqs)
      datset <- list(datset)
      names(datset) <- temp_name

    } else {
      # Tranforming the original matrix as read by ape into list of dataframes
      datset <- datset[[1]]

      cf <- lapply(datset, function(x) grepl("_cf_", names(x)))
      aff <- lapply(datset, function(x) grepl("_aff_", names(x)))
      spp_temp <- lapply(datset, function(x) gsub("_aff_|_cf_", " ", names(x)))
      infraspp <- lapply(spp_temp, function(x) grepl("[[:upper:]][[:lower:]]+_[[:lower:]]+_[[:lower:]]+",
                                                     x))

      if (any(unlist(cf))|any(unlist(aff))|any(unlist(infraspp))) {

        nr <- .namesTorename(datset,
                             cf = cf,
                             aff = aff,
                             infraspp = infraspp)

        # Adjusting species labels when they have cf or aff
        # Adjusting species names with infraspecific taxa
        datset <- .adjustnames(datset,
                               cf = cf,
                               aff = aff,
                               infraspp = infraspp)
      }

      temp_name <- names(datset)
      for (j in seq_along(datset)) {

        for (i in 1:length(datset[[j]])) {
          datset[[j]][[i]] <- paste(datset[[j]][[i]], collapse = "")
          datset[[j]][[i]] <- toupper(datset[[j]][[i]])
        }
        spp <- stats::setNames(data.frame(names(datset[[j]])), "species")
        seqs <- stats::setNames(data.frame(unlist(datset[[j]], use.names = FALSE)), "sequence")
        datset[[j]] <- cbind(spp, seqs)
        datset[[j]] <- list(datset[[j]])

      }
      # Extracting the dataframes inside a list of list
      datset_temp <- list()
      for (i in seq_along(datset)) {
        datset_temp[[i]] <- datset[[i]][[1]]
      }
      datset <- datset_temp
      names(datset) <- temp_name
    }

    # Get number of missing data "N" and "?" in each sequence
    missdata <- list()
    missdataN <- list()
    #misstotal <- list()
    for (j in seq_along(datset)) {
      misstotal_temp <- list()
      for (i in seq_along(datset[[j]][["sequence"]])) {
        missdata[i] <- length(stringr::str_extract_all(datset[[j]][["sequence"]][i], "[?]", simplify = FALSE)[[1]])
        missdataN[i] <- length(stringr::str_extract_all(datset[[j]][["sequence"]][i], "N", simplify = FALSE)[[1]])
        misstotal_temp[i] <- missdata[[i]] + missdataN[[i]]
      }
      # Adding the total number of missing char in each sequence into a newly created column
      # misstotal[[j]] <- unlist(misstotal_temp)
      datset[[j]] <- tibble::add_column(datset[[j]], missites = unlist(misstotal_temp), .after = "species")
    }

    # Applying the function shortaxlabels to each dataframe inside the list
    datset_temp <- lapply(datset, .shortaxlabels)

    for (i in seq_along(datset)) {
      multispp <- list()
      multispp[[i]] <- c(duplicated(datset_temp[[i]][,"species"], fromLast = TRUE) |
                           duplicated(datset_temp[[i]][,"species"]))
      datset[[i]] <- tibble::add_column(datset[[i]], duplicate = unlist(multispp), .after = "species")
      # Inserting a temp column with species name without voucher
      datset[[i]] <- tibble::add_column(datset[[i]], species_temp = datset_temp[[i]][,"species"], .after = "species")
    }

    # Arranging by species name each dataframe inside the list
    datset <- lapply(datset, function(x) x[order(x$species),])
    #datset <- lapply(datset, function(x) dplyr::arrange(x, x[,1]))


    ## Deleting duplicate accessions when there is only one gene dataset
    # Get first how many sequences are there in each dataset
    nseqs <- as.vector(sapply(datset, nrow))
    if (length(datset) == 1 | length(datset) > 1 & !equalnumb(nseqs)) {

      cat("For each species duplicated with multiple accessions...", sep = "\n")
      cat("Smaller sequences (with more missing data) were dropped.", sep = "\n")

      if (length(datset) > 1 & !equalnumb(nseqs)) {
        cat("NOTE: You have entered more than one DNA alignment, each with differing number of sequences or taxa!", sep = "\n")
        cat("Consider running the concatenating function catmultGenes first, before using dropSeq", sep = "\n")
      }

      for (i in seq_along(datset)) {
        multispp <- vector()
        datset[[i]] <- datset[[i]][duplicated(datset[[i]][,c("species_temp","missites")],
                                              fromLast = FALSE)==F,]
        multispp <- c(duplicated(datset[[i]][,"species_temp"], fromLast = TRUE) |
                        duplicated(datset[[i]][,"species_temp"]))
        datset[[i]][["duplicate"]] <- multispp
      }

      for (j in seq_along(datset)) {
        tokeep <- vector()
        for (i in unique(datset[[j]][["species_temp"]])) {
          tokeep <- datset[[j]][["species_temp"]] == i & datset[[j]][["missites"]] == min(datset[[j]][["missites"]][datset[[j]][["species_temp"]] == i])
          datset[[j]][["duplicate"]][tokeep] <- FALSE
        }

        datset[[j]] <- datset[[j]] %>% filter(duplicate == FALSE) %>% select(c("species", "sequence"))
      }
    }

    if (any(unlist(cf))|any(unlist(aff))|any(unlist(infraspp))) {
      # Putting back the names under cf. and aff.
      # Adjusting names with infraspecific taxa
      datset <- .namesback(datset,
                           cf = cf,
                           aff = aff,
                           infraspp = infraspp,
                           rename_cf = nr[["rename_cf"]],
                           rename_aff = nr[["rename_aff"]],
                           rename_infraspp = nr[["rename_infraspp"]],
                           shortaxlabel = TRUE,
                           multispp = TRUE)

    }
  }

  return(datset)
}
domingoscardoso/catGenes documentation built on March 14, 2024, 9:21 p.m.