R/writePhylip.R

Defines functions writePhylip

Documented in writePhylip

#' Writes a PHYLIP-formatted file of fully concatenated gene datasets and an
#' associated partition file for RAxML concatenated phylogenetic analysis
#'
#' @author Domingos Cardoso, Quezia Cavalcante, and Bruno Vilela
#'
#' @description From the resulting list of equally-sized dataframes containing the
#' input individual DNA alignments, as generated by the functions \code{\link{catfullGenes}}
#' or \code{\link{catmultGenes}}, it will write (i) a final PHYLIP-formatted file
#' of fully concatenated dataset and (ii) an associated partition file for
#' the RAxML concatenated phylogenetic analysis using a mixed/partitioned model.
#'
#' @usage
#' writePhylip(x,
#'            file,
#'            genomics = FALSE,
#'            catalignments = TRUE,
#'            partitionfile = TRUE)
#'
#' @param x The object to be written, a list of equally-sized dataframes containing
#' the input gene datasets, as generated by the functions \\code{\link{catfullGenes}}
#' or \code{\link{catmultGenes}}.
#'
#' @param file Either a character string naming a file or a \code{\link{connection}}
#'  open for writing.
#'
#' @param genomics Logical, if \code{TRUE}, it will be helpful in phylogenomics
#' where all provided species identifiers (e.g. collector number and GenBank
#' acession numbers) will always be kept in the resultant concatenated dataset.
#'
#' @param catalignments Logical, if \code{FALSE} will not write the concatednated
#' matrix of DNA alinments. Better to keep it always as \code{TRUE}.
#'
#' @param partitionfile Logical, if \code{FALSE} will not write the associated
#' text-formatted partition file for the RAxML concatenated phylogenetic analysis
#' using a mixed/partitioned model.
#'
#' @param endgaps.to.miss Logical, if \code{FALSE} the function will not replace
#' terminal GAPs into missing character (?).
#'
#' @seealso \code{\link{catfullGenes}}
#' @seealso \code{\link{catmultGenes}}
#'
#' @examples \dontrun{
#' data(Gaya)
#' df <- catfullGenes(Gaya,
#'                    shortaxlabel = TRUE,
#'                    missdata = FALSE,
#'                    outgroup = "Abutilon_costicalyx")
#'
#' writePhylip(df,
#'             file = "filename.phy",
#'             genomics = FALSE,
#'             catalignments = TRUE,
#'             partitionfile = TRUE)
#' }
#'
#' @importFrom dplyr select
#' @importFrom magrittr "%>%"
#' @importFrom R.utils insert
#' @importFrom stringr str_extract_all
#' @importFrom tibble add_column
#' @importFrom tidyr unite
#' @importFrom utils write.table
#'
#' @export

writePhylip <- function(x, file,
                        genomics = FALSE,
                        catalignments = TRUE,
                        partitionfile = TRUE,
                        endgaps.to.miss = TRUE) {

  datset <- .namedlist(x)

  numberinputdatset <- length(datset)

  # Unlist and rename an input list from genefullcomp function
  if (numberinputdatset == 1) {
    datset <- unlist(datset, recursive = FALSE)
    names(datset) <- gsub(".*[.]", "", names(datset))

    for (i in 1:length(datset)) {
      datset[[i]]$species <- as.character(datset[[i]]$species)
      datset[[i]]$sequence <- as.character(datset[[i]]$sequence)}
  }

  numberdatset <- length(datset)

  if (numberdatset == 1) {
    stop("You must provide a file generated by the genefullcomp
          Find help also at DBOSLab-UFBA (Domingos Cardoso; cardosobot@gmail.com)")
  }

  genenames <- names(datset)

  cat("You are combining the following gene datasets:", "", sep = "\n")
  cat(cat(genenames, sep = ", "), "", sep = "\n")
  cat(cat("Total number of datasets:", numberdatset), "", sep = "\n")

  numbchar <- lapply(datset, function(x) nchar(x[1, 2]))
  series_numbchar <- paste(lapply(numbchar, function(x) unlist(x)))

  # Making all separate dataset within the list the same size to transforme into a dataframe with NAs whenever there is no sequence
  #datasets <- lapply(datasets, `length<-`, max(lengths(datasets)))

  ntax <- paste0(length(rownames(datset[[1]])))
  nchartotal <- sum(unlist(numbchar))

  dimensions <- paste(ntax, nchartotal, sep = " ")

  cat("Your phylip-formatted concantenated dataset will have the following DIMENSIONS:", "", sep = "\n")
  cat(paste(ntax, "TAXA"), sep = "\n")
  cat(paste(nchartotal, "CHARACTERS"), sep = "\n")

  # Finding identifiers that differ among the sequences and across genes
  datset_temp <- datset
  cf <- lapply(datset, function(x) grepl("_cf_", x[[1]]))
  aff <- lapply(datset, function(x) grepl("_aff_", x[[1]]))
  spp_temp <- lapply(datset, function(x) gsub("_aff_|_cf_", " ", x[[1]]))
  infraspp <- lapply(spp_temp, function(x) grepl("[[:upper:]][[:lower:]]+_[[:lower:]]+_[[:lower:]]+", x))
  if (any(unlist(cf))|any(unlist(aff))|any(unlist(infraspp))) {

    nr <- .namesTorename(datset,
                         cf = cf,
                         aff = aff,
                         infraspp = infraspp)

    # Adjusting species labels when they have cf. or aff.
    # Adjusting species names with infraspecific taxa
    datset_temp <- .adjustnames(datset_temp,
                                cf = cf,
                                aff = aff,
                                infraspp = infraspp)
  }

  spp_labels <- lapply(datset_temp, function(x) x[[1]])

  # Check if there is any species with identifiers, then delete when
  # they differ among sequences in each gene
  spp_get <- list()
  for (i in seq_along(spp_labels)) {
    spp_get[[i]] <- grepl("_[[:lower:]]+_|_[[:lower:]]+[[:upper:]]_|_[[:lower:]]+[[:alnum:]]_", spp_labels[[i]])
  }
  if (any(unlist(spp_get))) {

    if (genomics) {
      ntrue <- lapply(spp_get, function(x) sum(x, na.rm = TRUE))
      if (any(length(spp_get[[1]]) == ntrue)) {
        n <- which(length(spp_get[[1]]) == ntrue)[1]
        all_spp_labels <- spp_labels[[n]]
      } else {
        temp <- list()
        for (i in seq_along(spp_get)){
          temp[[i]] <- spp_get[[i]][!spp_get[[1]]]
        }
        ntrue <- lapply(temp, function(x) sum(x, na.rm = TRUE))
        n <- which(sum(!spp_get[[1]], na.rm = TRUE) == ntrue)[1]
        all_spp_labels <- spp_labels[[n]]

      }

      for (i in seq_along(datset)) {
        datset[[i]][["species"]] <- all_spp_labels
      }

    } else {

    accessions <- list()
    difaccessions <- list()
    #spp_get <- list()
    for (i in seq_along(spp_labels)) {
      # Remove all before second underscore
      #spp_get[[i]] <- grepl("_[[:lower:]]+_", spp_labels[[i]])
      accessions[[i]] <- gsub("^([^_]+_){2}(.+)$", "\\2",
                              spp_labels[[i]])
    }

    for (i in 1:(length(spp_labels)-1)) {
      for (j in seq_along(spp_labels[[i]])) {
        if (spp_labels[[1]][[j]] != spp_labels[[i+1]][[j]]) {
          if (spp_get[[1]][[j]] == TRUE) {
            spp_labels[[1]][[j]] <- gsub(paste0("_", accessions[[1]][[j]]),
                                         paste0("_", gsub("_.*", "", accessions[[1]][[j]])),
                                         spp_labels[[1]][[j]])
          }
        }
      }
    }

    for (i in 2:length(spp_labels)) {
      for (j in seq_along(spp_labels[[i]])) {
        if (spp_labels[[i]][[j]] != spp_labels[[1]][[j]]) {
          if (spp_get[[i]][[j]] == TRUE) {
            spp_labels[[i]][[j]] <- gsub(paste0("_", accessions[[i]][[j]]),
                                         paste0("_", gsub("_.*", "", accessions[[i]][[j]])),
                                         spp_labels[[i]][[j]])
          }
        }
      }
    }


    for (i in seq_along(datset)) {
      datset[[i]][["species"]] <- spp_labels[[i]]
    }

    # Get number of missing data "?" in each sequence and then erase identifiers
    # in non-duplicated species from all gene datasets
    missdata <- vector()
    for (j in seq_along(datset)) {
      for (i in seq_along(datset[[j]][["species"]])) {
        missdata[i] <- length(stringr::str_extract_all(datset[[j]][["sequence"]][i], "[?]", simplify = FALSE)[[1]])
      }
      datset[[j]] <- tibble::add_column(datset[[j]], missites = missdata, .after = "species")
    }


    datset_temp <- lapply(datset, .shortaxlabels)
    dupp <- c(duplicated(datset_temp[[1]][,"species"], fromLast = TRUE) |
                duplicated(datset_temp[[1]][,"species"]))

    if(any(dupp)){
    n <- vector()
    for (j in seq_along(datset)) {
      for (i in seq_along(datset[[j]][["species"]][!dupp])) {

        #lapply(datset, function(x) x[["species"]][!dupp][i])
        if (any(lapply(datset, function(x) x[["missites"]][!dupp][i]) %in%
                lapply(datset, function(x) nchar(x[["sequence"]][1]))) &
            !all(lapply(datset, function(x) x[["missites"]][!dupp][i]) %in%
                lapply(datset, function(x) nchar(x[["sequence"]][1])))) {

          n[i] <- as.vector(unlist(lapply(datset, function(x) gsub("(_[^_]+)_.*", "\\1", x[["species"]][!dupp][i]))[1]))
         # I need to now how to subset a list of dataframes using a list of TRUE/FALSE
        }
      }
      datset[[j]] <- datset[[j]] %>% select(c("species", "sequence"))
    }

    # Changing terminal names in just the first dataset, since we will not use names from the other datasets
    g <- grepl(paste0(n[!is.na(n)], collapse = "|"), datset[[1]][["species"]][!dupp])

    datset[[1]][["species"]][!dupp][g] <- gsub("(_[^_]+)_.*", "\\1", datset[[1]][["species"]][!dupp][g])

    } else {

      n <- gsub("(_[^_]+)_.*", "\\1", datset[[1]][["species"]])
      for (j in 2:length(datset)) {
        g <- datset[[j]][["species"]] %in% datset[[1]][["species"]]
        datset[[1]][["species"]][!g] <- n[!g]
        datset[[1]] <- datset[[1]] %>% select(c("species", "sequence"))
        datset[[j]] <- datset[[j]] %>% select(c("species", "sequence"))
      }
    }
}
    # Putting back the names under cf. and aff.
    # Adjusting names with infraspecific taxa
    datset <- .namesback(datset,
                         cf = cf,
                         aff = aff,
                         infraspp = infraspp,
                         rename_cf = nr[["rename_cf"]],
                         rename_aff = nr[["rename_aff"]],
                         rename_infraspp = nr[["rename_infraspp"]],
                         shortaxlabel = FALSE,
                         multispp = FALSE)
  }

  # Calculating the space between the taxon labels and corresponding DNA sequence for each dataset
  # First calculate how many letters in each taxon in each dataset
  letrs_taxlabs <- lapply(datset, function(x) nchar(x$species))

  # Setting the maximum of space based on the largest taxonlabel
  maxletrs_taxlabs <- lapply(letrs_taxlabs, function(x) max(x)+5) # Just increase this last number if we want to add more space
  maxletrs_taxlabs <- max(unlist(maxletrs_taxlabs))

  # We can also pad a string inside a list of dataframes by adding numbers or names at any position in the specific column
  f_b <- function(x, y) {
    for (i in 1:length(x$species)) {
      x$species[i] <- paste0(x$species[i], paste0(rep(" ", y - nchar(x$species[i])),
                                                  collapse = ""))
    }
    x
  }
  datset <- lapply(datset, f_b,
                   y = maxletrs_taxlabs)

  # Replace terminal GAPs into missing character (?)
  if (endgaps.to.miss) {
    datset <- lapply(datset, .replace_terminal_gaps)
  }

  # Uniting the two columns inside just the first dataframe
  datset[[1]] <- tidyr::unite(datset[[1]], "sequences", colnames(datset[[1]]), sep = " ")

  # Deleting the species collumn in the remaining dataset
  for (i in 2:numberdatset){
    datset[[i]] <- data.frame(sequences=datset[[i]][,2])
  }

  ex_datset <- do.call("cbind", datset)
  names(ex_datset) <- paste("sequences", 1:length(ex_datset), sep = "")

  # Uniting all columns
  genes <- tidyr::unite(ex_datset, "sequences", sep = "")

  genes$sequences <- as.character(genes$sequences)

  cat("Concatenating the genes...", "", sep = "\n")

  #n <- dim(genes)[1]
  #genes <- genes[1:(n-3),] # To remove few paragraphs after the matrix block and before the "; END;"
  colnames(genes) <- paste(dimensions,
                           sep = "\n")

  if (catalignments) {
    # Writing the concatenated dataset
    zz <- file(file, "w")
    write.table(genes, zz,
                append = FALSE, quote = FALSE, sep = " ",
                eol = "\n", na = "", dec = ".", row.names = FALSE,
                col.names = TRUE)
    close(zz)
  }


  # Creating the txt file for a mixed/partitioned model in RAxML
  partitionnames <- paste0("gene", 1:length(genenames))
  charsets <- paste(rep("DNA,", times = numberdatset), partitionnames, "= ")

  # Calculating the end of each gene with a cumulative sum of each gene size
  endgene <- paste(cumsum(series_numbchar))

  # Calculating the begining of each gene
  startgene <- lapply(as.numeric(endgene), function(x) x+1)

  # Dropping last value in the list
  startgene <- startgene[1:(length(startgene)-1)]

  # Insert the number "1" in the first position of the list
  startgene <- paste(R.utils::insert(startgene, ats = 1, values = 1))

  partitions <- paste0(charsets,
                       startgene,
                       rep("-", times = numberdatset),
                       endgene,
                       sep = "\n")
  partitions <- paste(partitions, collapse = "")
  partitions <- data.frame(partitions)


  if (partitionfile) {
    # Writing the partition file for RAxML concatenated analysis
    if (catalignments) {
    zy <- file(paste0(sub("[.].*", "", file), "_partition_file.txt"), "w")
    } else {
    zy <- file(file, "w")
    }
    write.table(partitions, zy,
                append = FALSE, quote = FALSE, sep = " ",
                eol = "\n", na = "", dec = ".", row.names = FALSE,
                col.names = FALSE)
    close(zy)

  }

  cat("Gene concatenation is finished!", "",
      sep = "\n")

  if (!catalignments & partitionfile) {
    cat("But you downloaded just the partition file...",
        sep = "\n")
    cat("Run the function again, making sure to have \"catalignments = TRUE\".",
        sep = "\n")
  }
}
domingoscardoso/catGenes documentation built on March 14, 2024, 9:21 p.m.