R/writeNexus.R

Defines functions writeNexus

Documented in writeNexus

#' Writes a concatenated (non-interleaved) or combined (interleaved) NEXUS-formatted
#' dataset for multigene phylogenetic analysis
#'
#' @author Domingos Cardoso, Quezia Cavalcante, and Bruno Vilela
#'
#' @description Writes a final concatenated (non-interleaved) or combined (interleaved)
#' NEXUS-formatted dataset for multigene phylogenetic analysis, from the resulting list
#' of equally-sized dataframes containing the input individual DNA alignments, as generated by
#' the functions \code{\link{catfullGenes}} or \code{\link{catmultGenes}}. Depending
#' on the chosen arguments the final NEXUS-formatted file will or will not include
#' a preliminary command block (including the charset of each partition).
#'
#' @usage
#' writeNexus(x,
#'            file,
#'            genomics = FALSE,
#'            interleave = TRUE,
#'            bayesblock = TRUE,
#'            endgaps.to.miss = TRUE)
#'
#' @param x The object to be written, a list of equally-sized dataframes containing
#' the input gene datasets, as generated by \\code{\link{catfullGenes}} or
#' \code{\link{catmultGenes}}.
#'
#' @param file Either a character string naming a file or a \code{\link{connection}}
#'  open for writing.
#'
#' @param genomics Logical, if \code{TRUE}, it will be helpful in phylogenomics
#' where all provided species identifiers (e.g. collector number and GenBank
#' acession numbers) will be kept in the resultant concatenated dataset. This means
#' the concatenated dataset will never be written with species including brackted
#' comments for identifiers that differ because at least one species is missing
#' data for any gene.
#'
#' @param interleave Logical, if \code{FALSE} will fully concatenate the DNA alignments
#' without individually distinguishing each gene dataset as separate named partitions
#' within the complete matrix.
#'
#' @param bayesblock Logical, if \code{FALSE} will not create a preliminary Mr.Bayes
#' command block including the charsets showing the number of characters of each
#' individual gene partition.
#'
#' @param endgaps.to.miss Logical, if \code{FALSE} the function will not replace
#' terminal GAPs into missing character (?).
#'
#' @seealso \code{\link{catfullGenes}}
#' @seealso \code{\link{catmultGenes}}
#'
#' @examples \dontrun{
#' data(Gaya)
#' df <- catfullGenes(Gaya,
#'                    shortaxlabel = TRUE,
#'                    missdata = FALSE,
#'                    outgroup = "Abutilon_costicalyx")
#'
#' writeNexus(df,
#'            file = "filename.nex",
#'            genomics = FALSE,
#'            interleave = TRUE,
#'            bayesblock = TRUE)
#' }
#'
#' @importFrom dplyr bind_rows
#' @importFrom R.utils insert
#' @importFrom tidyr unite
#' @importFrom utils write.table
#'
#' @export

writeNexus <- function(x, file,
                       genomics = FALSE,
                       interleave = TRUE,
                       bayesblock = TRUE,
                       endgaps.to.miss = TRUE) {

  datset <- .namedlist(x)

  numberinputdatset <- length(datset)

  # Unlist and rename an input list from genefullcomp function
  if (numberinputdatset == 1) {
    datset <- unlist(datset, recursive = FALSE)
    names(datset) <- gsub(".*[.]", "", names(datset))

    for (i in 1:length(datset)) {
      datset[[i]]$species <- as.character(datset[[i]]$species)
      datset[[i]]$sequence <- as.character(datset[[i]]$sequence)}
  }

  numberdatset <- length(datset)

  if (numberdatset == 1) {
    stop("You must provide a file generated by the genefullcomp
          Find help also at DBOSLab-UFBA (Domingos Cardoso; cardosobot@gmail.com)")
  }

  genenames <- names(datset)

  cat("You are combining the following gene datasets:", "", sep = "\n")
  cat(cat(genenames, sep = ", "), "", sep = "\n")
  cat(cat("Total number of datasets:", numberdatset), "", sep = "\n")

  numbchar <- lapply(datset, function(x) nchar(x[1, 2]))

  # Making all separate dataset within the list the same size to transforme into a dataframe with NAs whenever there is no sequence
  #datasets <- lapply(datasets, `length<-`, max(lengths(datasets)))

  writtenby <- paste("[catGenes (DBOSLab-UFBA), ", date(), "]\n\n", sep = "")
  nexus <- paste("#NEXUS", "", sep = "\n")
  begindata <- paste("BEGIN DATA;")
  dimname <- paste("DIMENSIONS")
  ntax <- paste0("NTAX=", length(rownames(datset[[1]])))

  total_numbchar <- sum(unlist(numbchar))
  series_numbchar <- paste(lapply(numbchar, function(x) unlist(x)))
  list_nchars1 <- paste0(genenames,
                         rep("=", times = numberdatset),
                         series_numbchar)
  list_nchars2 <- paste(list_nchars1, collapse = " + ")

  nchartotal <- paste0("NCHAR=",
                       total_numbchar, "; [",
                       list_nchars2,
                       "]")
  dimensions <- paste(dimname, ntax, nchartotal, sep = " ")
  formatinter <- paste("FORMAT DATATYPE=DNA GAP=- MISSING=? INTERLEAVE=YES;")
  format <- paste("FORMAT DATATYPE=DNA GAP=- MISSING=? INTERLEAVE=NO;")

  cat("Your final dataset will have these DIMENSIONS:", "", sep = "\n")
  cat(nchartotal, "", sep = "\n")
  cat(ntax, "", sep = "\n")

  matrix <- paste("MATRIX", "", sep = "\n")

  # Finding identifiers that differ among the sequences and across genes
  datset_temp <- datset
  cf <- lapply(datset, function(x) grepl("_cf_", x[[1]]))
  aff <- lapply(datset, function(x) grepl("_aff_", x[[1]]))
  spp_temp <- lapply(datset, function(x) gsub("_aff_|_cf_", " ", x[[1]]))
  infraspp <- lapply(spp_temp, function(x) grepl("[[:upper:]][[:lower:]]+_[[:lower:]]+_[[:lower:]]+", x))

  if (any(unlist(cf))|any(unlist(aff))|any(unlist(infraspp))) {

    nr <- .namesTorename(datset,
                         cf = cf,
                         aff = aff,
                         infraspp = infraspp)

    # Adjusting species labels when they have cf. or aff.
    # Adjusting species names with infraspecific taxa
    datset_temp <- .adjustnames(datset_temp,
                                cf = cf,
                                aff = aff,
                                infraspp = infraspp)
  }

  spp_labels <- lapply(datset_temp, function(x) x[[1]])

  # Check if there is any species with identifiers, then put in brackets when
  # they differ among sequences in each gene
  spp_get <- list()
  for (i in seq_along(spp_labels)) {
    spp_get[[i]] <- grepl("_[[:lower:]]+_|_[[:lower:]]+[[:upper:]]_|_[[:lower:]]+[[:alnum:]]_", spp_labels[[i]])
  }
  if (any(unlist(spp_get))) {

    if (genomics) {
      ntrue <- lapply(spp_get, function(x) sum(x, na.rm = TRUE))
      if (any(length(spp_get[[1]]) == ntrue)) {
        n <- which(length(spp_get[[1]]) == ntrue)[1]
        all_spp_labels <- spp_labels[[n]]
      } else {
        temp <- list()
        for (i in seq_along(spp_get)){
          temp[[i]] <- spp_get[[i]][!spp_get[[1]]]
        }
        ntrue <- lapply(temp, function(x) sum(x, na.rm = TRUE))
        n <- which(sum(!spp_get[[1]], na.rm = TRUE) == ntrue)[1]
        all_spp_labels <- spp_labels[[n]]

      }

      for (i in seq_along(datset)) {
        datset[[i]][["species"]] <- all_spp_labels
      }

    } else {

      accessions <- list()
      difaccessions <- list()
      #spp_get <- list()
      for (i in seq_along(spp_labels)) {
        # Remove all before second underscore
        #spp_get[[i]] <- grepl("_[[:lower:]]+_", spp_labels[[i]])
        accessions[[i]] <- gsub("^([^_]+_){2}(.+)$", "\\2",
                                spp_labels[[i]])
      }

      for (i in 1:(length(spp_labels)-1)) {
        for (j in seq_along(spp_labels[[i]])) {
          if (spp_labels[[1]][[j]] != spp_labels[[i+1]][[j]]) {
            if (spp_get[[1]][[j]] == TRUE) {
              spp_labels[[1]][[j]] <- gsub(paste0("_", accessions[[1]][[j]]),
                                           paste0(" [", accessions[[1]][[j]], "]"),
                                           spp_labels[[1]][[j]])
            }
          }
        }
      }

      for (i in 2:length(spp_labels)) {
        for (j in seq_along(spp_labels[[i]])) {
          if (spp_labels[[i]][[j]] != spp_labels[[1]][[j]]) {
            if (spp_get[[i]][[j]] == TRUE) {
              spp_labels[[i]][[j]] <- gsub(paste0("_", accessions[[i]][[j]]),
                                           paste0(" [", accessions[[i]][[j]], "]"),
                                           spp_labels[[i]][[j]])
            }
          }
        }
      }


      for (i in seq_along(datset)) {
        datset[[i]][["species"]] <- spp_labels[[i]]
      }

      datset_temp <- lapply(datset_temp, .shortaxlabels)
      dupp <- list()
      for (i in seq_along(datset_temp)) {
        dupp[[i]] <- c(duplicated(datset_temp[[i]][,"species"], fromLast = TRUE) |
                         duplicated(datset_temp[[i]][,"species"]))
      }

      if (any(dupp[[1]] == TRUE)) {

        for (i in seq_along(datset_temp)) {

          f <- grepl("\\s", datset[[i]][["species"]][dupp[[i]]])
          g <- gsub(".*?\\s", "", datset[[i]][["species"]][dupp[[i]]][f])
          h <- grepl("[_]", g)

          if (any(h)) {

            ns <- gsub("\\s.*", "", datset[[i]][["species"]][dupp[[i]]][f][h])

            g[h] <- sub("_", " [", g[h]) # Remove first occurrence of underscore in a string
            g[h] <- gsub("^[[]", "_", g[h])

            datset[[i]][["species"]][dupp[[i]]][f][h] <- paste0(ns, g[h])

            datset[[i]][["species"]][dupp[[i]]][f][!h] <-
              gsub("\\s", "_", datset[[i]][["species"]][dupp[[i]]][f][!h])
            datset[[i]][["species"]][dupp[[i]]][f][!h] <-
              gsub("[[]|[]]", "", datset[[i]][["species"]][dupp[[i]]][f][!h])

          } else {

            datset[[i]][["species"]][dupp[[i]]][f] <-
              gsub("\\s", "_", datset[[i]][["species"]][dupp[[i]]][f])
            datset[[i]][["species"]][dupp[[i]]][f] <-
              gsub("[[]|[]]", "", datset[[i]][["species"]][dupp[[i]]][f])
          }
        }
      }
    }

    # Putting back the names under cf. and aff.
    # Adjusting names with infraspecific taxa
    datset <- .namesback(datset,
                         cf = cf,
                         aff = aff,
                         infraspp = infraspp,
                         rename_cf = nr[["rename_cf"]],
                         rename_aff = nr[["rename_aff"]],
                         rename_infraspp = nr[["rename_infraspp"]],
                         shortaxlabel = FALSE,
                         multispp = FALSE)
  }

  # Calculating the space between the taxon labels and corresponding DNA sequence for each dataset
  # First calculate how many letters there are in each taxon in each dataset
  letrs_taxlabs <- lapply(datset, function(x) nchar(x$species))

  # Setting the maximum of space based on the largest taxonlabel
  maxletrs_taxlabs <- lapply(letrs_taxlabs, function(x) max(x)+5) #just increase this last number if we want to add more space
  maxletrs_taxlabs <- max(unlist(maxletrs_taxlabs))

  # Pad a string inside a list of dataframes by adding numbers or names at any
  # position in the specific column.
  f_b <- function(x, y) {
    for (i in 1:length(x$species)) {
      x$species[i] <- paste0(x$species[i], paste0(rep(" ", y - nchar(x$species[i])),
                                                  collapse = ""))
    }
    x
  }
  datset <- lapply(datset, f_b,
                   y = maxletrs_taxlabs)

  # Replace terminal GAPs into missing character (?)
  if (endgaps.to.miss) {
  datset <- lapply(datset, .replace_terminal_gaps)
  }

  if (interleave) {
    cat("Combining genes as interleave...", "", sep = "\n")
    # Uniting the two columns inside a list of dataframes
    datset <- lapply(datset, function(x) tidyr::unite(x, "sequences", colnames(x), sep = " "))


    # Creating the scale of char length for each dataset
    datset <- .charScale(datset,
                         numbchar = numbchar,
                         interleave = interleave,
                         genenames = genenames)


    # Adding the gene name and two extra spaces below each gene dataset
    ex_datset <- mapply(function(x, y) rbind(y, x, "", ""),
                        x = datset, y = paste0("[", as.list(genenames), "]"),
                        SIMPLIFY = FALSE)

    # Combining the dataframes inside the list into a single dataframe
    genes <- data.frame(dplyr::bind_rows(ex_datset, .id = NULL)) # Somehow it was not working with .id="sequences" so I had to exclude the next line
    #genes <- data.frame(genes[, 2])
    colnames(genes) <- "sequences"
    genes$sequences <- as.character(genes$sequences)

    # Writing the data command block
    #n <- dim(genes)[1]
    #genes <- genes[1:(n-3),] # To remove few paragraphs after the matrix block and before the "; END;"
    end <- paste(";","END;", sep = "\n")
    genes[nrow(genes) + 1, ] <- end
    colnames(genes) <- paste(nexus,
                             writtenby,
                             begindata,
                             dimensions,
                             formatinter,
                             matrix,
                             sep = "\n")

  } else {
    cat("Concatenating genes in full, non-interleaved alignment...", "", sep = "\n")
    # Uniting the two columns inside just the first dataframe
    datset[[1]] <- tidyr::unite(datset[[1]], "sequences", colnames(datset[[1]]), sep = " ")

    # Deleting the species column in the remaining dataset
    for (i in 2:numberdatset) {
      datset[[i]] <- data.frame(sequences=datset[[i]][,2])
    }

    ex_datset <- do.call("cbind", datset)
    names(ex_datset) <- paste("sequences", 1:length(ex_datset), sep = "")

    # Uniting all columns
    genes <- tidyr::unite(ex_datset, "sequences", sep = "")
    genes$sequences <- as.character(genes$sequences)

    # Creating the scale of char length for the entire concatenated matrix
    genes <- .charScale(genes,
                        numbchar = numbchar,
                        interleave = interleave,
                        genenames = genenames)

    # Writing the data command block
    #n <- dim(genes)[1]
    #genes <- genes[1:(n-3),] #to remove few paragraphs after the matrix block and before the "; END;"
    end <- paste(";","END;", sep = "\n")
    genes[nrow(genes) + 1, ] <- end
    colnames(genes) <- paste(nexus,
                             writtenby,
                             begindata,
                             dimensions,
                             format,
                             matrix,
                             sep = "\n")

  }


  if (bayesblock) {
    cat(cat("Building a preliminary Mr.Bayes command block...", "", sep = "\n"))
    genes[nrow(genes) + 2, ] <- ""
    mrbayes <- paste("begin mrbayes;",
                     "set autoclose=yes;",
                     sep = "\n")
    genes[nrow(genes) + 2, ] <- mrbayes
    out_tax <- paste("outgroup WRITE_YOUR_OUTGROUP;",
                     sep = "\n")
    genes[nrow(genes) + 2, ] <- out_tax

    # Creating the charset block
    list_nchars3 <- paste0(paste(sapply(numbchar, function(x) unlist(x))),
                           rep("=", times = numberdatset), genenames)

    charsets <- paste(rep("charset", times = numberdatset), genenames, "= ")

    # Calculating the end of each gene with a cumulative sum of each gene size
    endgene <- paste(cumsum(series_numbchar))

    # Calculating the begining of each gene
    startgene <- lapply(as.numeric(endgene), function(x) x+1)

    # Dropping last value in the list
    startgene <- startgene[1:(length(startgene)-1)]

    # Insert the number "1" in the first position of the list
    startgene <- paste(R.utils::insert(startgene, ats = 1, values = 1))

    charsetsblock <- paste0(charsets,
                            startgene,
                            rep("-", times = numberdatset),
                            endgene,
                            paste0("; [", list_nchars3, "]"),
                            sep = "\n")
    charsetsblock <- paste(charsetsblock, collapse = "")

    genes[nrow(genes) + 2, ] <- paste(charsetsblock)

    partition <- paste(paste0("partition regions = ", numberdatset, ": ",
                              paste(genenames, collapse = ", "), ";"),
                       "set partition = regions;",
                       sep = "\n")
    genes[nrow(genes) + 2, ] <- partition

    sitesout <- paste("[IF YOU WANT TO EXCLUDE AN SPECIFIC PARTITION OR a NUMBER OF SITES INSIDE THE MATRIX...]",
                      "[UNBRACKET THE FOLLOWING:]",
                      "[exclude gene1;]",
                      "[exclude 20-30 234-456;] [NOTE THAT YOU ARE DELETING TWO BLOCKS OF SITES IN THE MATRIX]",
                      sep = "\n")
    genes[nrow(genes) + 2, ] <- sitesout

    taxout <- paste("[IF YOU WANT TO DELETE AN SPECIFIC SET OF TAXA OR AN SPECIFIC TAXA...]",
                    "[UNBRACKET THE FOLLOWING:]",
                    "[taxaset genus1 = 10-30;] [HERE YOU HAVE CREATED A TAXASET NAMED AS GENUS1 FOR TAXA/SEQUENCES 10-30]",
                    "[delete = genus1;] [NOW YOU ARE DELETING ALL TAXA/SEQUENCES FROM 10-30]",
                    "[delete = 1 4 8;] [NOW YOU ARE DELETING JUST THE TAXA/SEQUENCES 1, 4 and 8.
                    YOU COULD ALSO USE THE ACTUAL NAME OF THE TAXA/TERMINAL AS IN THE MATRIX INSTEAD OF THEIR POSITION.
                    MAKE SURE THE NAMES OR POSITIONS OF THE TAXA TO BE DELETED ARE NOT SEPARATE BY COMMA]",
                    sep = "\n")
    genes[nrow(genes) + 2, ] <- taxout

    lset <- paste("lset applyto=(1) nst=6 rates=invgamma; [CHANGE THE PARAMETERS ACCORDINGLY]",
                  "lset applyto=(2,3) nst=6 rates=gamma; [CHANGE THE PARAMETERS ACCORDINGLY]",
                  sep="\n")
    genes[nrow(genes) + 2, ] <- lset

    prset <- paste("unlink statefreq=(all) revmat=(all) shape=(all) pinvar=(all); [CHANGE THE PARAMETERS ACCORDINGLY]",
                   "prset applyto=(all) ratepr=variable; [CHANGE THE PARAMETERS ACCORDINGLY]",
                   sep = "\n")
    genes[nrow(genes) + 2, ] <- prset

    mcmc <- paste("[SET YOUR MCMC PARAMETERS ACCORDINGLY]",
                  "mcmc ngen=10000000 printfreq=10000 samplefreq=10000 nchains=8
                  nruns=2 savebrlens=yes filename=WRITE_YOUR_TAXON_comb_Bayesian;",
                  sep = "\n")
    genes[nrow(genes) + 2, ] <- mcmc

    log <- paste("log start filename=WRITE_YOUR_TAXON_comb_Bayesian.log append;")
    genes[nrow(genes) + 2, ] <- log

    sump <- paste("sump;",
                  "sumt relburnin=yes burninfrac=0.25;",
                  "end;",
                  sep = "\n")
    genes[nrow(genes) + 2, ] <- sump
  }

  zz <- file(file, "w")
  write.table(genes, zz,
              append = FALSE, quote = FALSE, sep = " ",
              eol = "\n", na = "", dec = ".", row.names = FALSE,
              col.names = TRUE)

  cat("Gene concatenation is finished!", "",
      sep = "\n")

  close(zz)
}
domingoscardoso/catGenes documentation built on March 14, 2024, 9:21 p.m.