#' Writes a concatenated (non-interleaved) or combined (interleaved) NEXUS-formatted
#' dataset for multigene phylogenetic analysis
#'
#' @author Domingos Cardoso, Quezia Cavalcante, and Bruno Vilela
#'
#' @description Writes a final concatenated (non-interleaved) or combined (interleaved)
#' NEXUS-formatted dataset for multigene phylogenetic analysis, from the resulting list
#' of equally-sized dataframes containing the input individual DNA alignments, as generated by
#' the functions \code{\link{catfullGenes}} or \code{\link{catmultGenes}}. Depending
#' on the chosen arguments the final NEXUS-formatted file will or will not include
#' a preliminary command block (including the charset of each partition).
#'
#' @usage
#' writeNexus(x,
#' file,
#' genomics = FALSE,
#' interleave = TRUE,
#' bayesblock = TRUE,
#' endgaps.to.miss = TRUE)
#'
#' @param x The object to be written, a list of equally-sized dataframes containing
#' the input gene datasets, as generated by \\code{\link{catfullGenes}} or
#' \code{\link{catmultGenes}}.
#'
#' @param file Either a character string naming a file or a \code{\link{connection}}
#' open for writing.
#'
#' @param genomics Logical, if \code{TRUE}, it will be helpful in phylogenomics
#' where all provided species identifiers (e.g. collector number and GenBank
#' acession numbers) will be kept in the resultant concatenated dataset. This means
#' the concatenated dataset will never be written with species including brackted
#' comments for identifiers that differ because at least one species is missing
#' data for any gene.
#'
#' @param interleave Logical, if \code{FALSE} will fully concatenate the DNA alignments
#' without individually distinguishing each gene dataset as separate named partitions
#' within the complete matrix.
#'
#' @param bayesblock Logical, if \code{FALSE} will not create a preliminary Mr.Bayes
#' command block including the charsets showing the number of characters of each
#' individual gene partition.
#'
#' @param endgaps.to.miss Logical, if \code{FALSE} the function will not replace
#' terminal GAPs into missing character (?).
#'
#' @seealso \code{\link{catfullGenes}}
#' @seealso \code{\link{catmultGenes}}
#'
#' @examples \dontrun{
#' data(Gaya)
#' df <- catfullGenes(Gaya,
#' shortaxlabel = TRUE,
#' missdata = FALSE,
#' outgroup = "Abutilon_costicalyx")
#'
#' writeNexus(df,
#' file = "filename.nex",
#' genomics = FALSE,
#' interleave = TRUE,
#' bayesblock = TRUE)
#' }
#'
#' @importFrom dplyr bind_rows
#' @importFrom R.utils insert
#' @importFrom tidyr unite
#' @importFrom utils write.table
#'
#' @export
writeNexus <- function(x, file,
genomics = FALSE,
interleave = TRUE,
bayesblock = TRUE,
endgaps.to.miss = TRUE) {
datset <- .namedlist(x)
numberinputdatset <- length(datset)
# Unlist and rename an input list from genefullcomp function
if (numberinputdatset == 1) {
datset <- unlist(datset, recursive = FALSE)
names(datset) <- gsub(".*[.]", "", names(datset))
for (i in 1:length(datset)) {
datset[[i]]$species <- as.character(datset[[i]]$species)
datset[[i]]$sequence <- as.character(datset[[i]]$sequence)}
}
numberdatset <- length(datset)
if (numberdatset == 1) {
stop("You must provide a file generated by the genefullcomp
Find help also at DBOSLab-UFBA (Domingos Cardoso; cardosobot@gmail.com)")
}
genenames <- names(datset)
cat("You are combining the following gene datasets:", "", sep = "\n")
cat(cat(genenames, sep = ", "), "", sep = "\n")
cat(cat("Total number of datasets:", numberdatset), "", sep = "\n")
numbchar <- lapply(datset, function(x) nchar(x[1, 2]))
# Making all separate dataset within the list the same size to transforme into a dataframe with NAs whenever there is no sequence
#datasets <- lapply(datasets, `length<-`, max(lengths(datasets)))
writtenby <- paste("[catGenes (DBOSLab-UFBA), ", date(), "]\n\n", sep = "")
nexus <- paste("#NEXUS", "", sep = "\n")
begindata <- paste("BEGIN DATA;")
dimname <- paste("DIMENSIONS")
ntax <- paste0("NTAX=", length(rownames(datset[[1]])))
total_numbchar <- sum(unlist(numbchar))
series_numbchar <- paste(lapply(numbchar, function(x) unlist(x)))
list_nchars1 <- paste0(genenames,
rep("=", times = numberdatset),
series_numbchar)
list_nchars2 <- paste(list_nchars1, collapse = " + ")
nchartotal <- paste0("NCHAR=",
total_numbchar, "; [",
list_nchars2,
"]")
dimensions <- paste(dimname, ntax, nchartotal, sep = " ")
formatinter <- paste("FORMAT DATATYPE=DNA GAP=- MISSING=? INTERLEAVE=YES;")
format <- paste("FORMAT DATATYPE=DNA GAP=- MISSING=? INTERLEAVE=NO;")
cat("Your final dataset will have these DIMENSIONS:", "", sep = "\n")
cat(nchartotal, "", sep = "\n")
cat(ntax, "", sep = "\n")
matrix <- paste("MATRIX", "", sep = "\n")
# Finding identifiers that differ among the sequences and across genes
datset_temp <- datset
cf <- lapply(datset, function(x) grepl("_cf_", x[[1]]))
aff <- lapply(datset, function(x) grepl("_aff_", x[[1]]))
spp_temp <- lapply(datset, function(x) gsub("_aff_|_cf_", " ", x[[1]]))
infraspp <- lapply(spp_temp, function(x) grepl("[[:upper:]][[:lower:]]+_[[:lower:]]+_[[:lower:]]+", x))
if (any(unlist(cf))|any(unlist(aff))|any(unlist(infraspp))) {
nr <- .namesTorename(datset,
cf = cf,
aff = aff,
infraspp = infraspp)
# Adjusting species labels when they have cf. or aff.
# Adjusting species names with infraspecific taxa
datset_temp <- .adjustnames(datset_temp,
cf = cf,
aff = aff,
infraspp = infraspp)
}
spp_labels <- lapply(datset_temp, function(x) x[[1]])
# Check if there is any species with identifiers, then put in brackets when
# they differ among sequences in each gene
spp_get <- list()
for (i in seq_along(spp_labels)) {
spp_get[[i]] <- grepl("_[[:lower:]]+_|_[[:lower:]]+[[:upper:]]_|_[[:lower:]]+[[:alnum:]]_", spp_labels[[i]])
}
if (any(unlist(spp_get))) {
if (genomics) {
ntrue <- lapply(spp_get, function(x) sum(x, na.rm = TRUE))
if (any(length(spp_get[[1]]) == ntrue)) {
n <- which(length(spp_get[[1]]) == ntrue)[1]
all_spp_labels <- spp_labels[[n]]
} else {
temp <- list()
for (i in seq_along(spp_get)){
temp[[i]] <- spp_get[[i]][!spp_get[[1]]]
}
ntrue <- lapply(temp, function(x) sum(x, na.rm = TRUE))
n <- which(sum(!spp_get[[1]], na.rm = TRUE) == ntrue)[1]
all_spp_labels <- spp_labels[[n]]
}
for (i in seq_along(datset)) {
datset[[i]][["species"]] <- all_spp_labels
}
} else {
accessions <- list()
difaccessions <- list()
#spp_get <- list()
for (i in seq_along(spp_labels)) {
# Remove all before second underscore
#spp_get[[i]] <- grepl("_[[:lower:]]+_", spp_labels[[i]])
accessions[[i]] <- gsub("^([^_]+_){2}(.+)$", "\\2",
spp_labels[[i]])
}
for (i in 1:(length(spp_labels)-1)) {
for (j in seq_along(spp_labels[[i]])) {
if (spp_labels[[1]][[j]] != spp_labels[[i+1]][[j]]) {
if (spp_get[[1]][[j]] == TRUE) {
spp_labels[[1]][[j]] <- gsub(paste0("_", accessions[[1]][[j]]),
paste0(" [", accessions[[1]][[j]], "]"),
spp_labels[[1]][[j]])
}
}
}
}
for (i in 2:length(spp_labels)) {
for (j in seq_along(spp_labels[[i]])) {
if (spp_labels[[i]][[j]] != spp_labels[[1]][[j]]) {
if (spp_get[[i]][[j]] == TRUE) {
spp_labels[[i]][[j]] <- gsub(paste0("_", accessions[[i]][[j]]),
paste0(" [", accessions[[i]][[j]], "]"),
spp_labels[[i]][[j]])
}
}
}
}
for (i in seq_along(datset)) {
datset[[i]][["species"]] <- spp_labels[[i]]
}
datset_temp <- lapply(datset_temp, .shortaxlabels)
dupp <- list()
for (i in seq_along(datset_temp)) {
dupp[[i]] <- c(duplicated(datset_temp[[i]][,"species"], fromLast = TRUE) |
duplicated(datset_temp[[i]][,"species"]))
}
if (any(dupp[[1]] == TRUE)) {
for (i in seq_along(datset_temp)) {
f <- grepl("\\s", datset[[i]][["species"]][dupp[[i]]])
g <- gsub(".*?\\s", "", datset[[i]][["species"]][dupp[[i]]][f])
h <- grepl("[_]", g)
if (any(h)) {
ns <- gsub("\\s.*", "", datset[[i]][["species"]][dupp[[i]]][f][h])
g[h] <- sub("_", " [", g[h]) # Remove first occurrence of underscore in a string
g[h] <- gsub("^[[]", "_", g[h])
datset[[i]][["species"]][dupp[[i]]][f][h] <- paste0(ns, g[h])
datset[[i]][["species"]][dupp[[i]]][f][!h] <-
gsub("\\s", "_", datset[[i]][["species"]][dupp[[i]]][f][!h])
datset[[i]][["species"]][dupp[[i]]][f][!h] <-
gsub("[[]|[]]", "", datset[[i]][["species"]][dupp[[i]]][f][!h])
} else {
datset[[i]][["species"]][dupp[[i]]][f] <-
gsub("\\s", "_", datset[[i]][["species"]][dupp[[i]]][f])
datset[[i]][["species"]][dupp[[i]]][f] <-
gsub("[[]|[]]", "", datset[[i]][["species"]][dupp[[i]]][f])
}
}
}
}
# Putting back the names under cf. and aff.
# Adjusting names with infraspecific taxa
datset <- .namesback(datset,
cf = cf,
aff = aff,
infraspp = infraspp,
rename_cf = nr[["rename_cf"]],
rename_aff = nr[["rename_aff"]],
rename_infraspp = nr[["rename_infraspp"]],
shortaxlabel = FALSE,
multispp = FALSE)
}
# Calculating the space between the taxon labels and corresponding DNA sequence for each dataset
# First calculate how many letters there are in each taxon in each dataset
letrs_taxlabs <- lapply(datset, function(x) nchar(x$species))
# Setting the maximum of space based on the largest taxonlabel
maxletrs_taxlabs <- lapply(letrs_taxlabs, function(x) max(x)+5) #just increase this last number if we want to add more space
maxletrs_taxlabs <- max(unlist(maxletrs_taxlabs))
# Pad a string inside a list of dataframes by adding numbers or names at any
# position in the specific column.
f_b <- function(x, y) {
for (i in 1:length(x$species)) {
x$species[i] <- paste0(x$species[i], paste0(rep(" ", y - nchar(x$species[i])),
collapse = ""))
}
x
}
datset <- lapply(datset, f_b,
y = maxletrs_taxlabs)
# Replace terminal GAPs into missing character (?)
if (endgaps.to.miss) {
datset <- lapply(datset, .replace_terminal_gaps)
}
if (interleave) {
cat("Combining genes as interleave...", "", sep = "\n")
# Uniting the two columns inside a list of dataframes
datset <- lapply(datset, function(x) tidyr::unite(x, "sequences", colnames(x), sep = " "))
# Creating the scale of char length for each dataset
datset <- .charScale(datset,
numbchar = numbchar,
interleave = interleave,
genenames = genenames)
# Adding the gene name and two extra spaces below each gene dataset
ex_datset <- mapply(function(x, y) rbind(y, x, "", ""),
x = datset, y = paste0("[", as.list(genenames), "]"),
SIMPLIFY = FALSE)
# Combining the dataframes inside the list into a single dataframe
genes <- data.frame(dplyr::bind_rows(ex_datset, .id = NULL)) # Somehow it was not working with .id="sequences" so I had to exclude the next line
#genes <- data.frame(genes[, 2])
colnames(genes) <- "sequences"
genes$sequences <- as.character(genes$sequences)
# Writing the data command block
#n <- dim(genes)[1]
#genes <- genes[1:(n-3),] # To remove few paragraphs after the matrix block and before the "; END;"
end <- paste(";","END;", sep = "\n")
genes[nrow(genes) + 1, ] <- end
colnames(genes) <- paste(nexus,
writtenby,
begindata,
dimensions,
formatinter,
matrix,
sep = "\n")
} else {
cat("Concatenating genes in full, non-interleaved alignment...", "", sep = "\n")
# Uniting the two columns inside just the first dataframe
datset[[1]] <- tidyr::unite(datset[[1]], "sequences", colnames(datset[[1]]), sep = " ")
# Deleting the species column in the remaining dataset
for (i in 2:numberdatset) {
datset[[i]] <- data.frame(sequences=datset[[i]][,2])
}
ex_datset <- do.call("cbind", datset)
names(ex_datset) <- paste("sequences", 1:length(ex_datset), sep = "")
# Uniting all columns
genes <- tidyr::unite(ex_datset, "sequences", sep = "")
genes$sequences <- as.character(genes$sequences)
# Creating the scale of char length for the entire concatenated matrix
genes <- .charScale(genes,
numbchar = numbchar,
interleave = interleave,
genenames = genenames)
# Writing the data command block
#n <- dim(genes)[1]
#genes <- genes[1:(n-3),] #to remove few paragraphs after the matrix block and before the "; END;"
end <- paste(";","END;", sep = "\n")
genes[nrow(genes) + 1, ] <- end
colnames(genes) <- paste(nexus,
writtenby,
begindata,
dimensions,
format,
matrix,
sep = "\n")
}
if (bayesblock) {
cat(cat("Building a preliminary Mr.Bayes command block...", "", sep = "\n"))
genes[nrow(genes) + 2, ] <- ""
mrbayes <- paste("begin mrbayes;",
"set autoclose=yes;",
sep = "\n")
genes[nrow(genes) + 2, ] <- mrbayes
out_tax <- paste("outgroup WRITE_YOUR_OUTGROUP;",
sep = "\n")
genes[nrow(genes) + 2, ] <- out_tax
# Creating the charset block
list_nchars3 <- paste0(paste(sapply(numbchar, function(x) unlist(x))),
rep("=", times = numberdatset), genenames)
charsets <- paste(rep("charset", times = numberdatset), genenames, "= ")
# Calculating the end of each gene with a cumulative sum of each gene size
endgene <- paste(cumsum(series_numbchar))
# Calculating the begining of each gene
startgene <- lapply(as.numeric(endgene), function(x) x+1)
# Dropping last value in the list
startgene <- startgene[1:(length(startgene)-1)]
# Insert the number "1" in the first position of the list
startgene <- paste(R.utils::insert(startgene, ats = 1, values = 1))
charsetsblock <- paste0(charsets,
startgene,
rep("-", times = numberdatset),
endgene,
paste0("; [", list_nchars3, "]"),
sep = "\n")
charsetsblock <- paste(charsetsblock, collapse = "")
genes[nrow(genes) + 2, ] <- paste(charsetsblock)
partition <- paste(paste0("partition regions = ", numberdatset, ": ",
paste(genenames, collapse = ", "), ";"),
"set partition = regions;",
sep = "\n")
genes[nrow(genes) + 2, ] <- partition
sitesout <- paste("[IF YOU WANT TO EXCLUDE AN SPECIFIC PARTITION OR a NUMBER OF SITES INSIDE THE MATRIX...]",
"[UNBRACKET THE FOLLOWING:]",
"[exclude gene1;]",
"[exclude 20-30 234-456;] [NOTE THAT YOU ARE DELETING TWO BLOCKS OF SITES IN THE MATRIX]",
sep = "\n")
genes[nrow(genes) + 2, ] <- sitesout
taxout <- paste("[IF YOU WANT TO DELETE AN SPECIFIC SET OF TAXA OR AN SPECIFIC TAXA...]",
"[UNBRACKET THE FOLLOWING:]",
"[taxaset genus1 = 10-30;] [HERE YOU HAVE CREATED A TAXASET NAMED AS GENUS1 FOR TAXA/SEQUENCES 10-30]",
"[delete = genus1;] [NOW YOU ARE DELETING ALL TAXA/SEQUENCES FROM 10-30]",
"[delete = 1 4 8;] [NOW YOU ARE DELETING JUST THE TAXA/SEQUENCES 1, 4 and 8.
YOU COULD ALSO USE THE ACTUAL NAME OF THE TAXA/TERMINAL AS IN THE MATRIX INSTEAD OF THEIR POSITION.
MAKE SURE THE NAMES OR POSITIONS OF THE TAXA TO BE DELETED ARE NOT SEPARATE BY COMMA]",
sep = "\n")
genes[nrow(genes) + 2, ] <- taxout
lset <- paste("lset applyto=(1) nst=6 rates=invgamma; [CHANGE THE PARAMETERS ACCORDINGLY]",
"lset applyto=(2,3) nst=6 rates=gamma; [CHANGE THE PARAMETERS ACCORDINGLY]",
sep="\n")
genes[nrow(genes) + 2, ] <- lset
prset <- paste("unlink statefreq=(all) revmat=(all) shape=(all) pinvar=(all); [CHANGE THE PARAMETERS ACCORDINGLY]",
"prset applyto=(all) ratepr=variable; [CHANGE THE PARAMETERS ACCORDINGLY]",
sep = "\n")
genes[nrow(genes) + 2, ] <- prset
mcmc <- paste("[SET YOUR MCMC PARAMETERS ACCORDINGLY]",
"mcmc ngen=10000000 printfreq=10000 samplefreq=10000 nchains=8
nruns=2 savebrlens=yes filename=WRITE_YOUR_TAXON_comb_Bayesian;",
sep = "\n")
genes[nrow(genes) + 2, ] <- mcmc
log <- paste("log start filename=WRITE_YOUR_TAXON_comb_Bayesian.log append;")
genes[nrow(genes) + 2, ] <- log
sump <- paste("sump;",
"sumt relburnin=yes burninfrac=0.25;",
"end;",
sep = "\n")
genes[nrow(genes) + 2, ] <- sump
}
zz <- file(file, "w")
write.table(genes, zz,
append = FALSE, quote = FALSE, sep = " ",
eol = "\n", na = "", dec = ".", row.names = FALSE,
col.names = TRUE)
cat("Gene concatenation is finished!", "",
sep = "\n")
close(zz)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.