#' Writes a PHYLIP-formatted file of fully concatenated gene datasets and an
#' associated partition file for RAxML concatenated phylogenetic analysis
#'
#' @author Domingos Cardoso, Quezia Cavalcante, and Bruno Vilela
#'
#' @description From the resulting list of equally-sized dataframes containing the
#' input individual DNA alignments, as generated by the functions \code{\link{catfullGenes}}
#' or \code{\link{catmultGenes}}, it will write (i) a final PHYLIP-formatted file
#' of fully concatenated dataset and (ii) an associated partition file for
#' the RAxML concatenated phylogenetic analysis using a mixed/partitioned model.
#'
#' @usage
#' writePhylip(x,
#' file,
#' genomics = FALSE,
#' catalignments = TRUE,
#' partitionfile = TRUE)
#'
#' @param x The object to be written, a list of equally-sized dataframes containing
#' the input gene datasets, as generated by the functions \\code{\link{catfullGenes}}
#' or \code{\link{catmultGenes}}.
#'
#' @param file Either a character string naming a file or a \code{\link{connection}}
#' open for writing.
#'
#' @param genomics Logical, if \code{TRUE}, it will be helpful in phylogenomics
#' where all provided species identifiers (e.g. collector number and GenBank
#' acession numbers) will always be kept in the resultant concatenated dataset.
#'
#' @param catalignments Logical, if \code{FALSE} will not write the concatednated
#' matrix of DNA alinments. Better to keep it always as \code{TRUE}.
#'
#' @param partitionfile Logical, if \code{FALSE} will not write the associated
#' text-formatted partition file for the RAxML concatenated phylogenetic analysis
#' using a mixed/partitioned model.
#'
#' @param endgaps.to.miss Logical, if \code{FALSE} the function will not replace
#' terminal GAPs into missing character (?).
#'
#' @seealso \code{\link{catfullGenes}}
#' @seealso \code{\link{catmultGenes}}
#'
#' @examples \dontrun{
#' data(Gaya)
#' df <- catfullGenes(Gaya,
#' shortaxlabel = TRUE,
#' missdata = FALSE,
#' outgroup = "Abutilon_costicalyx")
#'
#' writePhylip(df,
#' file = "filename.phy",
#' genomics = FALSE,
#' catalignments = TRUE,
#' partitionfile = TRUE)
#' }
#'
#' @importFrom dplyr select
#' @importFrom magrittr "%>%"
#' @importFrom R.utils insert
#' @importFrom stringr str_extract_all
#' @importFrom tibble add_column
#' @importFrom tidyr unite
#' @importFrom utils write.table
#'
#' @export
writePhylip <- function(x, file,
genomics = FALSE,
catalignments = TRUE,
partitionfile = TRUE,
endgaps.to.miss = TRUE) {
datset <- .namedlist(x)
numberinputdatset <- length(datset)
# Unlist and rename an input list from genefullcomp function
if (numberinputdatset == 1) {
datset <- unlist(datset, recursive = FALSE)
names(datset) <- gsub(".*[.]", "", names(datset))
for (i in 1:length(datset)) {
datset[[i]]$species <- as.character(datset[[i]]$species)
datset[[i]]$sequence <- as.character(datset[[i]]$sequence)}
}
numberdatset <- length(datset)
if (numberdatset == 1) {
stop("You must provide a file generated by the genefullcomp
Find help also at DBOSLab-UFBA (Domingos Cardoso; cardosobot@gmail.com)")
}
genenames <- names(datset)
cat("You are combining the following gene datasets:", "", sep = "\n")
cat(cat(genenames, sep = ", "), "", sep = "\n")
cat(cat("Total number of datasets:", numberdatset), "", sep = "\n")
numbchar <- lapply(datset, function(x) nchar(x[1, 2]))
series_numbchar <- paste(lapply(numbchar, function(x) unlist(x)))
# Making all separate dataset within the list the same size to transforme into a dataframe with NAs whenever there is no sequence
#datasets <- lapply(datasets, `length<-`, max(lengths(datasets)))
ntax <- paste0(length(rownames(datset[[1]])))
nchartotal <- sum(unlist(numbchar))
dimensions <- paste(ntax, nchartotal, sep = " ")
cat("Your phylip-formatted concantenated dataset will have the following DIMENSIONS:", "", sep = "\n")
cat(paste(ntax, "TAXA"), sep = "\n")
cat(paste(nchartotal, "CHARACTERS"), sep = "\n")
# Finding identifiers that differ among the sequences and across genes
datset_temp <- datset
cf <- lapply(datset, function(x) grepl("_cf_", x[[1]]))
aff <- lapply(datset, function(x) grepl("_aff_", x[[1]]))
spp_temp <- lapply(datset, function(x) gsub("_aff_|_cf_", " ", x[[1]]))
infraspp <- lapply(spp_temp, function(x) grepl("[[:upper:]][[:lower:]]+_[[:lower:]]+_[[:lower:]]+", x))
if (any(unlist(cf))|any(unlist(aff))|any(unlist(infraspp))) {
nr <- .namesTorename(datset,
cf = cf,
aff = aff,
infraspp = infraspp)
# Adjusting species labels when they have cf. or aff.
# Adjusting species names with infraspecific taxa
datset_temp <- .adjustnames(datset_temp,
cf = cf,
aff = aff,
infraspp = infraspp)
}
spp_labels <- lapply(datset_temp, function(x) x[[1]])
# Check if there is any species with identifiers, then delete when
# they differ among sequences in each gene
spp_get <- list()
for (i in seq_along(spp_labels)) {
spp_get[[i]] <- grepl("_[[:lower:]]+_|_[[:lower:]]+[[:upper:]]_|_[[:lower:]]+[[:alnum:]]_", spp_labels[[i]])
}
if (any(unlist(spp_get))) {
if (genomics) {
ntrue <- lapply(spp_get, function(x) sum(x, na.rm = TRUE))
if (any(length(spp_get[[1]]) == ntrue)) {
n <- which(length(spp_get[[1]]) == ntrue)[1]
all_spp_labels <- spp_labels[[n]]
} else {
temp <- list()
for (i in seq_along(spp_get)){
temp[[i]] <- spp_get[[i]][!spp_get[[1]]]
}
ntrue <- lapply(temp, function(x) sum(x, na.rm = TRUE))
n <- which(sum(!spp_get[[1]], na.rm = TRUE) == ntrue)[1]
all_spp_labels <- spp_labels[[n]]
}
for (i in seq_along(datset)) {
datset[[i]][["species"]] <- all_spp_labels
}
} else {
accessions <- list()
difaccessions <- list()
#spp_get <- list()
for (i in seq_along(spp_labels)) {
# Remove all before second underscore
#spp_get[[i]] <- grepl("_[[:lower:]]+_", spp_labels[[i]])
accessions[[i]] <- gsub("^([^_]+_){2}(.+)$", "\\2",
spp_labels[[i]])
}
for (i in 1:(length(spp_labels)-1)) {
for (j in seq_along(spp_labels[[i]])) {
if (spp_labels[[1]][[j]] != spp_labels[[i+1]][[j]]) {
if (spp_get[[1]][[j]] == TRUE) {
spp_labels[[1]][[j]] <- gsub(paste0("_", accessions[[1]][[j]]),
paste0("_", gsub("_.*", "", accessions[[1]][[j]])),
spp_labels[[1]][[j]])
}
}
}
}
for (i in 2:length(spp_labels)) {
for (j in seq_along(spp_labels[[i]])) {
if (spp_labels[[i]][[j]] != spp_labels[[1]][[j]]) {
if (spp_get[[i]][[j]] == TRUE) {
spp_labels[[i]][[j]] <- gsub(paste0("_", accessions[[i]][[j]]),
paste0("_", gsub("_.*", "", accessions[[i]][[j]])),
spp_labels[[i]][[j]])
}
}
}
}
for (i in seq_along(datset)) {
datset[[i]][["species"]] <- spp_labels[[i]]
}
# Get number of missing data "?" in each sequence and then erase identifiers
# in non-duplicated species from all gene datasets
missdata <- vector()
for (j in seq_along(datset)) {
for (i in seq_along(datset[[j]][["species"]])) {
missdata[i] <- length(stringr::str_extract_all(datset[[j]][["sequence"]][i], "[?]", simplify = FALSE)[[1]])
}
datset[[j]] <- tibble::add_column(datset[[j]], missites = missdata, .after = "species")
}
datset_temp <- lapply(datset, .shortaxlabels)
dupp <- c(duplicated(datset_temp[[1]][,"species"], fromLast = TRUE) |
duplicated(datset_temp[[1]][,"species"]))
if(any(dupp)){
n <- vector()
for (j in seq_along(datset)) {
for (i in seq_along(datset[[j]][["species"]][!dupp])) {
#lapply(datset, function(x) x[["species"]][!dupp][i])
if (any(lapply(datset, function(x) x[["missites"]][!dupp][i]) %in%
lapply(datset, function(x) nchar(x[["sequence"]][1]))) &
!all(lapply(datset, function(x) x[["missites"]][!dupp][i]) %in%
lapply(datset, function(x) nchar(x[["sequence"]][1])))) {
n[i] <- as.vector(unlist(lapply(datset, function(x) gsub("(_[^_]+)_.*", "\\1", x[["species"]][!dupp][i]))[1]))
# I need to now how to subset a list of dataframes using a list of TRUE/FALSE
}
}
datset[[j]] <- datset[[j]] %>% select(c("species", "sequence"))
}
# Changing terminal names in just the first dataset, since we will not use names from the other datasets
g <- grepl(paste0(n[!is.na(n)], collapse = "|"), datset[[1]][["species"]][!dupp])
datset[[1]][["species"]][!dupp][g] <- gsub("(_[^_]+)_.*", "\\1", datset[[1]][["species"]][!dupp][g])
} else {
n <- gsub("(_[^_]+)_.*", "\\1", datset[[1]][["species"]])
for (j in 2:length(datset)) {
g <- datset[[j]][["species"]] %in% datset[[1]][["species"]]
datset[[1]][["species"]][!g] <- n[!g]
datset[[1]] <- datset[[1]] %>% select(c("species", "sequence"))
datset[[j]] <- datset[[j]] %>% select(c("species", "sequence"))
}
}
}
# Putting back the names under cf. and aff.
# Adjusting names with infraspecific taxa
datset <- .namesback(datset,
cf = cf,
aff = aff,
infraspp = infraspp,
rename_cf = nr[["rename_cf"]],
rename_aff = nr[["rename_aff"]],
rename_infraspp = nr[["rename_infraspp"]],
shortaxlabel = FALSE,
multispp = FALSE)
}
# Calculating the space between the taxon labels and corresponding DNA sequence for each dataset
# First calculate how many letters in each taxon in each dataset
letrs_taxlabs <- lapply(datset, function(x) nchar(x$species))
# Setting the maximum of space based on the largest taxonlabel
maxletrs_taxlabs <- lapply(letrs_taxlabs, function(x) max(x)+5) # Just increase this last number if we want to add more space
maxletrs_taxlabs <- max(unlist(maxletrs_taxlabs))
# We can also pad a string inside a list of dataframes by adding numbers or names at any position in the specific column
f_b <- function(x, y) {
for (i in 1:length(x$species)) {
x$species[i] <- paste0(x$species[i], paste0(rep(" ", y - nchar(x$species[i])),
collapse = ""))
}
x
}
datset <- lapply(datset, f_b,
y = maxletrs_taxlabs)
# Replace terminal GAPs into missing character (?)
if (endgaps.to.miss) {
datset <- lapply(datset, .replace_terminal_gaps)
}
# Uniting the two columns inside just the first dataframe
datset[[1]] <- tidyr::unite(datset[[1]], "sequences", colnames(datset[[1]]), sep = " ")
# Deleting the species collumn in the remaining dataset
for (i in 2:numberdatset){
datset[[i]] <- data.frame(sequences=datset[[i]][,2])
}
ex_datset <- do.call("cbind", datset)
names(ex_datset) <- paste("sequences", 1:length(ex_datset), sep = "")
# Uniting all columns
genes <- tidyr::unite(ex_datset, "sequences", sep = "")
genes$sequences <- as.character(genes$sequences)
cat("Concatenating the genes...", "", sep = "\n")
#n <- dim(genes)[1]
#genes <- genes[1:(n-3),] # To remove few paragraphs after the matrix block and before the "; END;"
colnames(genes) <- paste(dimensions,
sep = "\n")
if (catalignments) {
# Writing the concatenated dataset
zz <- file(file, "w")
write.table(genes, zz,
append = FALSE, quote = FALSE, sep = " ",
eol = "\n", na = "", dec = ".", row.names = FALSE,
col.names = TRUE)
close(zz)
}
# Creating the txt file for a mixed/partitioned model in RAxML
partitionnames <- paste0("gene", 1:length(genenames))
charsets <- paste(rep("DNA,", times = numberdatset), partitionnames, "= ")
# Calculating the end of each gene with a cumulative sum of each gene size
endgene <- paste(cumsum(series_numbchar))
# Calculating the begining of each gene
startgene <- lapply(as.numeric(endgene), function(x) x+1)
# Dropping last value in the list
startgene <- startgene[1:(length(startgene)-1)]
# Insert the number "1" in the first position of the list
startgene <- paste(R.utils::insert(startgene, ats = 1, values = 1))
partitions <- paste0(charsets,
startgene,
rep("-", times = numberdatset),
endgene,
sep = "\n")
partitions <- paste(partitions, collapse = "")
partitions <- data.frame(partitions)
if (partitionfile) {
# Writing the partition file for RAxML concatenated analysis
if (catalignments) {
zy <- file(paste0(sub("[.].*", "", file), "_partition_file.txt"), "w")
} else {
zy <- file(file, "w")
}
write.table(partitions, zy,
append = FALSE, quote = FALSE, sep = " ",
eol = "\n", na = "", dec = ".", row.names = FALSE,
col.names = FALSE)
close(zy)
}
cat("Gene concatenation is finished!", "",
sep = "\n")
if (!catalignments & partitionfile) {
cat("But you downloaded just the partition file...",
sep = "\n")
cat("Run the function again, making sure to have \"catalignments = TRUE\".",
sep = "\n")
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.