#' @title Create BEAST XML file
#'
#' @description Imports sequences from a file and places them in an XML file
#' generated by BEAUti for use with BEAST.
#'
#' @param input The name of the input file containing the alignment. Works with
#' .nex, .fasta and .phy files
#' @param datatype The type of data used in the alignment input file. Either
#' 'nucleotide', 'aminoacid', or 'binary'.
#' @param template The name of the BEAST input XML file to be used as a
#' template. This file can be produced by BEAUti
#' @param file The name of the file that the resulting XML file will be written
#' to.
#' @param dates The name of the tab delimited file containing tip date data.
#' @param location The name of the tab delimited file containing location data.
#' @param chainLength An integer determining the length of the MCMC chain. If
#' not stated will stay as the value specified in the original template XML
#' file.
#' @param storeEvery An integer determining how often to store the state. If not
#' stated will stay as the value specified in the original template XML file.
#'
#' @export
convert_XML <- function(input, datatype, template, file, dates = NULL, location = NULL, chainLength = NULL, storeEvery = NULL) {
import_sequences <- function(file) {
## Import .nex file ##
import_nex <- function(file) {
# Read file and comvert to dataframe
dataframe <- as.data.frame(scan(file, character(), quiet = TRUE))
# Find where the sequences start
start.line <- grep("matrix", dataframe[, 1], ignore.case = TRUE) + 1
# Find where the sequences end
end.line <- grep("end", dataframe[start.line:length(dataframe[, 1]), 1], ignore.case = TRUE)[1] - 3 +
start.line
# Length of the sequence data
l <- end.line - start.line
# Find the number of sequences
n.tax <- utils::type.convert(gsub(";", "", gsub("ntax=", "", dataframe[grep("ntax=", dataframe[, 1],
ignore.case = TRUE), ], ignore.case = TRUE)))
# ID of the sequences
seq.name <- dataframe[(1:(n.tax)) * 2 + start.line - 2, ]
# Create a list of all the sequences
seq.text <- list()
for (n in 1:n.tax) {
sequence <- paste(dataframe[(1:((l + 1)/(2 * n.tax))) * 2 * n.tax - (2 * n.tax - start.line + 1 -
2 * n), ], collapse = "")
seq.text <- rbind(seq.text, sequence)
}
# Combine ID and sequences into a data frame
nex_data <- suppressWarnings(cbind.data.frame(seq.name, seq.text))
# Return data frame
nex_data
}
# import fasta files
import_fasta <- function(file) {
phylotools::read.fasta(file, clean_name = FALSE)
}
# import phylic files
import_phy <- function(file) {
phylotools::read.phylip(file, clean_name = FALSE)
}
# Find the apporpraite function for the filetype
if (grepl("*.nex$", file) == TRUE) {
data <- import_nex(file)
} else {
if (grepl("*.fasta$", file) == TRUE) {
data <- import_fasta(file)
} else {
if (grepl("*.phy$", file) == TRUE) {
data <- import_phy(file)
} else {
stop("error")
}
}
}
if (length(unique(nchar(as.vector(data[, 2])))) != 1)
stop("Inconsistent sequence lengths")
# remove apostrophes from names
data[, 1] <- gsub("'", "", data[, 1])
# remove NA from end of line
data[, 2] <- gsub("NA$", "", data[, 2])
data
}
# Import sequence data from specified file
data <- import_sequences(input)
# Create a template XML file from specified file
if (grepl("*.xml$", template, ignore.case = TRUE) == TRUE) {
XML_template <- XML::xmlInternalTreeParse(template) #imports xml into internal tree
XML_template <- XML::xmlRoot(XML_template) #converts imported tree into a workable node
} else {
stop("Template is not an XML file")
}
data_attrs <- XML::xmlAttrs(XML_template[["data"]]) #attributes of the node 'data'
data_node <- XML::newXMLNode("data") #creates new node
XML::xmlAttrs(data_node) <- data_attrs #writes attributes to new node
XML::xmlAttrs(data_node)["dataType"] <- datatype
XML::removeChildren(XML_template, "data") #removes node 'data' and all sequences
XML::addChildren(XML_template, data_node, at = 0) #adds new node
# Function to add sequences to an XML node
edit_Sequence <- function(sequence, ...) {
args <- list(...)
id <- args[1]
taxon <- args[2]
totalcount <- args[3]
value <- args[4]
XML::xmlAttrs(sequence)["id"] <- id
XML::xmlAttrs(sequence)["taxon"] <- taxon
XML::xmlAttrs(sequence)["totalcount"] <- totalcount
XML::xmlAttrs(sequence)["value"] <- value
}
if (datatype == "nucleotide") {
count <- 4
} else {
if (datatype == "aminoacid") {
count <- 20
} else {
if (datatype == "binary") {
count <- 2
} else {
stop("Invalid datatype")
}
}
}
# add new sequences to template
for (n in 1:(length(data[, 2]))) {
sequence <- XML::newXMLNode("sequence")
edit_Sequence(sequence, paste("seq_", data[n, 1], sep = ""), paste(data[n, 1]), count, paste(data[n, 2]))
XML::addChildren(XML_template[["data"]], sequence)
}
# Paste dates in template XML file
if (is.null(dates) == FALSE) {
if (mode(XML_template[["run"]][["state"]][["tree"]][["trait"]][["text"]]) != "NULL") {
# Read date data file
dates_data <- utils::read.delim(dates, header = FALSE)
# Paste date data in template XML file
dates_text <- paste(dates_data[1:(length(dates_data[, 1]) - 1), 1], "=", dates_data[1:(length(dates_data[,
2]) - 1), 2], ",\n", sep = "", collapse = "")
dates_text <- paste(dates_text, paste(dates_data[length(dates_data[, 1]), 1], "=", dates_data[length(dates_data[,
2]), 2], "\n", sep = "", collapse = ""), sep = "")
XML_template[["run"]][["state"]][["tree"]][["trait"]][["text"]] <- dates_text
} else {
stop("Template XML does not allow date input")
}
} else {
if (mode(XML_template[["run"]][["state"]][["tree"]][["trait"]][["text"]]) != "NULL") {
dates_text <- paste(data[1:(length(data[, 1]) - 1), 1], "=", "0", ",\n", sep = "", collapse = "")
dates_text <- paste(dates_text, paste(data[length(data[, 1]), 1], "=", "0", "\n", sep = "", collapse = ""),
sep = "")
XML_template[["run"]][["state"]][["tree"]][["trait"]][["text"]] <- dates_text
}
}
# Import location data
if (is.null(location) == FALSE) {
if (mode(XML_template[["run"]][["distribution"]][[2]][[2]][["data"]][["traitSet"]][["text"]]) == "NULL") {
stop("Template XML does not allow location input")
} else {
# Read location data file
loc_data <- utils::read.delim(location, header = FALSE)
# Paste location data in template XML file
loc_text <- paste(loc_data[1:(length(loc_data[, 1]) - 1), 1], "=", loc_data[1:(length(loc_data[, 2]) -
1), 2], ",\n", sep = "", collapse = "")
loc_text <- paste(loc_text, paste(loc_data[length(loc_data[, 1]), 1], "=", loc_data[length(loc_data[,
2]), 2], "\n", sep = "", collapse = ""), sep = "")
XML_template[["run"]][["distribution"]][[2]][[2]][["data"]][["traitSet"]][["text"]] <- loc_text
# Edit XML attributes
loc_names <- unique(sort(loc_data[, 2]))
loc_count <- length(loc_names)
codeMap_attr <- paste(paste(loc_names, "=", 1:loc_count - 1, ",", collapse = "", sep = ""), "? =",
paste(1:loc_count - 1, collapse = " "))
XML::xmlAttrs(XML_template[["run"]][["distribution"]][[2]][[2]][["data"]][["userDataType"]])["codeMap"] <- codeMap_attr
XML::xmlAttrs(XML_template[["run"]][["distribution"]][[2]][[2]][["data"]][["userDataType"]])["states"] <- loc_count
}
}
# change MCMC chainlength
if (is.null(chainLength) == FALSE) {
XML::xmlAttrs(XML_template[["run"]])["chainLength"] <- as.integer(chainLength)
}
# Change store every
if (is.null(storeEvery) == FALSE) {
XML::xmlAttrs(XML_template[["run"]][["state"]])["storeEvery"] <- as.integer(storeEvery)
}
## Replace Names-----------------------------------------------------------------------------------------------
# Extract file name from saved filepath
file_name <- tools::file_path_sans_ext(gsub(".*/", "", file))
# Convert to character string
XML_doc <- XML::toString.XMLNode(XML_template)
# Change .trees output name
XML_doc <- gsub("fileName=(.{,30})\\.trees", paste("fileName=\\\"", file_name, ".trees", sep = ""), XML_doc)
# Change .log output name
XML_doc <- gsub("fileName=(.{,30})\\.log", paste("fileName=\\\"", file_name, ".log", sep = ""), XML_doc)
## Save
## file-----------------------------------------------------------------------------------------------------
write(XML_doc, file = file)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.