##' This function takes an alignment (with partition information if
##' needed), a tree, and simulate datasets under a given model
##' according to the guide tree provided. Each partition is optimized
##' on the guide tree for all the model parameters (transition rates,
##' base frequencies, gamma parameter, and proportion of invariants).
##'
##' This function is used for its side effect, as it generates
##' replicated simulated datasets.
##' @title Generate Simulated alignements
##' @param algfile The original alignment file, can be in any format
##' supported by ape:::read.dna
##' @param partfile The partition file (at this stage only RAxML
##' partition files are supported)
##' @param tree A tree file in Newick format
##' @param model The model of molecular evolution to be used for
##' parameter estimation on each parameters (at this stage, only
##' \dQuote{\code{GTRGAMMA}} and \dQuote{\code{GTRGAMMAI}} are
##' supported).
##' @param nreps Number of replicated simulated datasets to generate.
##' @param formatin The alignment format needs to be a format
##' supported by ape::read.dna.
##' @param pathFiles Character string indicating the path where the
##' data files (\code{algfile}, \code{partfile}, \code{tree}) as well
##' as \code{seq-gen} are located. Default if current working
##' directory.
##' @param pathResults Character string indicating the path where the
##' files generated by this function will be written. Default is a
##' subdirectory of \code{pathFiles} named \code{simDataGrouped}.
##' @param raxmlCmd Which RAxML flavor do you want to use? (default
##' raxmlHPC-PTHREADS-SSE).
##' @param raxmlArg Additional arguments to be passed to RAxML.
##' @return Invisibly returns some (possibly) useful location of files
##' used and generated by this sequence.
##' @author Francois Michonneau
##' @importFrom chopper raxmlPartitionInfo
##' @importFrom chopper cutAlignment
##' @importFrom chopper removeEmptySeqs
##' @importFrom ape read.tree
##' @importFrom ape read.dna
##' @importFrom ape write.tree
##' @importFrom ape drop.tip
##' @export
generateAlignments <- function(algfile, partfile, tree, model, nreps,
formatin = "sequential", pathFiles = ".",
pathResults = file.path(pathFiles, "simDataGrouped"),
raxmlCmd = "raxmlHPC-PTHREADS-SSE3", raxmlArg = "-T7") {
## TODO
## - create tests to make sure that there are no files that match
## the regexpr used in the code before we start
## - possibility of changing seq-gen output format
owd <- getwd()
setwd(pathFiles)
stopifnot(file.exists("seq-gen"))
model <- match.arg(model, c("GTRGAMMA", "GTRGAMMAI"))
## Cut Alignment
chopper::cutAlignment(algfile=algfile, partfile=partfile, formatin=formatin,
format="sequential", colw=10000, colsep="")
## Find the individual partition files
locNmL <- names(chopper::raxmlPartitionInfo(partfile))
locNm <- paste(locNmL, collapse="|")
pttrn <- paste("_(", locNm, ")\\.phy$", sep="")
lAlg <- list.files(pattern=pttrn, path = dirname(algfile), full.names = TRUE)
constTr <- ape::read.tree(file=tree)
for (i in 1:length(lAlg)) {
algF <- lAlg[i]
outF <- paste(basename(algF), ".out", sep="")
testRun <- list.files(pattern=paste("RAxML_.+\\.\\Q", outF, "\\E$", sep=""))
if (length(testRun)) {
message("File already exist, no need to run RAxML again")
next
}
else {
## Remove empty sequences from individual loci
nRm <- chopper::removeEmptySeqs(algF, overwrite = TRUE,
formatin="sequential",
formatout="sequential",
colw = 10000000, colsep = "")
if (length(nRm)) {
message("\nFor ", algF, " these sequences were removed:",
cat(nRm, sep="\n"),"\n")
}
## Drop tips for missing sequence for each individual loci
tmpAlg <- ape::read.dna(file=algF, format="sequential")
toDrop <- constTr$tip.label[! constTr$tip.label %in% dimnames(tmpAlg)[[1]]]
tmpTr <- ape::drop.tip(constTr, toDrop)
ape::write.tree(tmpTr, file=paste(algF, "tree", sep=""))
## Estimate parameters for each partition using guide/constrained
## tree for each partition.
treF <- paste(algF, "tree", sep="")
cmd <- paste(raxmlCmd, "-s", algF, "-n", outF, "-m", model,
"-f e -t", treF, raxmlArg)
system(cmd)
}
}
## Generate Random sequences
lInfoFiles <- list.files(pattern="info(.+)out$")
for (i in 1:length(lInfoFiles)) {
execSeqGen(getParams(lInfoFiles[i]), nreps=nreps, outputFormat="relaxed",
pathSimData=pathResults)
}
resFiles <- list(lAlg = lAlg, lInfo = lInfoFiles)
setwd(owd)
invisible(resFiles)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.