R/generateAlignments.R

##' This function takes an alignment (with partition information if
##' needed), a tree, and simulate datasets under a given model
##' according to the guide tree provided. Each partition is optimized
##' on the guide tree for all the model parameters (transition rates,
##' base frequencies, gamma parameter, and proportion of invariants).
##'
##' This function is used for its side effect, as it generates
##' replicated simulated datasets.
##' @title Generate Simulated alignements
##' @param algfile The original alignment file, can be in any format
##' supported by ape:::read.dna
##' @param partfile The partition file (at this stage only RAxML
##' partition files are supported)
##' @param tree A tree file in Newick format
##' @param model The model of molecular evolution to be used for
##' parameter estimation on each parameters (at this stage, only
##' \dQuote{\code{GTRGAMMA}} and \dQuote{\code{GTRGAMMAI}} are
##' supported).
##' @param nreps Number of replicated simulated datasets to generate.
##' @param formatin The alignment format needs to be a format
##' supported by ape::read.dna.
##' @param pathFiles Character string indicating the path where the
##' data files (\code{algfile}, \code{partfile}, \code{tree}) as well
##' as \code{seq-gen} are located. Default if current working
##' directory.
##' @param pathResults Character string indicating the path where the
##' files generated by this function will be written. Default is a
##' subdirectory of \code{pathFiles} named \code{simDataGrouped}.
##' @param raxmlCmd Which RAxML flavor do you want to use? (default
##' raxmlHPC-PTHREADS-SSE).
##' @param raxmlArg Additional arguments to be passed to RAxML.
##' @return Invisibly returns some (possibly) useful location of files
##' used and generated by this sequence.
##' @author Francois Michonneau
##' @importFrom chopper raxmlPartitionInfo
##' @importFrom chopper cutAlignment
##' @importFrom chopper removeEmptySeqs
##' @importFrom ape read.tree
##' @importFrom ape read.dna
##' @importFrom ape write.tree
##' @importFrom ape drop.tip
##' @export
generateAlignments <- function(algfile, partfile, tree, model, nreps,
                               formatin = "sequential", pathFiles = ".",
                               pathResults = file.path(pathFiles, "simDataGrouped"),
                               raxmlCmd = "raxmlHPC-PTHREADS-SSE3", raxmlArg = "-T7") {
    ## TODO
    ## - create tests to make sure that there are no files that match
    ##   the regexpr used in the code before we start
    ## - possibility of changing seq-gen output format

    owd <- getwd()
    setwd(pathFiles)

    stopifnot(file.exists("seq-gen"))
    model <- match.arg(model, c("GTRGAMMA", "GTRGAMMAI"))

    ## Cut Alignment
    chopper::cutAlignment(algfile=algfile, partfile=partfile, formatin=formatin,
                          format="sequential", colw=10000, colsep="")

    ## Find the individual partition files
    locNmL <- names(chopper::raxmlPartitionInfo(partfile))
    locNm <- paste(locNmL, collapse="|")
    pttrn <- paste("_(", locNm, ")\\.phy$", sep="")
    lAlg <- list.files(pattern=pttrn, path = dirname(algfile), full.names = TRUE)

    constTr <- ape::read.tree(file=tree)
    for (i in 1:length(lAlg)) {
        algF <- lAlg[i]
        outF <- paste(basename(algF), ".out", sep="")
        testRun <- list.files(pattern=paste("RAxML_.+\\.\\Q", outF, "\\E$", sep=""))
        if (length(testRun)) {
            message("File already exist, no need to run RAxML again")
            next
        }
        else {
            ## Remove empty sequences from individual loci
            nRm <- chopper::removeEmptySeqs(algF, overwrite = TRUE,
                                            formatin="sequential",
                                            formatout="sequential",
                                            colw = 10000000, colsep = "")
            if (length(nRm)) {
                message("\nFor ", algF, " these sequences were removed:",
                        cat(nRm, sep="\n"),"\n")
            }

            ## Drop tips for missing sequence for each individual loci
            tmpAlg <- ape::read.dna(file=algF, format="sequential")
            toDrop <- constTr$tip.label[! constTr$tip.label %in% dimnames(tmpAlg)[[1]]]
            tmpTr <- ape::drop.tip(constTr, toDrop)
            ape::write.tree(tmpTr, file=paste(algF, "tree", sep=""))

            ## Estimate parameters for each partition using guide/constrained
            ## tree for each partition.
            treF <- paste(algF, "tree", sep="")
            cmd <- paste(raxmlCmd, "-s", algF, "-n", outF, "-m", model,
                         "-f e -t", treF, raxmlArg)
            system(cmd)
        }
    }

    ## Generate Random sequences
    lInfoFiles <- list.files(pattern="info(.+)out$")
    for (i in 1:length(lInfoFiles)) {
        execSeqGen(getParams(lInfoFiles[i]), nreps=nreps, outputFormat="relaxed",
                   pathSimData=pathResults)
    }

    resFiles <- list(lAlg = lAlg, lInfo = lInfoFiles)

    setwd(owd)
    invisible(resFiles)
}
fmichonneau/rsowh documentation built on May 16, 2019, 1:47 p.m.