R/concatenateAlignments.R

##' Concenate marker alignements to generate a single alignment.
##'
##' Given a series of files generated by \sQuote{mergeSeq}, this
##' function creates an alignment from these files. WARNING: the files
##' for the independent loci are assumed to be created using the
##' function mergeSeq, meaning that the loci is indicated in 3rd
##' position, separated by an hyphen e.g. 20120808-101010-COI.fas
##' @title Concatenate alignments
##' @param pattern character string to identify aligned files in folder
##' @param path path where the alignment files are to be found
##' @param output file name (with full path) for the concatenated file
##' @param input.format format of the input DNA alignment (to be
##' passed to ape:::read.dna)
##' @param partition if NULL no partition file is created, otherwise a
##' character string indicating the file name where to write the
##' partitions
##' @param partition.format which format should be used to create the
##' partition file (raxml or nexus)
##' @param create.conc if TRUE create the concatenated file, if FALSE
##' useful for testing (dry run) or to create partition file
##' @param standardize not needed if alignments created by
##' \sQuote{mergeSeq}. Otherwise the alignments names are standardized
##' for each of the individual markers by generating empty sequences.
##' @param drop a character vector indicating the sequences that need
##' to be removed from the output
##' @param ... further arguments to be passed to write.dna for the
##' output file
##' @return This function is used for its side effect of creating an
##' alignment from a set of alignemnts (marker based). It can also
##' create a partition file that can added at the end of the alignment
##' to specify where each marker starts and ends.
##' @author Francois Michonneau
##' @export
##' @examples
##' \dontrun{
##' concatenateAlignments(pattern="^20121017-112755.+afa$", path="/tmp/seq",
##'                       output="/tmp/seq/20121017.impatiens.fas",
##'                       partition="/tmp/seq/20121017.impatiens.part",
##'                       partition.format="nexus", colsep="", colw=10000)
##' }
concatenateAlignments <- function(pattern, path, output,
                                  input.format = "fasta",
                                  partition = NULL,
                                  partition.format = c("raxml", "nexus"),
                                  create.conc = TRUE,
                                  standardize=FALSE,
                                  drop = NULL, ...) {

    partition.format <- match.arg(partition.format)
    lAlg <- list.files(path = path, pattern = pattern)
    if (length(lAlg) < 2) {
        stop("only 1 file")
    }
    else {
        firstF <- lAlg[1]
        sizeAlg <- numeric(length(lAlg))
        firstAlg <- read.dna(file = file.path(path, lAlg[1]), format = input.format, as.character=TRUE)
        sizeAlg[1] <- dim(firstAlg)[2]
        if (standardize) {
            seqNm <- vector("list", length(lAlg))
            seqNm[[1]] <- dimnames(firstAlg)[[1]]
            for (i in 2:length(lAlg)) {
                tmpAlg <- read.dna(file = file.path(path, lAlg[i]), format = input.format)
                seqNm[[i]] <- dimnames(tmpAlg)[[1]]
            }
            seqNm <- unique(unlist(seqNm))
            missingInFirst <- setdiff(seqNm, dimnames(firstAlg)[[1]])
            if (length(missingInFirst)) {
                for (j in 1:length(missingInFirst)) {
                    firstAlg <- rbind(firstAlg, rep("-", dim(firstAlg)[2]))
                    dimnames(firstAlg)[[1]][length(dimnames(firstAlg)[[1]])] <- missingInFirst[j]
                }
            }
        }
        for (i in 2:length(lAlg)) {
            tmpAlg <- read.dna(file = file.path(path, lAlg[i]), format = input.format, as.character=TRUE)
            if (standardize) {
                missingNm <- setdiff(seqNm, dimnames(tmpAlg)[[1]])
                if (length(missingNm)) {
                    for (j in 1:length(missingNm)) {
                        tmpAlg <- rbind(tmpAlg, rep("-", dim(tmpAlg)[2]))
                        dimnames(tmpAlg)[[1]][length(dimnames(tmpAlg)[[1]])] <- missingNm[j]
                    }
                }
            }
            tmpAlg <- tmpAlg[match(dimnames(firstAlg)[[1]], dimnames(tmpAlg)[[1]]), ]
            firstAlg <- cbind(firstAlg, tmpAlg)
            sizeAlg[i] <- dim(tmpAlg)[2]
        }
        if(create.conc) {
            if (!is.null(drop)) {
                toRm <- match(drop, dimnames(firstAlg)[[1]])
                if (any(is.na(toRm))) {
                    stop("Some of the sequences specified in drop were not found.")
                }
                firstAlg <- firstAlg[-toRm, ]
            }
            write.dna(firstAlg, file = output, ...)
        }
        if (!is.null(partition)) {
            cat(character(0), file = partition, append=FALSE)
            locNm <- gsub("(.+)-(.+)-(.+)\\.(.+)", "\\3", lAlg)
            first <- 1
            if (partition.format == "nexus") {
                cat("BEGIN ASSUMPTIONS;\n", file = partition, append = TRUE)
            }
            for (i in 1:length(locNm)) {
                last <- first+sizeAlg[i]-1
                if (partition.format == "raxml") {
                    cat(paste("DNA,",locNm[i], "=", first, "-", last, "\n", sep = ""), file = partition, append = TRUE)
                }
                else if (partition.format == "nexus") {
                    cat(paste("    charset ", locNm[i], " = ", first, "-", last, ";\n", sep = ""), file = partition, append = TRUE)
                }
                first <- last+1
            }
            if (partition.format == "nexus") {
                cat("END;\n", file = partition, append = TRUE)
            }
        }
        TRUE
    }
}
fmichonneau/chopper documentation built on May 16, 2019, 1:43 p.m.