R/mercator.R

Defines functions mercator gbk2gff guide_tree match_genes find_breakpoints fsaAlignSegmentDirs alignSegmentDir

Documented in mercator

#' @importFrom ape read.dna cbind.DNAbin dist.dna clustal muscle tcoffee nj bionj fastme.bal write.tree labels.DNAbin
#' @importFrom parallel detectCores mcmapply
NULL

#' Run mercator to build a homology map and create orthologous segments
#' that can be aligned using FSA
#' 
#' @details
#' \code{mercator} will create a hidden directory \sQuote{.mercator}
#' in the parent directory of the submitted sequence files and place
#' all results of the intermediate computational steps in this directory.
#' 
#' The results of a \code{mercator} run are stored in a directory
#' \sQuote{segments} in the parent directory of the submitted sequence files.
#' The \sQuote{segments} directory should contain numbered subdirectories
#' containing orthologous genomic segments in multi fasta format, a file
#' named \sQuote{map}, and a file named \sQuote{genomes}.
#' 
#' \code{mercator} segements can be aligned using the
#' \code{\link{alignSegments}} function, merged into a single multi-fasta
#' alignment file and imported into R as a \code{\linkS4class{DNAStringSet}}
#' using \code{\link{importAlignment}} or imported to R as an
#' \code{\linkS4class{annotatedAlignment}} using \code{\link{annotatedAlignment}}.
#' 
#' @param seq_files Path to sequence files ('fasta' or 'genbank').
#' @param anno_files (Optional) Path to annotation files. If no
#' annotation files are provided an \emph{ab initio} annotation
#' using \code{\link{glimmer3}} is performed.
#' @param anno_type Type of annotation ('glimmer3', 'genbank', 'gff', 
#' 'ptt', or 'ftable').
#' @param glimmeropts Options passed to \code{\link{glimmer3}} if an
#' \emph{ab initio} annotation is performed.
#' @param removeOverlappingCDS Exclude overlapping CDS for orthology mapping.
#' @param mask Softmask sequence before aligning using
#' \code{\link{maskSequence}}.
#' @param wd Working directory. Defaults to the parent directory of
#' the provided sequence files.
#' @seealso \code{\link{alignSegments}}, \code{\link{importAlignment}},
#' \code{\link{annotatedAlignment}}.
#' @export
mercator <- function(seq_files, anno_files = NULL, anno_type = "glimmer3",
                     glimmeropts = list(o=50, g=110, t=30), removeOverlappingCDS = TRUE,
                     mask = TRUE, wd = NULL) {
  annotation <- match.arg(anno_type, c("glimmer3", "genbank", "gbk", "gff", "ptt",
                                       "ftable"))
  ## Check dependencies for BLAT
  has_dependencies(c("sdbList", "gffRemoveOverlaps", "gff2anchors", "anchors2fa",
                     "blat", "blat2hits"))
  ## Check dependencies for mercator
  has_dependencies(c("fa2sdb", "mercator", "sdbAssemble", "phits2constraints",
                     "makeAlignmentInput", "sdbExport", "muscle", "omap2hmap",
                     "makeBreakpointGraph", "makeBreakpointAlignmentInput",
                     "findBreakpoints", "breakMap", "hmap2omap", "omap2coordinates"))
  # if not specified set the working directory to the parent directory of all
  # sequence files
  if (is.null(wd)) {
    wd <- Reduce(function(lhs, rhs) compactNA(rhs[match(lhs, rhs)]),
                 strsplit(dirname(seq_files), .Platform$file.sep))
    wd <- normalizePath(paste(wd, collapse=.Platform$file.sep))
  }
  if (!all(is_fasta(seq_files) | is_gbk(seq_files))) {
    stop("Sequences must be provided in FASTA or GenBank format")
  }
  ## Extract FASTA from GenBank files and reassign the gbk files as anno_files
  ## if annotation = 'genbank' and anno_files = NULL
  gbk_files <- which(is_gbk(seq_files))
  if (length(gbk_files) > 0) {
    if ((annotation == 'genbank' || annotation == 'gbk') && is.null(anno_files)) {
      anno_files <- seq_files[gbk_files]
    }
    outdirs <- dirname(seq_files)
    seq_files[gbk_files] <- gbk2fna(seq_files[gbk_files], outdirs[gbk_files])
  }
  # mask sequence files
  if (mask) {
    seq_files <- maskSequence(normalizePath(seq_files))
  }
  seq_files <- normalizePath(seq_files)
  
  # generate annotation if necessary
  if (is.null(anno_files)) {
    if (annotation != "glimmer3")
      stop("No annotation files provided")
    else
      anno_files <- glimmer3(normalizePath(seq_files))
  }
  anno_files <- normalizePath(anno_files)

  if (length(anno_files) != length(seq_files))
    stop("Unequal number of sequence and annotation files")
  
  if (any(strip_ext(basename(anno_files)) %ni% strip_ext(basename(seq_files))))
    stop("Names of annotation and sequence files must match.")
  
  merc <- file.path(wd, ".mercator")
  if (file.exists(merc)) {
    warning("A '.mercator' directory exists in ", wd, immediate.=TRUE)
    ans <- readline("Overwrite [y/n]? ")
    if (ans == "y") {
      unlink(merc, recursive=TRUE)
    } else {
      return(NULL)
    }
  }
  
  merc_gff <- file.path(merc, "gff")
  merc_fas <- file.path(merc, "fasta")
  merc_sdb <- file.path(merc, "sdb")
  merc_hit <- file.path(merc, "hits")
  for (dir in c(merc_gff, merc_fas, merc_sdb, merc_hit)) {
    create_if_not_exists(dir, type="dir", recursive=TRUE)
  }
  # generate mercator-readable gff files
  gff_files <- gff_for_mercator(anno_files, annotation, merc_gff)
  
  # generate mercator-readable fasta files
  fna_files <- fna_for_mercator(seq_files, merc_fas)
  
  sdb_files <- file.path(merc_sdb, replace_ext(basename(fna_files), "sdb"))
  invisible(mapply(function(x, y) {
    system(paste0("fa2sdb ", x, " < ", y))
  }, x = sdb_files, y = fna_files))
  
  # compare all of the exon sequences pairwise and generate the proper
  # input files for Mercator.
  reciprocal_blat(genomes = strip_ext(basename(fna_files)),
                  merc_sdb = merc_sdb, merc_gff = merc_gff, merc_out = merc_hit,
                  sep = "-", removeOverlappingCDS = removeOverlappingCDS)
  
  # run Mercator on the input files.
  # This generates a directory 'segments' with subdirectories
  # containing the sequences to be aligned for each orthologous segment set     
  # identified by the orthology map.
  segment_dir <- run_mercator(wd)
  
  message("Next use 'alignSegments()' to generate alignments for each of the orthologous segments produced by mercator")
  return(segment_dir)
}


gff_for_mercator <- function (f, type, wd) {
  if (missing(wd)) {
    stop("No working directory provided")
  }
  type <- match.arg(type, c("gff", "ptt", "genbank", "gbk", "ftable","glimmer3"))
  out <- switch(type,
                gff      = gff2gff(f, wd),
                ptt      = ptt2gff(f, wd),
                genbank  = gbk2gff(f, wd),
                gbk      = gbk2gff(f, wd),
                ftable   = ftb2gff(f, wd),
                glimmer3 = glimmer2gff(f, wd))
  return(invisible(out))
}


gff2gff <- function (f, wd) {
  outfiles <- character()
  wd <- if (length(wd) == 1) rep(wd, length(f)) else wd
  for (i in seq_along(f)) {
    l <- readLines(f[i])
    nlines <- -1L
    if (any(l == "##FASTA")) {
      nlines <- which(l == "##FASTA")[1L] - 1L
    }
    rm(l) # free memory
    
    # load data frame
    gff <- scan(f[i], nlines = nlines, sep = "\t", comment.char = "#", 
                quote = "", quiet = TRUE, na.strings = '.',
                what = list(seqid = character(),
                            source = character(),
                            type = character(),
                            start = integer(),
                            end = integer(),
                            score = numeric(),
                            strand = character(),
                            phase = integer(),
                            attributes = character()))
    
    cds_idx <- which(gff$type == "CDS")
    len <- length(cds_idx)
    seqid <- strip_ext(basename(f[i]))
    gff <- data.frame(stringsAsFactors=FALSE,
                      seqid = rep(seqid, len), 
                      source = gff[["source"]][cds_idx],
                      type = gff[["type"]][cds_idx],
                      start = gff[["start"]][cds_idx],
                      end = gff[["end"]][cds_idx],
                      score = rep(".", len),
                      strand = gff[["strand"]][cds_idx],
                      phase = rep("0", len),
                      attributes = gff[["attributes"]][cds_idx])
    outfile <- file.path(wd[i], paste0(seqid, ".gff"))
    write.table(gff, outfile, quote=FALSE, sep="\t", row.names=FALSE, 
                col.names=FALSE)
    outfiles <- c(outfiles, outfile)
    rm(gff) ## free memory
  }
  
  return(invisible(outfiles))
}


ptt2gff <- function (f, wd) {
  outfiles <- character()
  wd <- if (length(wd) == 1) rep(wd, length(f)) else wd
  for (i in seq_along(f)) {
    skip <- sum(count.fields(f[i], sep="\t") < 9)
    ptt <- scan(f[i], skip = skip + 1, quote="",
                quiet=TRUE, sep="\t",
                what = list(Location = character(),
                            Strand = character(),
                            Length = character(),
                            PID = character(),
                            Gene = character(),
                            Synonym = character(),
                            Code = character(),
                            COG = character(),
                            Product = character()))
    
    seqid <- strip_ext(basename(f[i]))
    loc <- strsplit(ptt[["Location"]], "..", fixed=TRUE)
    len <- length(loc)
    gff <- data.frame(stringsAsFactors=FALSE,
                      seqid = rep(seqid, len), 
                      source = rep("ptt", len),
                      type = rep("CDS", len),
                      start = vapply(loc, "[", 1L, FUN.VALUE=character(1)),
                      end = vapply(loc, "[", 2L, FUN.VALUE=character(1)),
                      score = rep(".", len),
                      strand = ptt[["Strand"]],
                      phase = rep("0", len),
                      attributes = paste0("ID=cds", seq.int(0, len - 1),
                                          ";product=", ptt[["Product"]]))
    outfile <- file.path(wd[i], paste0(seqid, ".gff"))
    write.table(gff, outfile, quote=FALSE, sep="\t", row.names=FALSE, 
                col.names=FALSE)
    outfiles <- c(outfiles, outfile)
    rm(ptt,gff) ## free memory
  }
  
  return(invisible(outfiles))
}


ftb2gff <- function (f, wd) {
  outfiles <- character()
  wd <- if (length(wd) == 1) rep(wd, length(f)) else wd
  for (i in seq_along(f)) {
    l <- readLines(f[i], n=1)
    seqid <- strsplitN(l, split="\\s+", 2)
    m <- regexpr("[A-Za-z]{2}([A-Za-z_])?\\d+(\\.\\d)?", seqid)
    seqid <- strip_ext(regmatches(seqid, m))
    
    ft <- scan(f[i], sep="\t", comment.char=">", quiet=TRUE,
               quote="", fill=TRUE,
               what=list(start = character(),
                         end = character(),
                         key = character(),
                         qualifier = character(),
                         value = character()))
    
    pos_idx <- which(nzchar(ft[["start"]]))
    gene_idx <- pos_idx[ft[["key"]][pos_idx] == "CDS"]
    start <- as.integer(gsub('<|>', '', ft[["start"]][gene_idx]))
    end <- as.integer(gsub('<|>', '', ft[["end"]][gene_idx]))
    strand <- ifelse(start > end, '-', '+')
    tmp_start <- ifelse(strand == '+', start, end)
    end <- ifelse(strand == '+', end, start)
    start <- tmp_start
    
    len <- length(start)
    gff <- data.frame(stringsAsFactors=FALSE,
                      seqid = rep(seqid, len), 
                      source = rep("ftb", len),
                      type = rep("CDS", len),
                      start = start,
                      end = end,
                      score = rep(".", len),
                      strand = strand,
                      phase = rep("0", len),
                      attributes = paste0("ID=cds", seq.int(0, len - 1)))
    outfile <- file.path(wd[i], paste0(seqid, ".gff"))
    write.table(gff, outfile, quote=FALSE, sep="\t", row.names=FALSE, 
                col.names=FALSE)
    outfiles <- c(outfiles, outfile)
    rm(ftb,gff) ## free memory
  }
   
  return(invisible(outfiles))
}


gbk2gff <- function(f, wd) {
  outfiles <- character(0)
  wd <- if (length(wd) == 1) rep(wd, length(f)) else wd
  for (i in seq_along(f)) {
    gbk <- readLines(f[i])
    seqid <- strsplit(gbk[1], split = "\\s+")[[1]][2]
    outfile <- file.path(wd[i], paste0(seqid, ".gff"))
    features_start <- which(substr(gbk, 1, 8) == "FEATURES") + 1  
    features_end <- which(grepl("ORIGIN|CONTIG", substr(gbk, 1, 6))) - 1 %||% length(gbk) - 1
    features <- gbk[features_start:features_end]
    cds_idx <- which(substr(features, 6, 8) == "CDS")
    loc <- trim(strsplitN(features[cds_idx], "CDS", 2))
    ## discard joins
    loc <- grep("join", loc, value=TRUE, invert=TRUE)
    strand <- ifelse(grepl("complement", loc), '-', '+')
    loc <- strsplit(regmatches(loc, regexpr('<?\\d+\\.\\.>?\\d+', loc)),
                    split="..", fixed=TRUE)
    start <- gsub('<|>', '', vapply(loc, `[`, 1, FUN.VALUE=character(1)))
    end <- gsub('<|>', '', vapply(loc, `[`, 2, FUN.VALUE=character(1)))
    len <- length(loc)
    gff <- data.frame(stringsAsFactors=FALSE,
                      seqid = rep(seqid, len), 
                      source = rep("genbank", len),
                      type = rep("CDS", len),
                      start = start,
                      end = end,
                      score = rep(".", len),
                      strand = strand,
                      phase = rep("0", len),
                      attributes = paste0("ID=cds", seq.int(0, len - 1)))
    outfile <- file.path(wd[i], paste0(seqid, ".gff"))
    write.table(gff, outfile, quote=FALSE, sep="\t", row.names=FALSE, 
                col.names=FALSE)
    outfiles <- c(outfiles, outfile)
    rm(gbk, gff) ## free memory
  }
  return(invisible(outfiles))
}


glimmer2gff <- function (f, wd) {
  outfiles <- character()
  wd <- if (length(wd) == 1) rep(wd, length(f)) else wd
  for (i in seq_along(f)) {
    glim <- scan(f[i], comment.char = ">", quote="",
                quiet=TRUE, sep="",
                what = list(id = character(),
                            start = integer(),
                            end = integer(),
                            frame = character(),
                            score = numeric()))
    seqid <- strip_ext(basename(f[i]))
    len <- length(glim[["id"]])
    dir <- glim[["end"]] > glim[["start"]]
    start <- ifelse(dir, glim[["start"]], glim[["end"]])
    end <- ifelse(dir, glim[["end"]], glim[["start"]])
    strand <- ifelse(dir, "+", "-")
    
    max_gene_len <- ceiling((max(end) - min(start))*0.9)
    idx <- end - start > max_gene_len
    gff <- data.frame(stringsAsFactors=FALSE,
                      seqid = rep(seqid, len)[!idx], 
                      source = rep("glimmer", len)[!idx],
                      type = rep("CDS", len)[!idx],
                      start = start[!idx],
                      end = end[!idx],
                      score = rep(".", len)[!idx],
                      strand = strand[!idx],
                      phase = rep("0", len)[!idx],
                      attributes = paste0("ID=", glim[["id"]],
                                          ";frame=", glim[["frame"]],
                                          ";score=", glim[["score"]])[!idx])
    
    outfile <- file.path(wd[i], paste0(seqid, ".gff"))
    write.table(gff, outfile, quote=FALSE, sep="\t", row.names=FALSE, 
                col.names=FALSE)
    outfiles <- c(outfiles, outfile)
  }
  
  return(invisible(outfiles))
}


fna_for_mercator <- function (f, wd) {
  if (missing(wd)) {
    stop("No working directory provided")
  }
  type <- if (all(is_fasta(f))) {
    "fna"
  } else if (all(is_gbk(f))) {
    "gbk"
  } else {
    "other"
  }
  out <- switch(type,
                fna=fna2fna(f, wd),
                gbk=gbk2fna(f, wd),
                other=stop("Sequence files must be either in FASTA or GBK format"))
  
  return(invisible(out))
}


fna2fna <- function (f, wd) {
  outfiles <- character()
  wd <- if (length(wd) == 1) rep(wd, length(f)) else wd
  for (i in seq_along(f)) {
    fna <- readLines(f[i])
    seqid <- strip_ext(basename(f[i]))
    outfile <- file.path(wd[i], paste0(seqid, ".fna"))
    outcon <- file(outfile, open="w")
    fna[1] <- sprintf(">%s", seqid)
    writeLines(fna, outcon)
    close(outcon)
    outfiles <- c(outfiles, outfile)
  }
  return(invisible(outfiles))
}


gbk2fna <- function (f, wd) {
  outfiles <- character()
  wd <- if (length(wd) == 1) rep(wd, length(f)) else wd
  for (i in seq_along(f)) {
    gbk <- readLines(f[i])
    seqid <- strsplit(gbk[1], split = "\\s+")[[1]][2]
    header <- sprintf(">%s", seqid)
    outfile <- file.path(wd[i], paste0(seqid, ".fna"))
    outcon <- file(outfile, open="w")
    writeLines(header, outcon)
    ori_start <- which(substr(gbk, 1, 6) == "ORIGIN") + 1
    ori_end <- which(substr(gbk, 1, 2) == "//") - 1
    gbk <- gbk[seq.int(ori_start, ori_end)]
    fna <- vapply(gbk, function(x) {
      toupper(paste0(substr(x, 11, 20), substr(x, 22, 31),
                     substr(x, 33, 42), substr(x, 44, 53),
                     substr(x, 55, 64), substr(x, 66, 75),
                     collapse = ""))
    }, character(1), USE.NAMES = FALSE)
    writeLines(fna, outcon)
    close(outcon)
    outfiles <- c(outfiles, outfile)
  }
  return(invisible(outfiles))
}


run_mercator <- function (wd) {
  merc     <- file.path(wd, ".mercator")
  merc_fna <- file.path(merc, "fasta")
  merc_sdb <- file.path(merc, "sdb")
  merc_hit <- file.path(merc, "hits")
  merc_out <- file.path(merc, "out")
  dir.create(merc_out)
  genomes <- strip_ext(dir(merc_fna))
  system(sprintf("mercator -i %s -o %s %s", 
                 merc_hit, merc_out, paste(genomes, collapse=' ')))
  
  # comparative scaffolding of genome sequences
  sdb_files <- dir(merc_sdb, full.names=TRUE)
  for (sdb in sdb_files) {
    system(sprintf("sdbAssemble %s %s < %s", sdb,
                   file.path(merc_out, basename(sdb)),
                   file.path(merc_out, paste0(strip_ext(basename(sdb)), ".agp"))))
  }
  
  # Run phits2constraints to generate the "constraints" file
  system(sprintf("phits2constraints -i %s -m %s < %s > %s",
                 merc_hit, merc_out,
                 file.path(merc_out, "pairwisehits"),
                 file.path(merc_out, "constraints")))
    
  # Generate a guide tree in Newick format
  guide_tree(wd)
  
  # refine breakpoints
  find_breakpoints(wd)
  
  # generate alignment input for FSA
  map_path <- dir(merc_out, "\\<better\\.map\\>")
  segment_dir <- file.path(wd, "segments")
  if (file.exists(segment_dir)) {
    unlink(segment_dir, recursive=TRUE)
  }
  dir.create(segment_dir)
  system(sprintf("makeAlignmentInput --map=%s %s %s",
                 map_path, merc_out, segment_dir))
  
  return(invisible(segment_dir))
}


guide_tree <- function(wd) {
  merc <- file.path(wd, ".mercator")
  merc_hits <- file.path(merc, "hits")
  merc_out <- file.path(merc, "out")
  # check if all necessary files are present  
  f <- dir(merc_out, full.names=TRUE)
  if (sum(runs_pos <- grepl(pattern="\\<runs\\>", basename(f))) != 1L) {
    stop("'runs' file missing")
  }
  if (!any(grepl(pattern="\\.sdb\\>", basename(f)))) {
    stop("sdb-files missing")
  }
  merc_tree <- file.path(merc_out, "tree")
  create_if_not_exists(merc_tree, type="dir")
  
  runs <- read.table(f[runs_pos], header=FALSE, colClasses="integer")
  runs <- runs[complete.cases(runs),]
  names(runs) <- scan(f[grepl("genomes$", f)], what="character", quiet=TRUE)
  
  if (nrow(runs) == 0) {
    stop("There are no orthologous genes to generate a guide tree")
  }
  if (nrow(runs) > 20) {
    idx <- sample(seq_len(nrow(runs)), 20, replace = FALSE)
    runs <- runs[idx,]
  } 

  match_genes(runs, merc)
  
  tree <- make_tree(merc_tree, "muscle", "K80", "nj")
  ape::write.tree(tree, file=file.path(merc_out, "treefile"))
  
  return(invisible(tree))
}


match_genes <- function(runs, merc) {
  
  merc_out <- file.path(merc, "out")
  merc_hits <- file.path(merc, "hits")
  tree_dir <- file.path(merc_out, "tree")
  genomes <- names(runs)
  
  anchor <- list()
  for (i in seq_along(genomes))
    anchor[[i]] <- read.table(file.path(merc_hits, paste0(genomes[i], ".anchors")),
                              sep="\t")
  
  sdbs <- dir(merc_out, "\\.sdb$", full.names=TRUE)
  for (i in seq_along(genomes)) {
    gene_idx <- match(runs[,i], anchor[[i]][,1])
    seqname <- as.character(anchor[[i]][gene_idx, 2])
    strand <- as.character(anchor[[i]][gene_idx, 3])
    start <- anchor[[i]][gene_idx, 4]
    end <- anchor[[i]][gene_idx, 5]
    fasta <- list()
    for (j in seq_along(gene_idx)) {
      cat(system(sprintf("sdbExport -r --name=%s %s %s %s %s %s",
                         paste0(genomes[i], ":", runs[j,i]),
                         sdbs[i], seqname[j], start[j], end[j],
                         strand[j]),
                 intern=TRUE),
          file=file.path(tree_dir, paste0("orf", j, ".fa")),
          sep="\n", append=TRUE)
    }
  }
  
  return(invisible(TRUE))
}


#' [INTERNAL] conctruct NJ or BIONJ tree from one or more multi-fasta files
#' 
#' @param fasta_dir Directory containing (a) multi-FASTA file(s).
#' @param align One of 'clustal', 'muscle', or 'tcoffee'.
#' @param dist.model See \code{\link[ape]{dist.dna}}
#' @param tree One of 'nj', 'bionj', or 'fastme'.
#' 
#' @keywords internal
make_tree <- function (fna_dir, align="muscle", dist.model="K80", tree="bionj") {
  
  fna <- dir(fna_dir, "\\.fa$", full.names=TRUE)
  align.fun <- match.fun(match.arg(align, c('muscle', 'clustal', 'tcoffee')))
  tree.fun <- match.fun(match.arg(tree, c('bionj', 'nj', 'fastme.bal')))
  
  alignment <- list()
  for (i in seq_along(fna)) {
    dna <- read.dna(fna[i], format="fasta")
    alignment[[i]] <- align.fun(dna)
    cat(sprintf("Aligning %s ...\n", basename(fna[i])))
  }
  
  o <- NULL
  for (aln in alignment) {
    o <- c(o, list(order(labels.DNAbin(aln))))
  }
  for (i in seq_along(alignment)) {
    alignment[[i]] <- alignment[[i]][o[[i]],]
  }
  
  concat_aln <- do.call("cbind.DNAbin", c(alignment, check.names=FALSE))
  rownames(concat_aln) <- 
    sapply(strmatch(pattern="[^>].[^:]+", rownames(concat_aln), capture=FALSE),
           "[", 1)

  dist <- dist.dna(concat_aln, model=dist.model)
  tree <- tree.fun(dist)
  return(tree)
}

find_breakpoints <- function(wd) {
  
  merc <- file.path(wd, ".mercator")
  merc_out <- file.path(merc, "out")
  merc_break <- file.path(merc_out, "breakpoints")
  dir.create(merc_break)
  
  f <- dir(merc_out, full.names=TRUE)
  if (sum(map_pos <- grepl(pattern="\\<pre\\.map\\>", basename(f))) != 1L)
    stop("'pre.map' is missing")
  
  if (sum(tree_pos <- grepl(pattern="\\<treefile\\>", basename(f))) != 1L)
    stop("'treefile' is missing")
  
  pwd <- getwd()
  setwd(merc_out)
  on.exit(setwd(pwd))
  system("omap2hmap genomes < pre.map > pre.h.map")
  system("makeBreakpointGraph --remove-colinear pre.h.map treefile")
  system("makeBreakpointAlignmentInput --out-dir=breakpoints")
  fsaAlignSegmentDirs(initdir="breakpoints", seqfile="seqs.fasta",
                      outfile="fsa.mfa", fsa.opts=list(logfile="fsa.log"),
                      ncores=detectCores() - 1)
  system("findBreakpoints -r 200 --alignmentFile=fsa.mfa pre.h.map treefile edges breakpoints > myBreakpoints")
  system("breakMap myBreakpoints < pre.h.map > better.h.map")
  system("hmap2omap genomes < better.h.map > better.map")
  system("omap2coordinates < better.map > coordinates")
  
  return(invisible(TRUE))
}


fsaAlignSegmentDirs <- function(initdir = ".", seqfile = "seqs.fasta",
                                outfile = "fsa.mfa", constraints = "cons",
                                skip.completed = TRUE, fsa.opts = list(),
                                ncores = detectCores()) {
  assert_that(has_command('fsa')) 
  segments <- normalizePath(dir(initdir, "^\\d+$", full.names=TRUE))
  segments <- segments[order(as.numeric(split_path(segments)))]
  segdirs <- mcmapply(alignSegmentDir, segment = segments,
                      MoreArgs = list(seqfile = seqfile, outfile = outfile,
                                      constraints = constraints, skip.completed = skip.completed,
                                      fsa.opts = fsa.opts),
                      mc.cores=ncores, USE.NAMES=FALSE)
  
  return(invisible(segdirs))
}


alignSegmentDir <- function(segment, seqfile, constraints, outfile = "fsa.mfa",
                            skip.completed = TRUE, fsa.opts = list()) {
  
  if (skip.completed && file.exists(file.path(segment, outfile)) && 
      file.info(file.path(segment, outfile))$size > 0) {
    return(NULL)
  }
  
  # check for seqfile
  if (!file.exists(file.path(segment, seqfile))) {
    stop("Directory ", sQuote(basename(segment)), " does not have the required seqfile ",
         sQuote(seqFile), .call=FALSE)
  }
  
  # check for optional constraints file
  if (!file.exists(file.path(segment, constraints))) {
    warning("Directory ", sQuote(basename(segment)), " does not have constraints",
            call.=FALSE, immediate.=TRUE)
  } else {
    fsa.opts <- merge_list(fsa.opts, list(mercator=constraints))
  }
  
  cwd <- getwd()
  setwd(segment)
  fsa(seqfile, outfile, fsa.opts)
  setwd(cwd)
}


#' Sequence alignment with fsa
#' 
#' @param seqfile Sequence files in fasta format.
#' @param outfile Multifasta alignment file.
#' @param opts a named list of options for fsa.
#' @param ... Named values interpreted as options for fsa.
#' 
#' @export
fsa <- function (seqfile, outfile = "fsa.mfa", opts = list(), ...) {
  
  if (missing(seqfile)) {
    system("fsa --help")
    return(invisible(NULL))
  }
  
  if (!all(file.exists(seqfile)))
    stop("Can not open input files")
  
  if (length(seqfile) > 1)
    infiles <- paste(infiles, collapse=" ")
  
  args <- merge_list(opts, list(...))
  SysCall("fsa", args = args, stdin = seqfile, stdout = outfile,
          redirection = FALSE, style = "gnu")
}
gschofl/genoslideR documentation built on May 17, 2019, 8:52 a.m.