R/pairwiseKaKs.R

Defines functions renderCodemlInputs computeKsPipeline putGenePairIntoRedis genePairInRedis

Documented in computeKsPipeline genePairInRedis putGenePairIntoRedis renderCodemlInputs

#' Checks wether in the current Redis server there is an entry for gene pair
#' 'gene.pair'
#'
#' @param Character vector of length two holding members A and B of the gene
#' pair.
#'
#' @return Boolean. TRUE of and only if the alphabetically sorted gene
#' accessions 'A_B' appear in the Redis store.
#' @export
genePairInRedis <- function(gene.pair) {
    g.pair <- paste(sort(gene.pair), collapse = "_")
    !is.null(redisGet(g.pair))
}

#' Puts the integer value 1 into the Redis Server using the alphabetically
#' sorted gene accessions in 'gene.pair' as key.
#'
#' @param Character vector of length two holding members A and B of the gene
#' pair.
#'
#' @return The output of calling rredis::redisSet(...)
#' @export
putGenePairIntoRedis <- function(gene.pair) {
    g.pair <- paste(sort(gene.pair), collapse = "_")
    redisSet(g.pair, 1)
}

#' Runs the pipeline to compute the Ks values of gene pair 'x' with coding
#' sequences held in 'cds'. Temporary files are stored in 't.d'. Disclaimer:
#' This function will only work on *nix Operating Systems with 'codeml' and
#' 'tail' installed.
#'
#' @param gene.a A string holding the gene identifier as present in argument
#' \code{cds}.
#' @param gene.b A string holding the gene identifier as present in argument
#' \code{cds}.
#' @param cds an instance of \code{base::list} as generated by
#' \code{seqinr::read.fasta} holding the unaligned coding sequences of the
#' genes in 'x'. Default is \code{GeneFamilies::all.cds}.
#' @param t.d the directory to store the files in, default is tempdir()
#' appended with \code{paste( gene,a, gene.b, collapse='_' )}
#' @param codeml.call the call passed to system(...) in order to start codeml.
#' Default is 'codeml', use option 'GeneFamilies.codeml.call' to set another
#' default.
#'
#' @return A list of three named numeric values: 'Ka', 'Ks', and 'w'. The first
#' two are parsed from the codeml output file and the latter is computed as
#' \code{Ka/Ks}.
#' @references Yang, Z. and Nielsen, R. (2000) Mol. Biol. Evol., 17, 32-43.
#' @export
computeKsPipeline <- function(gene.a, gene.b, cds = all.cds, t.d = file.path(tempdir(), 
    paste(gene.a, gene.b, sep = "_")), codeml.call = getOption("GeneFamilies.codeml.call", 
    "codeml")) {
    if (!file.exists(t.d)) 
        dir.create(t.d, recursive = TRUE)
    x <- c(gene.a, gene.b)
    p.n <- paste(x, collapse = "_")
    tryCatch({
        genes.msa <- alignCodingSequencesPipeline(cds[c(gene.a, gene.b)], work.dir = t.d, 
            gene.group.name = p.n)
        # Found at least a single invalid CDS?
        if (length(genes.msa) == 2) {
            pair.cds.msa.path <- file.path(t.d, paste(p.n, "_CDS_MSA.fasta", sep = ""))
            pair.codeml.in.out <- renderCodemlInputs(x, pair.cds.msa.path)
            orig.dir <- getwd()
            setwd(t.d)
            system(paste(codeml.call, basename(pair.codeml.in.out[["in"]])))
            setwd(orig.dir)
            codeml.out.ln <- system(paste("tail -1", pair.codeml.in.out[["out"]]), 
                intern = TRUE)
            genes.Ka <- as.numeric(regmatches(codeml.out.ln, regexec("\\sdN\\s*=\\s*(\\d+\\.\\d+)", 
                codeml.out.ln))[[1]][[2]])
            genes.Ks <- as.numeric(regmatches(codeml.out.ln, regexec("\\sdS\\s*=\\s*(\\d+\\.\\d+)", 
                codeml.out.ln))[[1]][[2]])
            list(Ka = genes.Ka, Ks = genes.Ks, w = (genes.Ka/genes.Ks))
        } else NA
    }, error = function(e) {
        message("ERROR: Computing Ks value of gene pair '", p.n, "' caused an error:\n", 
            e)
        NA
    })
}


#' Generates the tree and control file to run PAML's codeml in order to compute
#' the Ks of 'gene.pair'
#'
#' @param gene.pair a character of length 2 holding the identifiers of the
#' pair's genes.
#' @param cds.msa.path the file path to the gene pair's codon sequences
#' alignment
#' @param ks.out.path the file path to the gene pair's codeml output file,
#' defaut is generated from 'cds.msa.path' substituting '_CDS_MSA.fasta' with
#' '_codeml_out.txt'
#' @param cds.codeml.path the file path to the gene pair's codeml input file,
#' defaut is generated from 'cds.msa.path' substituting '_CDS_MSA.fasta' with
#' '_codeml_in.cnt'
#' @param tree.path the file path to the gene pair's tree file, defaut is
#' generated from 'cds.msa.path' substituting '_CDS_MSA.fasta' with
#' '_tree.newick'. The tree will be the trivial one: '(A,B);'.
#'
#' @return A list with two string entries: 'in' the path to the control file
#' used as input to PAML's codeml, and 'out' path to the output file generated
#' by PAML's codeml.
#' @export
renderCodemlInputs <- function(gene.pair, cds.msa.path, ks.out.path = sub("_CDS_MSA.fasta", 
    "_codeml_out.txt", cds.msa.path, fixed = TRUE), cds.codeml.path = sub("_CDS_MSA.fasta", 
    "_codeml_in.cnt", cds.msa.path, fixed = TRUE), tree.path = sub("_CDS_MSA.fasta", 
    "_tree.newick", cds.msa.path, fixed = TRUE)) {
    writeLines(paste("(", gene.pair[[1]], ",", gene.pair[[2]], ");", sep = ""), con = tree.path)
    brew(text = codeml.tmpl, output = cds.codeml.path)
    list(`in` = cds.codeml.path, out = ks.out.path)
}

#' Template for brew to generate the control file used as input for PAML's
#' codeml.
#' @export
codeml.tmpl <- "seqfile = <%= cds.msa.path %>\n    treefile = <%= tree.path %>\n     outfile = <%= ks.out.path %>\n       noisy = 0\n     verbose = 0\n     runmode = -2\n   cleandata = 1\n     seqtype = 1\n   CodonFreq = 2\n       model = 2\n     NSsites = 0\n       icode = 0\n       Mgene = 0\n   fix_kappa = 0\n       kappa = 2\n   fix_omega = 0\n       omega = 1\n   fix_alpha = 1\n       alpha = .0\n      Malpha = 0\n       ncatG = 4\n       clock = 0\n       getSE = 0\nRateAncestor = 0\n      method = 0"
asishallab/GeneFamilies documentation built on May 22, 2023, 11:30 a.m.