GeneFamilies: Analyses different properties of gene families

Documented in addGeneIdsCols2HomologsKaKsTable addGeneIdsCols2OrthologsKaKsTable axt axtForSelectedDistances foregroundNodes getPairsFromDistForIndices inferExpectedFalsePositiveRate KaKsStatistics mrBayesCdsTree mrBayesProteinTree mrBayesProteinTreeFromMixedMSA mrBayesTreeFromMixedMSA parseMemeBranchResults readFubarTable readKaKsCalculatorOutput removeDuplicatedInversePairs removeNodeLabelsAndBranchLengths replaceWithOriginalInFigTree testGetPairsFromDistForIndices tree4Busted writeAxt

#' Brew template to generate a family's BranchREL input file
#' @export
hyphy.branch.site.bf <- "inputRedirect = {};\ninputRedirect[\"01\"]=\"Universal\";\ninputRedirect[\"02\"]=\"Yes\";\ninputRedirect[\"03\"]=\"Yes\";\ninputRedirect[\"04\"]=\"<%= fam.cds.msa.path %>\";\ninputRedirect[\"05\"]=\"<%= fam.tree.4.paml.path %>\";\ninputRedirect[\"06\"]=\"<%= fam.hyphy.branch.site.output.path %>\";\n\nExecuteAFile (\"<%= hyphy.batch.files.dir %>/BranchSiteREL.bf\", inputRedirect);"

#' Brew template to generate a family's MEME input file
#' @export
hyphy.meme.bf <- "inputRedirect = {};\ninputRedirect[\"01\"]=\"Universal\";\ninputRedirect[\"02\"]=\"New Analysis\";\ninputRedirect[\"03\"]=\"<%= fam.cds.msa.path %>\";\ninputRedirect[\"04\"]=\"Custom\";\ninputRedirect[\"05\"]=\"110240\";\ninputRedirect[\"06\"]=\"<%= fam.tree.no.node.labels.path %>\";\ninputRedirect[\"07\"]=\"<%= fam.hyphy.meme.log.path %>\";\ninputRedirect[\"08\"]=\"Estimate dN/dS only\";\ninputRedirect[\"09\"]=\"MEME\";\ninputRedirect[\"10\"]=\"0.1\";\ninputRedirect[\"11\"]=\"N\";\ninputRedirect[\"12\"]=\"<%= fam.hyphy.meme.output.path %>\";\n\nExecuteAFile (\"<%= hyphy.batch.files.dir %>/QuickSelectionDetection.bf\", inputRedirect);"

#' Brew template to generate a family's FUBAR input file
#' @export
hyphy.fubar.bf <- "inputRedirect = {};\ninputRedirect[\"01\"]=\"Universal\";\ninputRedirect[\"02\"]=\"1\";\ninputRedirect[\"03\"]=\"<%= fam.cds.msa.path %>\";\ninputRedirect[\"04\"]=\"<%= fam.tree.no.node.lables.path %>\";\ninputRedirect[\"05\"]=\"20\";\ninputRedirect[\"06\"]=\"5\";\ninputRedirect[\"07\"]=\"2000000\";\ninputRedirect[\"08\"]=\"1000000\";\ninputRedirect[\"09\"]=\"100\";\ninputRedirect[\"10\"]=\"0.5\";\ninputRedirect[\"11\"]=\"<%= fam.hyphy.fubar.output.path %>\";\n\nExecuteAFile (\"<%= hyphy.batch.files.dir %>/FUBAR.bf\", inputRedirect);"

#' Brew template to generate a family's BUSTED input file
#' @export
hyphy.busted.bf <- "inputRedirect = {};\ninputRedirect[\"01\"]=\"Universal\";\ninputRedirect[\"02\"]=\"<%= fam.cds.msa.nexus.path %>\";\ninputRedirect[\"03\"]=\"<%= fam.hyphy.busted.tree.path %>\";\ninputRedirect[\"04\"]=\"Set TEST\";\ninputRedirect[\"05\"]=\"\";\n\nExecuteAFile (\"<%= hyphy.batch.files.dir %>/BUSTED.bf\", inputRedirect);"

#' Brew template to generate a protein group's Mr Bayes input file
#' @export
mr.bayes.prot.tree <- "\nbegin mrbayes;\nlog start filename=<%= mr.bayes.log.file %> replace;\nprset aamodelpr=fixed(wag);\nlset rates=invgamma Ngammacat=4;\nset autoclose=yes;\nmcmc ngen=20000 printfreq=500 samplefreq=10\nnchains=<%= mr.bayes.n.chains %> savebrlens=yes starttree=random\nfilename=<%= mr.bayes.out.file %>;\nsumt filename=<%= mr.bayes.out.file %> burnin=1000 showtreeprobs=yes\ncontype=allcompat Conformat=Simple;\nquit;\nend;\n"

#' Brew template to generate a protein group's Mr Bayes input file using
#' additional e.g. morphological knowledge
#' @export
mr.bayes.mixed.prot.tree <- "\nbegin mrbayes;\n<% for (x in names(mr.bayes.char.sets)) { %>\n<% y <- mr.bayes.char.sets[[x]] %>\ncharset <%= x %> = <%= y[[1]] %> - <%= y[[2]] %>;\n<% } %>\npartition favored = <%= length(mr.bayes.char.sets) %>: <%= paste( names( mr.bayes.char.sets ), collapse=\", \" ) %>;\nset partition = favored;\n\nlog start filename=<%= mr.bayes.log.file %> replace;\nprset aamodelpr=fixed(wag);\nlset rates=invgamma Ngammacat=4;\nset autoclose=yes;\nmcmc ngen=20000 printfreq=500 samplefreq=10\nnchains=<%= mr.bayes.n.chains %> savebrlens=yes starttree=random\nfilename=<%= mr.bayes.out.file %>;\nsumt filename=<%= mr.bayes.out.file %> burnin=1000 showtreeprobs=yes\ncontype=allcompat Conformat=Simple;\nquit;\nend;\n"

#' Brew template to generate a coding sequences group's Mr Bayes input file
#' using additional e.g. morphological knowledge
#' @export
mr.bayes.mixed.cds.tree <- "\nbegin mrbayes;\n<% for (x in names(mr.bayes.char.sets)) { %>\n<% y <- mr.bayes.char.sets[[x]] %>\ncharset <%= x %> = <%= y[[1]] %> - <%= y[[2]] %>;\n<% } %>\npartition favored = <%= length(mr.bayes.char.sets) %>: <%= paste( names( mr.bayes.char.sets ), collapse=\", \" ) %>;\nset partition = favored;\n\nlog start filename=<%= mr.bayes.log.file %> replace;\nlset nst=6 rates=invgamma Nucmodel=Codon;\nset autoclose=yes;\nmcmc ngen=20000 printfreq=500 samplefreq=10\nnchains=<%= mr.bayes.n.chains %> savebrlens=yes starttree=random\nfilename=<%= mr.bayes.out.file %>;\nsumt filename=<%= mr.bayes.out.file %> burnin=1000 showtreeprobs=yes\ncontype=allcompat Conformat=Simple;\nquit;\nend;\n"

#' Brew template to generate an Mr Bayes input file for a multiple alignment of
#' Coding Sequences (CDS MSA).
#' @export
mr.bayes.cds.tree <- "\nbegin mrbayes;\nlog start filename=<%= mr.bayes.log.file %> replace;\nlset nst=6 rates=invgamma Nucmodel=Codon;\nset autoclose=yes;\nmcmc ngen=20000 printfreq=500 samplefreq=10\nnchains=<%= mr.bayes.n.chains %> savebrlens=yes starttree=random\nfilename=<%= mr.bayes.out.file %>;\nsumt filename=<%= mr.bayes.out.file %> burnin=1000 showtreeprobs=yes\ncontype=allcompat Conformat=Simple;\nquit;\nend;\n"

#' Removes node labels and branch lengths from instance of ape::phylo
#'
#' @param phyl.tree instance of class ape::phylo
#'
#' @return A copy of 'phyl.tree' without branch length and without node labels
#' @export
removeNodeLabelsAndBranchLengths <- function(phyl.tree) {
    phyl.tree$node.label <- NULL
    phyl.tree$edge.length <- NULL
    phyl.tree
}

#' Annotates a phylogenetic tree in such amanner that foreground branches are
#' recognized by HYPHY's BUSTED analysis.
#'
#' @param phylo.tree an instance of class ape::phylo
#' @param foreground.nodes integer vector holding the nodes that are selected
#' as foreground
#'
#' @export
#' @return A BUSTED annotated copy of 'phylo.tree'
tree4Busted <- function(phylo.tree, foreground.nodes) {
    if (!is.null(foreground.nodes) && length(foreground.nodes) > 
        0) {
        #' Mark foreground tips / leaves:
        fg.tips <- intersect(1:length(phylo.tree$tip.label), 
            foreground.nodes)
        if (!is.null(fg.tips) && length(fg.tips) > 0) 
            phylo.tree$tip.label[fg.tips] <- paste(phylo.tree$tip.label[fg.tips], 
                "{TEST}", sep = "")
        #' Mark foreground inner nodes:
        fg.in.nds <- which(1:phylo.tree$Nnode %in% (foreground.nodes - 
            length(phylo.tree$tip.label)))
        phylo.tree$node.label <- character(length(phylo.tree$node.label))
        if (!is.null(fg.in.nds) && length(fg.in.nds) > 0) 
            phylo.tree$node.label[fg.in.nds] <- "{TEST}"
    }
    phylo.tree
}

#' Uses KaKs_Calculator for selected pairs of aligned coding sequences to infer
#' Ka and Ks values among other measures. The pairs that are selected are
#' individuated by \code{GeneFamilies::axtForSelectedDistances}.
#'
#' @param cds.msa The result of invoking \code{seqinr::read.alignment}
#' representing the multiple alignment of coding sequences.
#' @param gene.group.name A string representing the name of the gene group for
#' which the argument \code{cds.msa} was generated.
#' @param background.genes A character vector or \code{NULL} specifying a
#' subset of the genes in argument \code{cds.msa}. If such a background is
#' specified sequence based distances are only considered for pairs of genes
#' where one member is taken from the background and the other from the
#' non-background (foreground) subsets. Default is \code{NULL}.
#' @param w.dir The directory in which to store input and output files with
#' which the KaKs_Calculator is invoked. Default is \code{tempdir()}.
#' @param ka.ks.calculator.path The file path under which to find the
#' KaKs_Calculator executable. Default is
#' \code{getOption('GeneFamilies.ka.ks.calculator.path',
#' file.path(path.package('GeneFamilies'), 'KaKs_Calculator'))}.
#' @param ka.ks.calculator.model A string used as a command line argument for
#' the \code{-m} switch of KaKs_Calculator. This argument defines the evolution
#' model to be used. Default is
#' \code{getOption('GeneFamilies.ka.ks.calculator.model', 'GY')}.
#' @param ... additional arguments passed to \code{base::read.table} used to
#' read in the table written out by KaKs_Calculator.
#'
#' @export
#' @return An instance of \code{base::data.frame} the read in tabular result of
#' invoking KaKs_Calculator using the \code{system} function. There is a
#' strange behavior of KaKs_Calculator, sometimes it write out a header line to
#' its result table, when e.g. using model \code{GY}, and sometimes it does
#' not, when using the newer models like \code{GMYN}. The result read in here
#' is assuming NO header line. To get your desired results look for the
#' appropriate row-name in the first column. To pass argument \code{header =
#' TRUE} to \code{base::read.table} use the \code{...} argument.
KaKsStatistics <- function(cds.msa, gene.group.name, background.genes = NULL, 
    w.dir = tempdir(), ka.ks.calculator.path = getOption("GeneFamilies.ka.ks.calculator.path", 
        file.path(path.package("GeneFamilies"), "KaKs_Calculator")), 
    ka.ks.calculator.model = getOption("GeneFamilies.ka.ks.calculator.model", 
        "GY"), ...) {
    gene.group.axt <- file.path(w.dir, paste0(gene.group.name, 
        ".axt"))
    writeAxt(axtForSelectedDistances(cds.msa, background.genes), 
        gene.group.axt)
    gene.group.ka.ks.calc.res <- file.path(w.dir, paste0(gene.group.name, 
        "_KaKs.txt"))
    system(paste(ka.ks.calculator.path, "-i", gene.group.axt, 
        "-o", gene.group.ka.ks.calc.res, "-m", ka.ks.calculator.model))
    read.table(gene.group.ka.ks.calc.res, sep = "\t", comment.char = "", 
        quote = "", na.strings = "NA", stringsAsFactors = FALSE, ...)
}

#' Extracts pairs of genes from a CDS multiple sequence alignment. The ones
#' extracted are identified by computing all pairwise sequence based distances
#' from the multiple sequence alignment. Then those pairs representing the
#' argument statistics, e.g. 'min', 'mean', 'median', and 'max' are used to
#' generate the axt file for subsequent analysis with KaKs_Calculator
#' (https://sourceforge.net/projects/kakscalculator2/).
#'
#' @param cds.msa The result of invoking \code{seqinr::read.alignment}
#' representing the multiple alignment of coding sequences.
#' @param background.genes A character vector or \code{NULL} specifying a
#' subset of the genes in argument \code{cds.msa}. If such a background is
#' specified sequence based distances are only considered for pairs of genes
#' where one member is taken from the background and the other from the
#' non-background (foreground) subsets. Default is \code{NULL}.
#' @param dist.stats A character vector holding the names of the statistics
#' functions that are to be matched with the respective pairwise sequence based
#' distances. Default is \code{getOption('GeneFamilies.cds.msa.dist.stats',
#' c('min', 'median', 'mean', 'max'))}.
#'
#' @return A named list. Names are those of argument \code{dist.stats}. These
#' names can be concatonated with the letter '&' if a pair of genes fulfills
#' the criterion of various statistics. Values are the gene pairs themselfes.
#' Returns \code{NA} if and only if no numeric finite non-NA sequence based
#' pairwise distances could be obtained from the multiple alignment.
#' @export
axtForSelectedDistances <- function(cds.msa, background.genes = NULL, 
    dist.stats = getOption("GeneFamilies.cds.msa.dist.stats", 
        c("min", "median", "mean", "max"))) {
    dist.msa <- dist.alignment(cds.msa)
    num.dists <- if (length(background.genes) == 0 || all(is.na(background.genes))) {
        as.numeric(dist.msa)
    } else {
        all.pairs <- getPairsFromDistForIndices(1:length(dist.msa), 
            dist.msa)
        ap.i <- which(all.pairs[, 1] %in% background.genes & 
            !all.pairs[, 2] %in% background.genes | !all.pairs[, 
            1] %in% background.genes & all.pairs[, 2] %in% background.genes)
        as.numeric(dist.msa[ap.i])
    }
    num.dists.valid <- num.dists[which(!is.na(num.dists) & is.finite(num.dists))]
    dist.stats.vals <- setNames(lapply(dist.stats, function(s) get(s)(num.dists.valid)), 
        dist.stats)
    dsv.i <- which(!is.na(dist.stats.vals) & !is.null(dist.stats.vals))
    if (length(dsv.i) > 0) {
        axt.lst <- list()
        for (dist.i in unique(dist.stats.vals[dsv.i])) {
            i <- which(dist.stats.vals == dist.i)
            stat.i <- paste(names(dist.stats.vals)[i], collapse = "&")
            gene.pair <- getPairsFromDistForIndices(which.min(abs(dist.msa - 
                dist.i)), dist.msa)[1, ]
            axt.lst[[stat.i]] <- cds.msa$seq[which(cds.msa$nam %in% 
                gene.pair)]
        }
        axt.lst
    } else NA
}

#' Identifies the pairs that match the argument indices \code{k} in the
#' argument \code{dist.obj}. E.g. you might want to know which pairs have
#' maximum distance: \code{getPairsFromDistForIndices(which(x.dist ==
#' max(x.dist)), x.dist)} will help you out. Code taken from this post:
#' https://stackoverflow.com/questions/39005958/r-how-to-get-row-column-subscripts-of-matched-elements-from-a-distance-matri
#'
#' @param k an integer vector holding the indices for which to identify the
#' pairs.
#' @param dist.obj An instance of \code{base::dist} that holds the distances
#' and labels from which to extract the pairs.
#'
#' @export
#' @return An instance of \code{base::matrix} with two columns in which each
#' row holds a pair. Cells can be \code{NA} if the argument indices were not
#' valid.
getPairsFromDistForIndices <- function(k, dist.obj) {
    if (!inherits(dist.obj, "dist")) 
        stop("please provide a 'dist' object")
    d.labels <- attr(dist.obj, "Labels")
    n <- attr(dist.obj, "Size")
    valid <- (k >= 1) & (k <= n * (n - 1)/2)
    k.valid <- k[valid]
    j <- rep.int(NA_real_, length(k))
    j[valid] <- floor(((2 * n + 1) - sqrt((2 * n - 1)^2 - 8 * 
        (k.valid - 1)))/2)
    i <- j + k - (2 * n - j) * (j - 1)/2
    cbind(d.labels[j], d.labels[i])
}

#' Tests the function \code{getPairsFromDistForIndices}.
#'
#' @export
#' @return \code{TRUE} if and only of all tests pass.
testGetPairsFromDistForIndices <- function() {
    x.d <- dist(setNames(1:3, LETTERS[1:3]))
    pair.1 <- cbind("A", "C")
    t.1 <- identical(pair.1, getPairsFromDistForIndices(which(x.d == 
        2), x.d))
    pairs.2 <- matrix(c("A", "B", "B", "C"), ncol = 2)
    t.2 <- identical(pairs.2, getPairsFromDistForIndices(which(x.d == 
        1), x.d))
    t.3 <- identical(matrix(rep(NA_character_, 2), ncol = 2), 
        getPairsFromDistForIndices(4, x.d))
    all(c(t.1, t.2, t.3))
}

#' Extracts pairs of genes from a CDS multiple sequence alignment.
#'
#' @param cds.msa an instance of Biostrings::DNAStringSet representing the
#' multiple alignment of coding sequences.
#' @param anchor.gene the gene identifier for which to build pairs with the
#' other genes present in 'cds.msa'.
#'
#' @return A named list. Names are the non anchor gene identifier that are
#' paired with the anchor gene. Values are the gene pairs themselfes.
#' @export
axt <- function(cds.msa, anchor.gene) {
    anchor.cds <- toString(cds.msa[[anchor.gene]])
    other.genes <- setdiff(names(cds.msa), anchor.gene)
    setNames(lapply(other.genes, function(x) {
        list(anchor.cds, toString(cds.msa[[x]]))
    }), other.genes)
}

#' Writes pairwise aligned sequences to a file of AXT format for Ka/Ks
#' calculation.
#'
#' @param axt a named list as returned from function 'axt(...)'
#' @param axt.path the valid file path indicating the output AXT file to
#' generate
#'
#' @return the result of calling readLines 
#' @export
writeAxt <- function(axt, axt.path) {
    axt.lines <- paste(as.character(unlist(lapply(names(axt), 
        function(nm) {
            gene.pair <- axt[[nm]]
            paste(nm, gene.pair[[1]], gene.pair[[2]], sep = "\n")
        }))), collapse = "\n\n")
    writeLines(axt.lines, con = axt.path)
}


#' Identifies duplicated inverse pairs and discards them.
#'
#' @param clst.hmlgs.list a named list in which for each pair member A is the
#' name and member B is the value
#' @param ret.as.list indicates whether to return the result as a list, subset
#' of the original 'clst.hmlgs.list', or as a character matrix of two columns one
#' column per member. For the latter option set 'ret.as.list' to FALSE.
#'
#' @return The subset of pairs that are not inverse duplicates, either as list
#' or as character matrix.
#' @export
removeDuplicatedInversePairs <- function(clst.hmlgs.list, ret.as.list = TRUE) {
    srtd.pairs <- do.call("rbind", lapply(names(clst.hmlgs.list), 
        function(x) {
            sort(c(x, clst.hmlgs.list[[x]]))
        }))
    non.dupl.pairs <- srtd.pairs[!duplicated(srtd.pairs), , drop = FALSE]
    if (ret.as.list) 
        setNames(as.list(non.dupl.pairs[, 2]), non.dupl.pairs[, 
            1]) else non.dupl.pairs
}

#' Identifies those tip and inner nodes of the phylogenetic tree that span
#' subtrees whose tips are true subsets of the argument 'foreground.nodes'.
#'
#' @param phylo.tr instance of ape::phylo representing a phylogenetic tree
#' @param foreground.nodes a vector of tip indices or tip labels that represent
#' candidate foreground tips.
#'
#' @export
#' @return integer vector of inner node indices that span subtrees containing
#' only foreground tips merged with the argument 'foreground.nodes'
foregroundNodes <- function(phylo.tr, foreground.nodes) {
    #' If tip labels are passed as argument infer their index in the tree's nodes
    #' slot:
    if (inherits(foreground.nodes, "character")) 
        foreground.nodes <- which(phylo.tr$tip.label %in% foreground.nodes)
    #' Find those inner nodes that spawn subtrees whose tips are true subsets of
    #' 'foreground.nodes'
    in.nds <- setdiff(phylo.tr$edge[, 1], 1:length(phylo.tr$tip.label))
    in.nds.spawned <- setNames(lapply(in.nds, function(x) Descendants(phylo.tr, 
        x, type = "tips")[[1]]), in.nds)
    union(foreground.nodes, as.integer(names(in.nds.spawned[as.logical(lapply(in.nds.spawned, 
        function(x) {
            length(setdiff(x, foreground.nodes)) == 0
        }))])))
}

#' Uses base::read.table to parse the output of 'KaKs_Calculator'.
#'
#' @param path.2.KaKs.Calc.output the valid file path to the output table to be
#' parsed
#'
#' @return An instance of base::data.frame with three columns: 'Sequence',
#' 'Ka.Ks', 'P.Value.Fisher.'
#' @export
readKaKsCalculatorOutput <- function(path.2.KaKs.Calc.output) {
    read.table(path.2.KaKs.Calc.output, header = TRUE, sep = "\t", 
        comment.char = "", quote = "", colClasses = c("character", 
            rep("NULL", 3), rep("numeric", 2), rep("NULL", 16)))
}

#' Extracts the gene pairs' identifiers from the 'Sequence' column of
#' 'hom.kaks.tbl' and replaces this column with the parse result.
#'
#' @param hom.kaks.tbl the result of calling 'readKaKsCalculatorOutput(...)' on
#' the result file of KaKs_Calculator used on homologs, that were NOT conserved
#' orthologs.
#'
#' @return A modified version of 'hom.kaks.tbl' in which the column 'Sequence'
#' is replaced with two columns 'gene.1' and 'gene.2'.
#' @export
addGeneIdsCols2HomologsKaKsTable <- function(hom.kaks.tbl) {
    x <- do.call("rbind", strsplit(hom.kaks.tbl$Sequence, "~"))
    colnames(x) <- c("gene.1", "gene.2")
    cbind(x, hom.kaks.tbl[, 2:3], stringsAsFactors = FALSE)
}

#' Uses 'gene.1' as the first member of the orthologous gene pair and extracts
#' the gene identifier of the pairs second member from the 'Sequence' column of
#' 'orth.kaks.tbl' and replaces this column with the result.
#'
#' @param orth.kaks.tbl the result of calling 'readKaKsCalculatorOutput(...)'
#' on the result file of KaKs_Calculator used on conserved orthologs.
#' @param gene.1 the identifier of the anchoring gene used within the currently
#' processed group of conserved orthologs.
#'
#' @return A modified version of 'orth.kaks.tbl' in which the column 'Sequence'
#' is replaced with two columns 'gene.1' and 'gene.2'.
#' @export
addGeneIdsCols2OrthologsKaKsTable <- function(orth.kaks.tbl, 
    gene.1) {
    x <- data.frame(gene.1 = gene.1, gene.2 = orth.kaks.tbl$Sequence, 
        stringsAsFactors = FALSE)
    cbind(x, orth.kaks.tbl[, 2:3], stringsAsFactors = FALSE)
}

#' Parses a FUBAR output table for significant posterior probabilities of a
#' codon being subject to positive selection. If any codons fit the criteria, a
#' respective table is returned, otherwise \code{NULL} is returned.
#'
#' @param path.2.fubar.csv a valid file path to the FUBAR csv file to parse
#' @param family.name A string indicating the name of the gene cluster (family)
#' subjected to the FUBAR analysis.
#' @param sign.post.prob The cutoff value the posterior probabilities have to
#' meet (\code{>=}) in order to be considered significant. Default is
#' \code{.9}.
#'
#' @return An instance of base::data.frame holding the tabular FUBAR output
#' with just the following columns: 1. Codon, 2. Post.Prob, 3. BayesFactor, and
#' 4. family. \code{NULL} is returned if no codon matches the significance
#' criteria.
#' @export
readFubarTable <- function(path.2.fubar.csv, family.name, sign.post.prob = 0.9) {
    fubar.txt <- system(paste("awk -F \",\" '{if ( $5 >=", sign.post.prob, 
        ") print $1 \"\\t\" $5 \"\\t\" $7 }'", path.2.fubar.csv), 
        intern = TRUE)
    if (length(fubar.txt) > 1) {
        x <- read.table(text = fubar.txt, skip = 1, stringsAsFactors = FALSE, 
            sep = "\t", comment.char = "", quote = "", colClasses = rep("numeric", 
                2))
        colnames(x) <- c("Codon", "Post.Prob", "BayesFactor")
        x$family <- family.name
        x
    } else NULL
}

#' Translates the FUBAR reported expected absolute NUMBER of false positives
#' into a relative false positive rate for each family within which positively
#' selected codons have been identified.
#'
#' @param fubar.tbl.res a named vector in which names are families and values
#' are data.frames of FUBAR results
#' @param fubar.expct.fls.pos.n named vector in which names are families and
#' values are the expected absolute number of false positives as reported by
#' FUBAR.
#'
#' @return A named vector in which names are families and values are relative
#' false positive rates per family. 
#' @export
inferExpectedFalsePositiveRate <- function(fubar.tbl.res, fubar.expct.fls.pos.n) {
    setNames(as.numeric(lapply(names(fubar.expct.fls.pos.n), 
        function(fam) {
            as.numeric(fubar.expct.fls.pos.n[[fam]])/nrow(fubar.tbl.res[[fam]])
        })), names(fubar.expct.fls.pos.n))
}

#' Generates a valid input file for Mr Bayes, which will generate a Bayesian
#' phylogeny from a multiple amino acid sequence alignment.
#'
#' @param aa.msa an instance of Biostrings::AAStringSet representing the MSA
#' @param mr.bayes.dir a valid file path to the output directory in which to
#' store the Mr Bayes analysis related files
#' @param gene.group.name the name of the group of proteins to generate the
#' Bayesian tree for
#' @param mr.bayes.n.chains number of chains to use in Mr. Bayes, default is
#' 'mc.cores' or 4, if 'mc.cores' is not set.
#'
#' @return NULL
#' @export
mrBayesProteinTree <- function(aa.msa, mr.bayes.dir, gene.group.name, 
    mr.bayes.n.chains = getOption("mc.cores", 4)) {
    mr.bayes.log.file <- paste(gene.group.name, "_mr_bayes_out.log", 
        sep = "")
    mr.bayes.out.file <- paste(gene.group.name, "_mr_bayes_out", 
        sep = "")
    gene.group.msa.nexus.path <- file.path(mr.bayes.dir, paste(gene.group.name, 
        "_AA_MSA.nex", sep = ""))
    write.nexus.data(as.list(aa.msa), gene.group.msa.nexus.path, 
        format = "PROTEIN")
    mr.bayes.inp.file <- file(gene.group.msa.nexus.path, "a")
    brew(text = mr.bayes.prot.tree, output = mr.bayes.inp.file)
    close(mr.bayes.inp.file)
}

#' Generates a valid input file for Mr Bayes, which will generate a Bayesian
#' phylogeny from a multiple coding sequence (codons) alignment.
#'
#' @param cds.msa an instance of Biostrings::DNAStringSet representing the MSA
#' of coding sequences (CODONS)
#' @param mr.bayes.dir a valid file path to the output directory in which to
#' store the Mr Bayes analysis related files
#' @param gene.group.name the name of the group of proteins to generate the
#' Bayesian tree for
#' @param mr.bayes.n.chains number of chains to use in Mr. Bayes, default is
#' 'mc.cores' or 4, if 'mc.cores' is not set.
#'
#' @return NULL
#' @export
mrBayesCdsTree <- function(cds.msa, mr.bayes.dir, gene.group.name, 
    mr.bayes.n.chains = getOption("mc.cores", 4)) {
    mr.bayes.log.file <- paste(gene.group.name, "_mr_bayes_out.log", 
        sep = "")
    mr.bayes.out.file <- paste(gene.group.name, "_mr_bayes_out", 
        sep = "")
    gene.group.msa.nexus.path <- file.path(mr.bayes.dir, paste(gene.group.name, 
        "_CDS_MSA.nex", sep = ""))
    write.nexus.data(as.list(cds.msa), gene.group.msa.nexus.path, 
        format = "dna")
    mr.bayes.inp.file <- file(gene.group.msa.nexus.path, "a")
    brew(text = mr.bayes.cds.tree, output = mr.bayes.inp.file)
    close(mr.bayes.inp.file)
}


#' Generates a valid input file for Mr Bayes, which will generate a Bayesian
#' phylogeny from a mixed multiple sequence alignment, e.g. CDS AND an alignment of
#' additional e.g. morphological data.
#'
#' @param msa.lst A list of multiple sequence alignments, a mix of DNA and
#' Standard data
#' @param msa.types a character vector of the types of each MSA: DNA, Codon, or
#' Standard
#' @param mr.bayes.dir The file directory in which to store the output file
#' @param the name of the analysis. Will be the head of the file name holding
#' the input for Mr Bayes
#'
#' @return NULL
#' @export
mrBayesTreeFromMixedMSA <- function(msa.lst, msa.types, mr.bayes.dir, 
    gene.group.name, mr.bayes.n.chains = getOption("mc.cores", 
        4)) {
    mr.bayes.log.file <- paste(gene.group.name, "_mr_bayes_out.log", 
        sep = "")
    mr.bayes.out.file <- paste(gene.group.name, "_mr_bayes_out", 
        sep = "")
    gene.group.msa.nexus.path <- file.path(mr.bayes.dir, paste(gene.group.name, 
        ".nex", sep = ""))
    gene.ids <- names(msa.lst[[1]])
    mixed.msa <- setNames(lapply(gene.ids, function(x) {
        paste(lapply(msa.lst, function(i.msa) toString(i.msa[[x]])), 
            collapse = "")
    }), gene.ids)
    write.nexus.data(mixed.msa, gene.group.msa.nexus.path, format = "dna")
    nchar.mix <- sum(unlist(lapply(msa.lst, function(x) nchar(x)[[1]])))
    mixed.nex <- readChar(gene.group.msa.nexus.path, file.info(gene.group.msa.nexus.path)$size)
    mixed.nex <- sub("NCHAR=1", paste("NCHAR=", nchar.mix, sep = ""), 
        mixed.nex, fixed = TRUE)
    n <- 1
    mixed.nex <- sub("DATATYPE=DNA", paste("DATATYPE=MIXED(", 
        paste(lapply(1:length(msa.types), function(i) {
            msa.len <- nchar(msa.lst[[i]])[[1]]
            msa.str <- paste(msa.types[[i]], ":", n, "-", msa.len + 
                (n - 1), sep = "")
            n <<- n + msa.len
            msa.str
        }), collapse = ","), ")", sep = ""), mixed.nex, fixed = TRUE)
    writeLines(toString(mixed.nex), gene.group.msa.nexus.path)
    n <- 1
    mr.bayes.char.sets <- setNames(lapply(msa.lst, function(x) {
        msa.lst <- list(n, nchar(x)[[1]] + (n - 1))
        n <<- n + nchar(x)[[1]]
        msa.lst
    }), names(msa.lst))
    gene.group.msa.nexus.file <- file(gene.group.msa.nexus.path, 
        "a")
    brew(text = mr.bayes.mixed.cds.tree, output = gene.group.msa.nexus.file)
    close(gene.group.msa.nexus.file)
}

#' Generates a valid input file for Mr Bayes, which will generate a Bayesian
#' phylogeny from a multiple amino acid sequence alignment AND an alignment of
#' additional e.g. morphological data.
#'
#' @param aa.msa an instance of Biostrings::AAStringSet representing the MSA
#' @param extra.msa a mamed list of additional e.g. morphological data,
#' aligned. Names have to be identical with names of 'aa.msa'.
#' @param mr.bayes.dir a valid file path to the output directory in which to
#' store the Mr Bayes analysis related files
#' @param gene.group.name the name of the group of proteins to generate the
#' Bayesian tree for
#' @param mr.bayes.n.chains number of chains to use in Mr. Bayes, default is
#' 'mc.cores' or 4, if 'mc.cores' is not set.
#'
#' @return NULL
#' @export
mrBayesProteinTreeFromMixedMSA <- function(aa.msa, extra.msa, 
    mr.bayes.dir, gene.group.name, mr.bayes.n.chains = getOption("mc.cores", 
        4)) {
    mr.bayes.log.file <- paste(gene.group.name, "_mr_bayes_out.log", 
        sep = "")
    mr.bayes.out.file <- paste(gene.group.name, "_mr_bayes_out", 
        sep = "")
    gene.group.msa.nexus.path <- file.path(mr.bayes.dir, paste(gene.group.name, 
        "_mixed_AA_extra_MSA.nex", sep = ""))
    mixed.msa <- setNames(lapply(names(aa.msa), function(x) {
        paste(toString(aa.msa[[x]]), extra.msa[[x]], sep = "")
    }), names(aa.msa))
    write.nexus.data(mixed.msa, gene.group.msa.nexus.path, format = "PROTEIN")
    nchar.aa <- unique(nchar(aa.msa))[[1]]
    nchar.extra <- unique(nchar(extra.msa))[[1]]
    mixed.nex <- readChar(gene.group.msa.nexus.path, file.info(gene.group.msa.nexus.path)$size)
    mixed.nex <- sub("NCHAR=1", paste("NCHAR=", (nchar.aa + nchar.extra), 
        sep = ""), mixed.nex, fixed = TRUE)
    mixed.nex <- sub("DATATYPE=PROTEIN", paste("DATATYPE=MIXED(PROTEIN:1-", 
        nchar.aa, ",Standard:", (nchar.aa + 1), "-", (nchar.aa + 
            nchar.extra), ")", sep = ""), mixed.nex, fixed = TRUE)
    writeLines(toString(mixed.nex), gene.group.msa.nexus.path)
    mr.bayes.char.sets <- list(aaseqs = list(1, nchar.aa), extra = list((1 + 
        nchar.aa), (nchar.aa + nchar.extra)))
    gene.group.msa.nexus.file <- file(gene.group.msa.nexus.path, 
        "a")
    brew(text = mr.bayes.mixed.prot.tree, output = gene.group.msa.nexus.file)
    close(gene.group.msa.nexus.file)
}

#' Uses GNU sed to replace the sanitized gene identifiers with the original
#' ones in the Fig-Tree file - consensus tree generated by Mr. Bayes.
#'
#' @param name.mappings data.frame with two columns 'original', and
#' 'sanitized', holding the pairs of original gene ID to sanitized gene ID.
#' @param path.2.fig.tree the valid file path to the tree file to process
#' @param path.2.fig.tree.orig the valid file path to the output file in which
#' the original gene identifiers are used. Must not be identical to
#' 'path.2.fig.tree'. Default is 'path.2.fig.tree' with its suffix '.con.tre'
#' modified to '_original_identifiers.con.tre'.
#'
#' @return NULL
#' @export
replaceWithOriginalInFigTree <- function(name.mappings, path.2.fig.tree, 
    path.2.fig.tree.orig = sub(".con.tre", "_original_identifiers.con.tre", 
        path.2.fig.tree, fixed = TRUE)) {
    system(paste(c("sed", paste("-e 's/\\b", name.mappings$sanitized, 
        "\\b/", name.mappings$original, "/'", sep = ""), path.2.fig.tree, 
        ">", path.2.fig.tree.orig), collapse = " "))
}

#' Parses a HyPhy MEME branch result file to extract significant LEAVES with
#' codons showing signs of positive selection.
#'
#' @param meme.file a valid file path to a MEME branch result file, e.g.
#' 'fam8119_hyphy_meme_output.txt.branches'
#' @param bayes.factor.sign.cutoff The numeric value above which to consider a
#' Bayes-Factor to be significant. Default is highly conservative and set to
#' 100 aka 'decisive'
#' @param fam.name.reg.exp The regular expression used within
#' \code{base::regexec(...)} to extract the gene family's name from the
#' \code{meme.file} argument
#'
#' @return A data.frame with four columns: Family, Site, Branch, and
#' BayesianFactor, where the rows hold the significant results. NULL is
#' returned if no significant results were contained in \code{meme.file}.
#' @export
parseMemeBranchResults <- function(meme.file, bayes.factor.sign.cutoff = 100, 
    fam.name.reg.exp = "/(fam\\d+)/") {
    fam.name <- regmatches(meme.file, regexec(fam.name.reg.exp, 
        meme.file))[[1]][[2]]
    fam.dir <- sub(paste(fam.name, "_hyphy_meme_output.txt.branches", 
        sep = ""), "", meme.file, fixed = TRUE)
    meme.signif.branches <- system(paste("sed -e '1,2d'", meme.file, 
        "| awk -F \",\" '{ if($1 ~ /Site/ || $4 >", bayes.factor.sign.cutoff, 
        "&& $2 ~ /PROT/) { print $1 \"\\t\" $2 \"\\t\" $4 } }'"), 
        intern = TRUE)
    if (!is.null(meme.signif.branches) && !is.na(meme.signif.branches) && 
        length(meme.signif.branches) > 1) {
        fam.name.maps <- read.table(file.path(fam.dir, paste(fam.name, 
            "_name_mappings_table.txt", sep = "")), sep = "\t", 
            header = TRUE, stringsAsFactors = FALSE, quote = "", 
            comment.char = "", na.strings = "", colClasses = rep("character", 
                2))
        x <- cbind(Family = fam.name, read.table(text = meme.signif.branches, 
            sep = "\t", quote = "", comment.char = "", na.strings = "", 
            colClasses = c("integer", "character", "numeric"), 
            header = TRUE, stringsAsFactors = FALSE))
        x$Branch <- replaceWithOriginal(x$Branch, fam.name.maps)
        x
    } else NULL
}
asishallab/GeneFamilies documentation built on July 28, 2024, 11:44 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
asishallab/GeneFamilies
Analyses different properties of gene families

R/selection_funks.R
In asishallab/GeneFamilies: Analyses different properties of gene families

R Package Documentation

Browse R Packages

We want your feedback!

asishallab/GeneFamilies Analyses different properties of gene families

R/selection_funks.R In asishallab/GeneFamilies: Analyses different properties of gene families

R Package Documentation

Browse R Packages

We want your feedback!

asishallab/GeneFamilies
Analyses different properties of gene families

R/selection_funks.R
In asishallab/GeneFamilies: Analyses different properties of gene families