R/rbh2kaks.R

Defines functions rbh2kaks

Documented in rbh2kaks

#' @title rbh2kaks
#' @name rbh2kaks
#' @description This function calculates Ka/Ks (dN/dS; accoring to
#' \emph{Li (1993)} or \emph{Yang and Nielson (2000)} for each
#' (conditional-)reciprocal best hit (CRBHit) pair. The names of the \code{rbh}
#' columns must match the names of the corresponding \code{cds1} and \code{cds2}
#' \code{DNAStringSet} vectors.
#' @param rbhpairs (conditional-)reciprocal best hit (CRBHit) pair result
#' (see \code{\link[CRBHits]{cds2rbh}}) [mandatory]
#' @param cds1 cds1 sequences as \code{DNAStringSet} or \code{url} for first
#' crbh pairs column [mandatory]
#' @param cds2 cds2 sequences as \code{DNAStringSet} or \code{url} for second
#' crbh pairs column [mandatory]
#' @param model specify codon model either "Li" or "NG86"
#' or one of KaKs_Calculator2 model "NG", "LWL", "LPB", "MLWL",
#' "MLPB", "GY", "YN", "MYN", "MS", "MA", "GNG", "GLWL", "GLPB",
#' "GMLWL", "GMLPB", "GYN", "GMYN" [default: Li]
#' @param plotHistPlot specify if histogram should be plotted [default: TRUE]
#' @param plotDotPlot specify if dotplot should be plotted (mandatory to define
#' \code{gene.position.cds1} and \code{gene.position.cds1}) [default: FALSE]
#' @param dag specify DAGchainer results as obtained via `rbh2dagchainer()`
#' [default: NULL]
#' @param gene.position.cds1 specify gene position for cds1 sequences
#' (see \code{\link[CRBHits]{cds2genepos}}) [default: NULL]
#' @param gene.position.cds2 specify gene position for cds2 sequences
#' (see \code{\link[CRBHits]{cds2genepos}}) [default: NULL]
#' @param tandem.dups.cds1 specify tandem duplicates for cds1 sequences
#' (see \code{\link[CRBHits]{tandemdups}}) [default: NULL]
#' @param tandem.dups.cds2 specify tandem duplicates for cds2 sequences
#' (see \code{\link[CRBHits]{tandemdups}}) [default: NULL]
#' @param colorBy specify if Ka/Ks gene pairs should be colored by "rbh_class",
#' dagchainer", "tandemdups" or "none" [default: none]
#' @param threads number of parallel threads [default: 1]
#' @param ... other codon alignment parameters
#' (see \code{\link[MSA2dist]{cds2codonaln}}) and other plot_kaks parameters
#' (see \code{\link[CRBHits]{plot_kaks}})
#' @return Ka/Ks values
#' @importFrom parallel makeForkCluster stopCluster
#' @importFrom doParallel registerDoParallel
#' @importFrom foreach foreach %do% %dopar%
#' @importFrom Biostrings DNAString DNAStringSet AAString AAStringSet
#' readDNAStringSet readAAStringSet writeXStringSet width subseq
#' @importFrom stringr word
#' @importFrom MSA2dist dnastring2kaks cds2codonaln indices2kaks
#' @seealso \code{\link[MSA2dist]{dnastring2kaks}},
#' \code{\link[CRBHits]{isoform2longest}},
#' \code{\link[CRBHits]{cds2genepos}},
#' \code{\link[CRBHits]{plot_kaks}},
#' \code{\link[CRBHits]{rbh2dagchainer}},
#' \code{\link[CRBHits]{tandemdups}}
#' @references Li WH. (1993) Unbiased estimation of the rates of synonymous and
#' nonsynonymous substitution. \emph{J. Mol. Evol.}, \bold{36}, 96-99.
#' @references Wang D, Zhang Y et al. (2010) KaKs_Calculator 2.0: a toolkit
#' incorporating gamma-series methods and sliding window strategies.
#' \emph{Genomics Proteomics Bioinformatics.} \bold{8(1)}, 77-80.
#' @references Yang Z and Nielson R. (2000) Estimating synonymous and
#' nonsynonymous substitution rates under realistic evolutionary models.
#' \emph{Mol. Biol. Evol.}, \bold{17(1)}, 32-43.
#' @examples
#' ## load example sequence data
#' data("ath", package="CRBHits")
#' data("aly", package="CRBHits")
#' ## load example CRBHit pairs
#' data("ath_aly_crbh", package="CRBHits")
#' ## only analyse subset of CRBHit pairs
#' ath_aly_crbh$crbh.pairs <- head(ath_aly_crbh$crbh.pairs)
#' ath_aly_crbh.kaks <- rbh2kaks(
#'     rbhpairs=ath_aly_crbh,
#'     cds1=ath,
#'     cds2=aly,
#'     model="Li")
#' head(ath_aly_crbh.kaks)
#' ## plot kaks
#' g.kaks <- plot_kaks(ath_aly_crbh.kaks)
#' @export rbh2kaks
#' @author Kristian K Ullrich

rbh2kaks <- function(rbhpairs, cds1, cds2,
    model="Li",
    plotHistPlot=FALSE,
    plotDotPlot=FALSE,
    dag=NULL,
    gene.position.cds1=NULL,
    gene.position.cds2=NULL,
    tandem.dups.cds1=NULL,
    tandem.dups.cds2=NULL,
    colorBy="none",
    threads=1,
    ...
    ){
    if(attributes(rbhpairs)$CRBHits.class!="crbh"){
        stop("Please obtain rbhpairs via the cds2rbh() or the cdsfile2rbh()
            function")
    }
    selfblast <- attributes(rbhpairs)$selfblast
    if(!is.null(gene.position.cds1)){
        if(attributes(gene.position.cds1)$CRBHits.class!="genepos"){
            stop("Please obtain gene position via the cds2genepos() function or
                add a 'genepos' class attribute")
        }
    }
    if(!is.null(gene.position.cds2)){
        if(attributes(gene.position.cds2)$CRBHits.class!="genepos"){
            stop("Please obtain gene position via the cds2genepos() function or
                add a 'genepos' class attribute")
        }
    }
    if(!is.null(tandem.dups.cds1)){
        if(attributes(tandem.dups.cds1)$CRBHits.class!="tandemdups"){
            stop("Please obtain tandem duplicates via the tandemdups() function
                or add a 'tandemdups' class attribute")
        }
    }
    if(!is.null(tandem.dups.cds2)){
        if(attributes(tandem.dups.cds2)$CRBHits.class != "tandemdups"){
            stop("Please obtain tandem duplicates via the tandemdups() function
                or add a 'tandemdups' class attribute")
        }
    }
    if(!model %in% c("Li", "NG86", "NG", "LWL", "LPB", "MLWL",
        "MLPB", "GY", "YN", "MYN", "MS", "MA", "GNG", "GLWL",
        "GLPB", "GMLWL", "GMLPB", "GYN", "GMYN")){
        stop("Error: either choose model 'Li', 'NG86', 'NG',
            'LWL', 'LPB', 'MLWL', 'MLPB', 'GY', 'YN', 'MYN',
            'MS', 'MA', 'GNG', 'GLWL', 'GLPB', 'GMLWL',
             'GMLPB', 'GYN', 'GMYN'")
    }
    if(plotDotPlot){
        if(is.null(gene.position.cds1) & is.null(gene.position.cds2)){
            stop("Error: Please specify gene.position.cds1 and
                gene.position.cds2")
        }
    }
    #internal function to get cds by name
    get_cds_by_name <- function(x, cds){
        return(cds[names(cds)==x])
    }
    if(is(cds1, "character")){cds1 <- Biostrings::readDNAStringSet(cds1)}
    if(is(cds2, "character")){cds2 <- Biostrings::readDNAStringSet(cds2)}
    names(cds1) <- stringr::word(names(cds1), 1)
    names(cds2) <- stringr::word(names(cds2), 1)
    #doMC::registerDoMC(threads)
    #if (.Platform$OS.type == "windows") {
    #    cl <- parallel::makeCluster(threads)
    #}
    #if (.Platform$OS.type != "windows") {
    #    cl <- parallel::makeForkCluster(threads)
    #}
    #doParallel::registerDoParallel(cl)
    #i <- NULL
    rbhpairs.crbh.pairs <- rbhpairs$crbh.pairs
    cds <- c(
        do.call(c, unlist(lapply(rbhpairs.crbh.pairs$aa1, function(x) {
            get_cds_by_name(x, cds1)}))),
        do.call(c, unlist(lapply(rbhpairs.crbh.pairs$aa2, function(x) {
            get_cds_by_name(x, cds2)}))))
    idx <- lapply(apply(cbind(seq(from=1, to=length(rbhpairs.crbh.pairs$aa1)),
        seq(from=length(rbhpairs.crbh.pairs$aa1)+1,
        to=2*length(rbhpairs.crbh.pairs$aa1))),1,as.list),unlist)
    #rbh.kaks <- foreach::foreach(i=seq(from=1, to=dim(rbhpairs.crbh.pairs)[1]),
        #.combine = rbind) %dopar% {
        #t(MSA2dist::dnastring2kaks(c(
        #    get_cds_by_name(rbhpairs.crbh.pairs[i,1], cds1),
        #    get_cds_by_name(rbhpairs.crbh.pairs[i,2], cds2)), model=model,
        #    isMSA=FALSE, threads=1, ...))
        #CRBHits::cds2kaks(get_cds_by_name(rbhpairs.crbh.pairs[i,1], cds1),
            #get_cds_by_name(rbhpairs.crbh.pairs[i,2], cds2), model=model,
            #kakscalcpath=kakscalcpath, ...)
    #}
    #parallel::stopCluster(cl)
    rbh.kaks <- MSA2dist::indices2kaks(cds, idx, model=model, threads=threads,
        isMSA=FALSE, ...)
    out <- cbind(rbhpairs.crbh.pairs, rbh.kaks)
    attr(out, "CRBHits.class") <- "kaks"
    if(model=="Li"){
        attr(out, "CRBHits.model") <- "Li"
    }
    if(model=="NG86"){
        attr(out, "CRBHits.model") <- "NG86"
    }
    if(model=="NG"){
        attr(out, "CRBHits.model") <- "NG"
    }
    if(model=="LWL"){
        attr(out, "CRBHits.model") <- "LWL"
    }
    if(model=="LPB"){
        attr(out, "CRBHits.model") <- "LPB"
    }
    if(model=="MLWL"){
        attr(out, "CRBHits.model") <- "MLWL"
    }
    if(model=="MLPB"){
        attr(out, "CRBHits.model") <- "MLPB"
    }
    if(model=="GY"){
        attr(out, "CRBHits.model") <- "GY"
    }
    if(model=="YN"){
        attr(out, "CRBHits.model") <- "YN"
    }
    if(model=="MYN"){
        attr(out, "CRBHits.model") <- "MYN"
    }
    if(model=="MS"){
        attr(out, "CRBHits.model") <- "MS"
    }
    if(model=="MA"){
        attr(out, "CRBHits.model") <- "MA"
    }
    if(model=="GNG"){
        attr(out, "CRBHits.model") <- "GNG"
    }
    if(model=="GLWL"){
        attr(out, "CRBHits.model") <- "GLWL"
    }
    if(model=="GLPB"){
        attr(out, "CRBHits.model") <- "GLPB"
    }
    if(model=="GMLWL"){
        attr(out, "CRBHits.model") <- "GMLWL"
    }
    if(model=="GMLPB"){
        attr(out, "CRBHits.model") <- "GMLPB"
    }
    if(model=="GYN"){
        attr(out, "CRBHits.model") <- "GYN"
    }
    if(model=="GMYN"){
        attr(out, "CRBHits.model") <- "GMYN"
    }
    attr(out, "selfblast") <- selfblast
    if(plotHistPlot){
        plot.out <- plot_kaks(out,
            dag=dag,
            gene.position.cds1=gene.position.cds1,
            gene.position.cds2=gene.position.cds2,
            tandem.dups.cds1=tandem.dups.cds1,
            tandem.dups.cds2=tandem.dups.cds2,
            PlotType="h",
            colorBy=colorBy,
            ...)
    }
    if(plotDotPlot){
        plot.out <- plot_kaks(out,
            dag=dag,
            gene.position.cds1=gene.position.cds1,
            gene.position.cds2=gene.position.cds2,
            tandem.dups.cds1=tandem.dups.cds1,
            tandem.dups.cds2=tandem.dups.cds2,
            PlotType="d",
            colorBy=colorBy,
            ...)
    }
    return(out)
}
kullrich/CRBHits documentation built on Nov. 13, 2024, 7:44 a.m.