inst/oldScripts/TKF.r

#!/usr/bin/env Rscript --vanilla
rm(list=ls())

selfDir = "/Users/gtan/Repository/Github/Phylogenetics"
selfScripts = list.files(path=selfDir, pattern='.*\\.r', full.names=TRUE, recursive=FALSE)
for(rs in selfScripts){source(rs)}

library(getopt);
spec = matrix(
    c('help',      'h', 0, "logical", "print the help usage information",
      'TKF',       'T', 1, "character", "91 or 92",
      'fasta',     'f', 1, "character", "the input fasta file path",
      'out',       'o', 1, "character", "the output file path",
      'daymatrix', 'd', 2, "character", "the daymatrix path or the default Gonnet daymtrix will be used.",
      'joint',     'j', 0, "logical", "estimate the parameters jointly or not",
      'Mu',        'M', 2, "double", "the death rate",
      'Indelr',    'r', 2, "double", "the indel length distribution rate",
      'Length',    'l', 2, "double", "the expected length or the average length of the pair sequence will be used."
     ), ncol=5, byrow=TRUE)
Usage = getopt(spec, usage=TRUE)
opt = getopt(spec)
#help was asked for.
if (!is.null(opt$help) || length(opt) == 1){
    cat(Usage)
    q("no")
}

# initialize some constant values
if(is.null(opt$fasta)){
    stop("Please provide the input fasta sequence path")
}else{
    FastaFn = opt$fasta
}
if(is.null(opt$out)){
    stop("Please provide the output file path")
}else{
    OutFn = opt$out
}
if(is.null(opt$daymatrix)){
    PAM1fn = file.path(TKF_DIR, "daymatrix.txt")
}else{
    PAM1fn = opt$daymatrix
}
if(is.null(opt$joint)){
    JointEstimation = FALSE
    if((opt$TKF == "91") && (is.null(opt$Mu))){
        stop("When just estimate distance using TKF91, please provide fixed Mu:")
    }else if((opt$TKF == "91") && (!is.null(opt$Mu))){
        FixedMu = opt$Mu
    }
    if((opt$TKF == "92") && (is.null(opt$Mu) || is.null(opt$Indelr))){
        stop("When just estimate distance using TKF92, please provide fixed Mu and r")
    }else if((opt$TKF == "92") && (!is.null(opt$Mu)) && (!is.null(opt$Indelr))){
        FixedMu = opt$Mu
        Fixedr = opt$Indelr
    }
}else{
    JointEstimation = TRUE
}

## define some utilities functions
BuiltInLogPAM1 = function(fn){
    # Read the PAM1 file generated by darwin, output the suitable logPAM1 in R
    # in PAM1 file, the first rom is the AA frequencies for each amino acid in standard order
    # The rest 20 rows are the mutation matrix at PAM1. M[i,j] means from i to j.
    require(expm)
    tablePAM1 = read.table(fn,header=FALSE)
    tablePAM1 = as.matrix(tablePAM1)
    AAFre = tablePAM1[1,]
    AAOrder = c("A","R","N","D","C","Q","E","G","H","I","L","K","M","F","P","S","T","W","Y","V")
    names(AAFre) = AAOrder
    tableLogPAM1 = logm(tablePAM1[2:21,])
    colnames(tableLogPAM1) = AAOrder
    rownames(tableLogPAM1) = AAOrder
    return(list("PI"=AAFre, "logPAM1"=tableLogPAM1))
}

ReadFasta = function(fn, seqType){
    # Read the fasta file, return a list consisting of a array of labels and list of sequences
    require(seqinr)
    res = read.fasta(file = fn, seqtype=seqType, set.attributes=FALSE)
    return(res)
}

AAToInt = function(A){
    # Input is the vector of amino acids symbols, output is the corresponding number in AAOrder.
    AAOrder = c("A","R","N","D","C","Q","E","G","H","I","L","K","M","F","P","S","T","W","Y","V")
    Intlist = c()
    for(i in 1:length(A)){
        Intlist = append(Intlist, which(AAOrder == A[i]))
    }
    return(Intlist)
}

## define the TKF92 likelihood functions
Likelihood_TKF92 = function(distance, Mu, r, sequenceA, sequenceB){
    if(Mu >=1 || Mu <= 0 || distance < 0 || r < 0 || r >=1) {
        return(1e10)
    }
    require(expm)
    SA = length(sequenceA)
    SB = length(sequenceB)
    if(is.null(opt$Length)){
        AvgLen = mean(SA, SB)
    }else{
        AvgLen = opt$Length
    }
    sequenceA = AAToInt(sequenceA)
    sequenceB = AAToInt(sequenceB)
    #   PAM matrix at distance     
    PAMk = expm(distance * logPAM1)
    PAMk = as.vector(PAMk)
    dyn.load(file.path(TKF_DIR, "dynprog_TKF92.so"))
    res = .C("dynprog", as.double(Mu), as.double(distance), as.double(r), as.integer(SA), as.integer(SB), as.double(AvgLen), as.integer(sequenceA), as.integer(sequenceB), as.double(PAMk), as.double(PI), result = as.double(0))
    return(res[["result"]])
}
Likelihood3D_TKF92 = function(x,sequenceA, sequenceB){
    # the order of x is distance, Mu, r
    Likelihood_TKF92(x[1], x[2], x[3],sequenceA, sequenceB)
}
Likelihood1D_TKF92 = function(x, FixedMu, Fixedr, sequenceA, sequenceB){
    Likelihood_TKF92(x, FixedMu, Fixedr, sequenceA, sequenceB)
}

## define the TKF91 likelihood functions
Likelihood_TKF91 = function(distance, Mu, sequenceA, sequenceB){
    if(Mu >= 1 || Mu <= 0 || distance < 0){
        return(1e10)
    }
    require(expm)
    SA = length(sequenceA)
    SB = length(sequenceB)
    if(is.null(opt$Length)){
        AvgLen = mean(SA, SB)
    }else{
        AvgLen = opt$Length
    }
    sequenceA = AAToInt(sequenceA)
    sequenceB = AAToInt(sequenceB)
    #   PAM matrix at distance     
    PAMk = expm(distance * logPAM1)
    PAMk = as.vector(PAMk)
    dyn.load(file.path(TKF_DIR, "dynprog_TKF91.so"))
    res = .C("dynprog", as.double(Mu), as.double(distance), as.integer(SA), as.integer(SB), as.double(AvgLen), as.integer(sequenceA), as.integer(sequenceB), as.double(PAMk), as.double(PI), result = as.double(0))
    return(res[["result"]])
}
Likelihood2D_TKF91 = function(x,sequenceA, sequenceB){
    # the order of x is distance, Mu
    Likelihood_TKF91(x[1], x[2],sequenceA, sequenceB)
}
Likelihood1D_TKF91 = function(x, FixedMu, sequenceA, sequenceB){
    Likelihood_TKF91(x, FixedMu, sequenceA, sequenceB)
}


##### The main program
sequences = ReadFasta(FastaFn,"AA")
BuiltInPAM = BuiltInLogPAM1(PAM1fn)
logPAM1 = BuiltInPAM$logPAM1
PI = BuiltInPAM$PI
distanceMatrix = matrix(0,nrow=length(sequences), ncol=length(sequences))
varianceMatrix = distanceMatrix
likelihoodMatrix = distanceMatrix
if(JointEstimation){
    MuMatrix = varianceMatrix
    if(opt$TKF == "91"){
        distanceMatrix2D_1 = distanceMatrix
        varianceMatrix2D_1 = distanceMatrix
        likelihoodMatrix2D_1 = distanceMatrix
    }else if(opt$TKF == "92"){
        rMatrix = varianceMatrix
        distanceMatrix3D_1 = distanceMatrix
        varianceMatrix3D_1 = distanceMatrix
        likelihoodMatrix3D_1 = distanceMatrix
    }else{
        stop("This should not happen. Option TKF should be 91 or 92")
    }
}

#library(numDeriv)
library(MASS)
time1 = proc.time()
for(i in 1:(length(sequences)-1)){
    for(j in (i+1):length(sequences)){
        if(JointEstimation){
            sequenceA = sequences[[i]]
            sequenceB = sequences[[j]]
            if(opt$TKF == "91"){
                # initial values of the optimization for distance, Mu
                x0 = c(100,0.0033)
                res = optim(x0, Likelihood2D_TKF91, gr = NULL, sequenceA, sequenceB, method="Nelder-Mead", control=list(maxit=1000), hessian=TRUE)
                res2 = optim(100, Likelihood1D_TKF91, gr = NULL, res$par[2], sequenceA, sequenceB, method="Brent", lower=0.5, upper=1000, hessian=TRUE)
                distanceMatrix[i,j] = distanceMatrix[j,i] = res$par[1]
                varianceMatrix[i,j] = varianceMatrix[j,i] = ginv(res$hessian)[1,1]
                likelihoodMatrix[i,j] = likelihoodMatrix[j,i] = res$value
                MuMatrix[i,j] = MuMatrix[j,i] = res$par[2]
                distanceMatrix2D_1[i,j] = distanceMatrix2D_1[j,i] = res2$par[1]
                varianceMatrix2D_1[i,j] = varianceMatrix2D_1[j,i] = ginv(res2$hessian)[1,1]
                likelihoodMatrix2D_1[i,j] = likelihoodMatrix2D_1[j,i] = res2$value
            }else if(opt$TKF == "92"){
                # initial valus of the optimization for distance, Mu, r
                x0 = c(100, 0.0033, 0.5)
                res = optim(x0, Likelihood3D_TKF92, gr = NULL, sequenceA, sequenceB, method="Nelder-Mead", control=list(maxit=1000), hessian=TRUE)
                res2 = optim(100, Likelihood1D_TKF92, gr = NULL, res$par[2], res$par[3], sequenceA, sequenceB, method="Brent", lower=0.5, upper=1000, hessian=TRUE)
                distanceMatrix[i,j] = distanceMatrix[j,i] = res$par[1]
                varianceMatrix[i,j] = varianceMatrix[j,i] = ginv(res$hessian)[1,1]
                likelihoodMatrix[i,j] = likelihoodMatrix[j,i] = res$value
                MuMatrix[i,j] = MuMatrix[j,i] = res$par[2]
                rMatrix[i,j] = rMatrix[j,i] = res$par[3]
                distanceMatrix3D_1[i,j] = distanceMatrix3D_1[j,i] = res2$par[1]
                varianceMatrix3D_1[i,j] = varianceMatrix3D_1[j,i] = ginv(res2$hessian)[1,1]
                likelihoodMatrix3D_1[i,j] = likelihoodMatrix3D_1[j,i] = res2$value
            }
        }else{
            sequenceA = sequences[[i]]
            sequenceB = sequences[[j]]
            if(opt$TKF == "91"){
                x0 = 100
                res = optim(x0, Likelihood1D_TKF91, gr = NULL, FixedMu, sequenceA, sequenceB, method="Brent", lower=0.5, upper=1000, hessian=TRUE)
                distanceMatrix[i,j] = distanceMatrix[j,i] = res$par[1]
                varianceMatrix[i,j] = varianceMatrix[j,i] = ginv(res$hessian)[1,1]
                likelihoodMatrix[i,j] = likelihoodMatrix[j,i] = res$value
            }else if(opt$TKF == "92"){
                x0 = 100
                res = optim(x0, Likelihood1D_TKF92, gr = NULL, FixedMu, Fixedr, sequenceA, sequenceB, method="Brent", lower=0.5, upper=1000, hessian=TRUE)
                distanceMatrix[i,j] = distanceMatrix[j,i] = res$par[1]
                varianceMatrix[i,j] = varianceMatrix[j,i] = ginv(res$hessian)[1,1]
                likelihoodMatrix[i,j] = likelihoodMatrix[j,i] = res$value
            }
        }
    }
}

## Output the result file
sink(OutFn)
options(digits=15)
if(JointEstimation){
    if(opt$TKF == "91"){
        cat("distance", "variance", "likelihood", "Mu", "distance2D_1", "variance2D_1", "likelihood2D_1", sep="\t")
        cat("\n")
        for(i in 1:(length(sequences)-1)){
            for(j in (i+1):length(sequences)){
                cat(distanceMatrix[i,j], varianceMatrix[i,j], likelihoodMatrix[i,j],MuMatrix[i,j], distanceMatrix2D_1[i,j], varianceMatrix2D_1[i,j], likelihoodMatrix2D_1[i,j], sep="\t")
                cat("\n")
            }
        }
    }else if(opt$TKF == "92"){
        cat("distance", "variance", "likelihood", "Mu", "r", "distance3D_1", "variance3D_1", "likelihood3D_1", sep="\t")
        cat("\n")
        for(i in 1:(length(sequences)-1)){
            for(j in (i+1):length(sequences)){
                cat(distanceMatrix[i,j], varianceMatrix[i,j], likelihoodMatrix[i,j],MuMatrix[i,j], rMatrix[i,j], distanceMatrix3D_1[i,j], varianceMatrix3D_1[i,j], likelihoodMatrix3D_1[i,j], sep="\t")
                cat("\n")
            }
        }
    }
}else{
    if(opt$TKF == "91"){
        cat("distance", "variance", "likelihood", sep="\t")
        cat("\n")
        for(i in 1:(length(sequences)-1)){
            for(j in (i+1):length(sequences)){
                cat(distanceMatrix[i,j], varianceMatrix[i,j], likelihoodMatrix[i,j], sep="\t")
                cat("\n")
            }
        }
    }else if(opt$TKF == "92"){
        cat("distance", "variance", "likelihood", sep="\t")
        cat("\n")
        for(i in 1:(length(sequences)-1)){
            for(j in (i+1):length(sequences)){
                cat(distanceMatrix[i,j], varianceMatrix[i,j], likelihoodMatrix[i,j], sep="\t")
                cat("\n")
            }
        }
    }
}
cat("Time:\n")
cat(proc.time()- time1)
sink()
ge11232002/TKF documentation built on May 17, 2019, 12:13 a.m.