Nothing
#!/usr/bin/env Rscript --vanilla
rm(list=ls())
selfDir = "/Users/gtan/Repository/Github/Phylogenetics"
selfScripts = list.files(path=selfDir, pattern='.*\\.r', full.names=TRUE, recursive=FALSE)
for(rs in selfScripts){source(rs)}
library(getopt);
spec = matrix(
c('help', 'h', 0, "logical", "print the help usage information",
'TKF', 'T', 1, "character", "91 or 92",
'fasta', 'f', 1, "character", "the input fasta file path",
'out', 'o', 1, "character", "the output file path",
'daymatrix', 'd', 2, "character", "the daymatrix path or the default Gonnet daymtrix will be used.",
'joint', 'j', 0, "logical", "estimate the parameters jointly or not",
'Mu', 'M', 2, "double", "the death rate",
'Indelr', 'r', 2, "double", "the indel length distribution rate",
'Length', 'l', 2, "double", "the expected length or the average length of the pair sequence will be used."
), ncol=5, byrow=TRUE)
Usage = getopt(spec, usage=TRUE)
opt = getopt(spec)
#help was asked for.
if (!is.null(opt$help) || length(opt) == 1){
cat(Usage)
q("no")
}
# initialize some constant values
if(is.null(opt$fasta)){
stop("Please provide the input fasta sequence path")
}else{
FastaFn = opt$fasta
}
if(is.null(opt$out)){
stop("Please provide the output file path")
}else{
OutFn = opt$out
}
if(is.null(opt$daymatrix)){
PAM1fn = file.path(TKF_DIR, "daymatrix.txt")
}else{
PAM1fn = opt$daymatrix
}
if(is.null(opt$joint)){
JointEstimation = FALSE
if((opt$TKF == "91") && (is.null(opt$Mu))){
stop("When just estimate distance using TKF91, please provide fixed Mu:")
}else if((opt$TKF == "91") && (!is.null(opt$Mu))){
FixedMu = opt$Mu
}
if((opt$TKF == "92") && (is.null(opt$Mu) || is.null(opt$Indelr))){
stop("When just estimate distance using TKF92, please provide fixed Mu and r")
}else if((opt$TKF == "92") && (!is.null(opt$Mu)) && (!is.null(opt$Indelr))){
FixedMu = opt$Mu
Fixedr = opt$Indelr
}
}else{
JointEstimation = TRUE
}
## define some utilities functions
BuiltInLogPAM1 = function(fn){
# Read the PAM1 file generated by darwin, output the suitable logPAM1 in R
# in PAM1 file, the first rom is the AA frequencies for each amino acid in standard order
# The rest 20 rows are the mutation matrix at PAM1. M[i,j] means from i to j.
require(expm)
tablePAM1 = read.table(fn,header=FALSE)
tablePAM1 = as.matrix(tablePAM1)
AAFre = tablePAM1[1,]
AAOrder = c("A","R","N","D","C","Q","E","G","H","I","L","K","M","F","P","S","T","W","Y","V")
names(AAFre) = AAOrder
tableLogPAM1 = logm(tablePAM1[2:21,])
colnames(tableLogPAM1) = AAOrder
rownames(tableLogPAM1) = AAOrder
return(list("PI"=AAFre, "logPAM1"=tableLogPAM1))
}
ReadFasta = function(fn, seqType){
# Read the fasta file, return a list consisting of a array of labels and list of sequences
require(seqinr)
res = read.fasta(file = fn, seqtype=seqType, set.attributes=FALSE)
return(res)
}
AAToInt = function(A){
# Input is the vector of amino acids symbols, output is the corresponding number in AAOrder.
AAOrder = c("A","R","N","D","C","Q","E","G","H","I","L","K","M","F","P","S","T","W","Y","V")
Intlist = c()
for(i in 1:length(A)){
Intlist = append(Intlist, which(AAOrder == A[i]))
}
return(Intlist)
}
## define the TKF92 likelihood functions
Likelihood_TKF92 = function(distance, Mu, r, sequenceA, sequenceB){
if(Mu >=1 || Mu <= 0 || distance < 0 || r < 0 || r >=1) {
return(1e10)
}
require(expm)
SA = length(sequenceA)
SB = length(sequenceB)
if(is.null(opt$Length)){
AvgLen = mean(SA, SB)
}else{
AvgLen = opt$Length
}
sequenceA = AAToInt(sequenceA)
sequenceB = AAToInt(sequenceB)
# PAM matrix at distance
PAMk = expm(distance * logPAM1)
PAMk = as.vector(PAMk)
dyn.load(file.path(TKF_DIR, "dynprog_TKF92.so"))
res = .C("dynprog", as.double(Mu), as.double(distance), as.double(r), as.integer(SA), as.integer(SB), as.double(AvgLen), as.integer(sequenceA), as.integer(sequenceB), as.double(PAMk), as.double(PI), result = as.double(0))
return(res[["result"]])
}
Likelihood3D_TKF92 = function(x,sequenceA, sequenceB){
# the order of x is distance, Mu, r
Likelihood_TKF92(x[1], x[2], x[3],sequenceA, sequenceB)
}
Likelihood1D_TKF92 = function(x, FixedMu, Fixedr, sequenceA, sequenceB){
Likelihood_TKF92(x, FixedMu, Fixedr, sequenceA, sequenceB)
}
## define the TKF91 likelihood functions
Likelihood_TKF91 = function(distance, Mu, sequenceA, sequenceB){
if(Mu >= 1 || Mu <= 0 || distance < 0){
return(1e10)
}
require(expm)
SA = length(sequenceA)
SB = length(sequenceB)
if(is.null(opt$Length)){
AvgLen = mean(SA, SB)
}else{
AvgLen = opt$Length
}
sequenceA = AAToInt(sequenceA)
sequenceB = AAToInt(sequenceB)
# PAM matrix at distance
PAMk = expm(distance * logPAM1)
PAMk = as.vector(PAMk)
dyn.load(file.path(TKF_DIR, "dynprog_TKF91.so"))
res = .C("dynprog", as.double(Mu), as.double(distance), as.integer(SA), as.integer(SB), as.double(AvgLen), as.integer(sequenceA), as.integer(sequenceB), as.double(PAMk), as.double(PI), result = as.double(0))
return(res[["result"]])
}
Likelihood2D_TKF91 = function(x,sequenceA, sequenceB){
# the order of x is distance, Mu
Likelihood_TKF91(x[1], x[2],sequenceA, sequenceB)
}
Likelihood1D_TKF91 = function(x, FixedMu, sequenceA, sequenceB){
Likelihood_TKF91(x, FixedMu, sequenceA, sequenceB)
}
##### The main program
sequences = ReadFasta(FastaFn,"AA")
BuiltInPAM = BuiltInLogPAM1(PAM1fn)
logPAM1 = BuiltInPAM$logPAM1
PI = BuiltInPAM$PI
distanceMatrix = matrix(0,nrow=length(sequences), ncol=length(sequences))
varianceMatrix = distanceMatrix
likelihoodMatrix = distanceMatrix
if(JointEstimation){
MuMatrix = varianceMatrix
if(opt$TKF == "91"){
distanceMatrix2D_1 = distanceMatrix
varianceMatrix2D_1 = distanceMatrix
likelihoodMatrix2D_1 = distanceMatrix
}else if(opt$TKF == "92"){
rMatrix = varianceMatrix
distanceMatrix3D_1 = distanceMatrix
varianceMatrix3D_1 = distanceMatrix
likelihoodMatrix3D_1 = distanceMatrix
}else{
stop("This should not happen. Option TKF should be 91 or 92")
}
}
#library(numDeriv)
library(MASS)
time1 = proc.time()
for(i in 1:(length(sequences)-1)){
for(j in (i+1):length(sequences)){
if(JointEstimation){
sequenceA = sequences[[i]]
sequenceB = sequences[[j]]
if(opt$TKF == "91"){
# initial values of the optimization for distance, Mu
x0 = c(100,0.0033)
res = optim(x0, Likelihood2D_TKF91, gr = NULL, sequenceA, sequenceB, method="Nelder-Mead", control=list(maxit=1000), hessian=TRUE)
res2 = optim(100, Likelihood1D_TKF91, gr = NULL, res$par[2], sequenceA, sequenceB, method="Brent", lower=0.5, upper=1000, hessian=TRUE)
distanceMatrix[i,j] = distanceMatrix[j,i] = res$par[1]
varianceMatrix[i,j] = varianceMatrix[j,i] = ginv(res$hessian)[1,1]
likelihoodMatrix[i,j] = likelihoodMatrix[j,i] = res$value
MuMatrix[i,j] = MuMatrix[j,i] = res$par[2]
distanceMatrix2D_1[i,j] = distanceMatrix2D_1[j,i] = res2$par[1]
varianceMatrix2D_1[i,j] = varianceMatrix2D_1[j,i] = ginv(res2$hessian)[1,1]
likelihoodMatrix2D_1[i,j] = likelihoodMatrix2D_1[j,i] = res2$value
}else if(opt$TKF == "92"){
# initial valus of the optimization for distance, Mu, r
x0 = c(100, 0.0033, 0.5)
res = optim(x0, Likelihood3D_TKF92, gr = NULL, sequenceA, sequenceB, method="Nelder-Mead", control=list(maxit=1000), hessian=TRUE)
res2 = optim(100, Likelihood1D_TKF92, gr = NULL, res$par[2], res$par[3], sequenceA, sequenceB, method="Brent", lower=0.5, upper=1000, hessian=TRUE)
distanceMatrix[i,j] = distanceMatrix[j,i] = res$par[1]
varianceMatrix[i,j] = varianceMatrix[j,i] = ginv(res$hessian)[1,1]
likelihoodMatrix[i,j] = likelihoodMatrix[j,i] = res$value
MuMatrix[i,j] = MuMatrix[j,i] = res$par[2]
rMatrix[i,j] = rMatrix[j,i] = res$par[3]
distanceMatrix3D_1[i,j] = distanceMatrix3D_1[j,i] = res2$par[1]
varianceMatrix3D_1[i,j] = varianceMatrix3D_1[j,i] = ginv(res2$hessian)[1,1]
likelihoodMatrix3D_1[i,j] = likelihoodMatrix3D_1[j,i] = res2$value
}
}else{
sequenceA = sequences[[i]]
sequenceB = sequences[[j]]
if(opt$TKF == "91"){
x0 = 100
res = optim(x0, Likelihood1D_TKF91, gr = NULL, FixedMu, sequenceA, sequenceB, method="Brent", lower=0.5, upper=1000, hessian=TRUE)
distanceMatrix[i,j] = distanceMatrix[j,i] = res$par[1]
varianceMatrix[i,j] = varianceMatrix[j,i] = ginv(res$hessian)[1,1]
likelihoodMatrix[i,j] = likelihoodMatrix[j,i] = res$value
}else if(opt$TKF == "92"){
x0 = 100
res = optim(x0, Likelihood1D_TKF92, gr = NULL, FixedMu, Fixedr, sequenceA, sequenceB, method="Brent", lower=0.5, upper=1000, hessian=TRUE)
distanceMatrix[i,j] = distanceMatrix[j,i] = res$par[1]
varianceMatrix[i,j] = varianceMatrix[j,i] = ginv(res$hessian)[1,1]
likelihoodMatrix[i,j] = likelihoodMatrix[j,i] = res$value
}
}
}
}
## Output the result file
sink(OutFn)
options(digits=15)
if(JointEstimation){
if(opt$TKF == "91"){
cat("distance", "variance", "likelihood", "Mu", "distance2D_1", "variance2D_1", "likelihood2D_1", sep="\t")
cat("\n")
for(i in 1:(length(sequences)-1)){
for(j in (i+1):length(sequences)){
cat(distanceMatrix[i,j], varianceMatrix[i,j], likelihoodMatrix[i,j],MuMatrix[i,j], distanceMatrix2D_1[i,j], varianceMatrix2D_1[i,j], likelihoodMatrix2D_1[i,j], sep="\t")
cat("\n")
}
}
}else if(opt$TKF == "92"){
cat("distance", "variance", "likelihood", "Mu", "r", "distance3D_1", "variance3D_1", "likelihood3D_1", sep="\t")
cat("\n")
for(i in 1:(length(sequences)-1)){
for(j in (i+1):length(sequences)){
cat(distanceMatrix[i,j], varianceMatrix[i,j], likelihoodMatrix[i,j],MuMatrix[i,j], rMatrix[i,j], distanceMatrix3D_1[i,j], varianceMatrix3D_1[i,j], likelihoodMatrix3D_1[i,j], sep="\t")
cat("\n")
}
}
}
}else{
if(opt$TKF == "91"){
cat("distance", "variance", "likelihood", sep="\t")
cat("\n")
for(i in 1:(length(sequences)-1)){
for(j in (i+1):length(sequences)){
cat(distanceMatrix[i,j], varianceMatrix[i,j], likelihoodMatrix[i,j], sep="\t")
cat("\n")
}
}
}else if(opt$TKF == "92"){
cat("distance", "variance", "likelihood", sep="\t")
cat("\n")
for(i in 1:(length(sequences)-1)){
for(j in (i+1):length(sequences)){
cat(distanceMatrix[i,j], varianceMatrix[i,j], likelihoodMatrix[i,j], sep="\t")
cat("\n")
}
}
}
}
cat("Time:\n")
cat(proc.time()- time1)
sink()
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.