R/dualbrothers.R

"dualbrothers"<- function(seed,alignment,format="interleaved",window.size=NULL,basename=alignment,length=2100000,burnin=100000,subsample=200,par_lambda=NULL,top_lambda=NULL,window_length=10,C=0.3,sigma_alpha=0.75,sigma_mu=0.75,subst_hyper_mean=-1,subst_hyper_variance=-1,diver_hyper_mean=-1,diver_hyper_variance=-1,top_breaks=1,par_breaks=1,step.size=10,boot=0,inputtrees=NULL){
  
 if(is.null(par_lambda)){
  print("'par_lambda' is missing (par_lambda is the hyperprior parameter representing the prior mean number of substitution process change-points)")
  return()
 }
 if(is.null(top_lambda)){
  print("'top_lambda' is missing (top_lambda is the hyperprior parameter representing the prior mean number of topology break-points)")
  return()
 }
   options(scipen = 100)

   write(paste("length:",length), file = paste(basename,".cmdfile",sep=""))
   cat(paste("burnin:",burnin,"\n"), file = paste(basename,".cmdfile",sep=""),append=TRUE)
   cat(paste("subsample:",subsample,"\n"), file = paste(basename,".cmdfile",sep=""),append=TRUE)
   cat(paste("par_lambda:",par_lambda,"\n"), file = paste(basename,".cmdfile",sep=""),append=TRUE)
   cat(paste("top_lambda:",top_lambda,"\n"), file = paste(basename,".cmdfile",sep=""),append=TRUE)
   cat(paste("window_length:",window_length,"\n"), file = paste(basename,".cmdfile",sep=""),append=TRUE)
   cat(paste("C:",C,"\n"), file = paste(basename,".cmdfile",sep=""),append=TRUE)
   cat(paste("sigma_alpha:",sigma_alpha,"\n"), file = paste(basename,".cmdfile",sep=""),append=TRUE)
   cat(paste("sigma_mu:",sigma_mu,"\n"), file = paste(basename,".cmdfile",sep=""),append=TRUE) 
   cat(paste("top_breaks:",top_breaks,"\n"), file = paste(basename,".cmdfile",sep=""),append=TRUE)
   cat(paste("par_breaks:",par_breaks,"\n"), file = paste(basename,".cmdfile",sep=""),append=TRUE)
   if(subst_hyper_mean!=-1&subst_hyper_variance!=-1&diver_hyper_mean!=-1&diver_hyper_variance!=-1){
    cat(paste("subst_hyper_mean:",subst_hyper_mean,"\n"), file = paste(basename,".cmdfile",sep=""),append=TRUE)
    cat(paste("subst_hyper_variance:",subst_hyper_variance,"\n"), file = paste(basename,".cmdfile",sep=""),append=TRUE) 
    cat(paste("diver_hyper_mean:",diver_hyper_mean,"\n"), file = paste(basename,".cmdfile",sep=""),append=TRUE)
    cat(paste("diver_hyper_variance:",diver_hyper_variance,"\n"), file = paste(basename,".cmdfile",sep=""),append=TRUE)
   }
   if(subst_hyper_mean!=-1|subst_hyper_variance!=-1|diver_hyper_mean!=-1|diver_hyper_variance!=-1){
    if(subst_hyper_mean==-1|subst_hyper_variance==-1|diver_hyper_mean==-1|diver_hyper_variance==-1){
     print("All four hyper variables must be provided. At least one is missing so they will all be estimated.")
    }
   }


write.dna<-function (x, file, format = "interleaved", append = FALSE, nbcol = 6,colsep = " ", colw = 10, indent = NULL, blocksep = 1) 
{
    format <- match.arg(format, c("interleaved", "sequential", 
        "fasta"))
    phylip <- if (format %in% c("interleaved", "sequential")) 
        TRUE
    else FALSE
    if (inherits(x, "DNAbin")) 
        x <- as.character(x)
    aligned <- TRUE
    if (is.matrix(x)) {
        N <- dim(x)
        S <- N[2]
        N <- N[1]
        xx <- vector("list", N)
        for (i in 1:N) xx[[i]] <- x[i, ]
        names(xx) <- rownames(x)
        x <- xx
        rm(xx)
    }
    else {
        N <- length(x)
        S <- unique(unlist(lapply(x, length)))
        if (length(S) > 1) 
            aligned <- FALSE
    }
    if (is.null(names(x))) 
        names(x) <- as.character(1:N)
    if (is.null(indent)) 
        indent <- if (phylip) 
            10
        else 0
    if (is.numeric(indent)) 
        indent <- paste(rep(" ", indent), collapse = "")
    if (format == "interleaved") {
        blocksep <- paste(rep("\n", blocksep), collapse = "")
        if (nbcol < 0) 
            format <- "sequential"
    }
    zz <- if (append) 
        file(file, "a")
    else file(file, "w")
    on.exit(close(zz))
    if (phylip) {
        if (!aligned) 
            stop("sequences must have the same length for\n interleaved or sequential format.")
        cat(N, " ", S, "\n", sep = "", file = zz)
        if (nbcol < 0) {
            nb.block <- 1
            nbcol <- totalcol <- ceiling(S/colw)
        }
        else {
            nb.block <- ceiling(S/(colw * nbcol))
            totalcol <- ceiling(S/colw)
        }
        SEQ <- matrix("", N, totalcol)
        for (i in 1:N) {
            X <- paste(toupper(x[[i]]), collapse = "")
            for (j in 1:totalcol) SEQ[i, j] <- substr(X, 1 + 
                (j - 1) * colw, colw + (j - 1) * colw)
        }
        max.nc <- max(nchar(names(x)))
        if (max.nc < 11) 
            max.nc <- 9
        fmt <- paste("%-", max.nc + 1, "s", sep = "")
        names(x) <- sprintf(fmt, names(x))
    }
    if (format == "interleaved") {
        colsel <- if (nb.block == 1) 
            1:totalcol
        else 1:nbcol
        for (i in 1:N) {
            cat(paste(names(x)[i]," ",sep=""), file = zz)
            cat(SEQ[i, colsel], sep = colsep, file = zz)
            cat("\n", file = zz)
        }
        if (nb.block > 1) {
            for (k in 2:nb.block) {
                cat(blocksep, file = zz)
                endcolsel <- if (k == nb.block) 
                  totalcol
                else nbcol + (k - 1) * nbcol
                for (i in 1:N) {
                  cat(indent, file = zz)
                  cat(SEQ[i, (1 + (k - 1) * nbcol):endcolsel], 
                    sep = colsep, file = zz)
                  cat("\n", file = zz)
                }
            }
        }
    }
    if (format == "sequential") {
        if (nb.block == 1) {
            for (i in 1:N) {
                cat(names(x)[i], file = zz)
                cat(SEQ[i, ], sep = colsep, file = zz)
                cat("\n", file = zz)
            }
        }
        else {
            for (i in 1:N) {
                cat(names(x)[i], file = zz)
                cat(SEQ[i, 1:nbcol], sep = colsep, file = zz)
                cat("\n", file = zz)
                for (k in 2:nb.block) {
                  endcolsel <- if (k == nb.block) 
                    totalcol
                  else nbcol + (k - 1) * nbcol
                  cat(indent, file = zz)
                  cat(SEQ[i, (1 + (k - 1) * nbcol):endcolsel], 
                    sep = colsep, file = zz)
                  cat("\n", file = zz)
                }
            }
        }
    }
    if (format == "fasta") {
        for (i in 1:N) {
            cat(">", names(x)[i], file = zz)
            cat("\n", file = zz)
            X <- paste(x[[i]], collapse = "")
            S <- length(x[[i]])
            totalcol <- ceiling(S/colw)
            if (nbcol < 0) 
                nbcol <- totalcol
            nb.lines <- ceiling(totalcol/nbcol)
            SEQ <- character(totalcol)
            for (j in 1:totalcol) SEQ[j] <- substr(X, 1 + (j - 
                1) * colw, colw + (j - 1) * colw)
            for (k in 1:nb.lines) {
                endsel <- if (k == nb.lines) 
                  length(SEQ)
                else nbcol + (k - 1) * nbcol
                cat(indent, file = zz)
                cat(SEQ[(1 + (k - 1) * nbcol):endsel], sep = colsep, 
                  file = zz)
                cat("\n", file = zz)
            }
        }
    }
}




if(format=="fasta"){
small.align = ape::read.dna(paste(alignment,".fasta",sep=""), format = "fasta")
write.dna(small.align,file=paste(basename,".phy",sep=""))
}
if(format=="interleaved"){
small.align = ape::read.dna(paste(alignment,".phy",sep=""), format = "interleaved")
write.dna(small.align,file=paste(basename,".phy",sep=""))
}

align.length = dim(small.align)[2]
taxa.num = dim(small.align)[1]
taxa.names = rownames(small.align)
rownames(small.align) = c(1:taxa.num)-1
sw.trees = list(1)
my.counter = 1

if(is.null(inputtrees)) {
if(taxa.num>6){

 if(is.null(window.size)){
  print("'window.size' is missing")
  return()
 }

for(ws in 1:length(window.size)){
 window.start = 1
 window.end = window.start + window.size[ws] -1
 while (window.end < align.length){
  temp.tree = ape::bionj(ape::dist.dna(small.align[,window.start:window.end]))
  temp.tree$edge.length = NULL
  sw.trees[[my.counter]] = temp.tree
  window.start = window.start + step.size
  window.end = window.end + step.size
  my.counter = my.counter + 1
 }
}

#### bootstrap the dna
if(boot>0){
 for(k in 1:boot){
  for(ws in 1:length(window.size)){
   window.start = 1
   window.end = window.start + window.size[ws] -1
   while (window.end < align.length){
    temp.tree = ape::bionj(ape::dist.dna(small.align[,sample(window.start:window.end,replace=TRUE)]))
    temp.tree$edge.length = NULL
    sw.trees[[my.counter]] = temp.tree
    window.start = window.start + step.size
    window.end = window.end + step.size
    my.counter = my.counter + 1
   }
  }
 }
}

cleaned.sw.trees = unique(sw.trees)
}

if(taxa.num==6){cleaned.sw.trees = ape::read.tree(paste(system.file(package="rbrothers"),"/extdata/smalltrees/six-input.tre",sep=""))}
if(taxa.num==5){cleaned.sw.trees = ape::read.tree(paste(system.file(package="rbrothers"),"/extdata/smalltrees/five-input.tre",sep=""))}
if(taxa.num==4){cleaned.sw.trees = ape::read.tree(paste(system.file(package="rbrothers"),"/extdata/smalltrees/four-input.tre",sep=""))}
if(taxa.num==3){cleaned.sw.trees = ape::read.tree(paste(system.file(package="rbrothers"),"/extdata/smalltrees/three-input.tre",sep=""))}
}

if(!is.null(inputtrees)) cleaned.sw.trees<-unique(ape::read.tree(inputtrees))

tree.num = length(cleaned.sw.trees)


my.tree = ape::root(cleaned.sw.trees[[1]], outgroup="0", resolve.root=TRUE)
ape::write.tree(my.tree, file=paste(basename,"-input.tre",sep=""), append=FALSE)
for (i in 2:tree.num){
  my.tree = ape::root(cleaned.sw.trees[[i]], outgroup="0", resolve.root=TRUE)
  ape::write.tree(my.tree, file=paste(basename,"-input.tre",sep=""), append=TRUE)
}

temp.tree = ape::read.tree(paste(basename,"-input.tre",sep=""))
unique.trees = unique(temp.tree)
ape::write.tree(unique.trees, file=paste(basename,"-input.tre",sep=""))

if(length(unique.trees)<3) {
 print("Tree list must have at least 3 trees. Try the bootstrap argument, see help(dualbrothers).")
 return()
}

   cat(paste("tree_file: ",basename,"-input.tre",sep=""), file = paste(basename,".cmdfile",sep=""),append=TRUE)
   dbs <- rJava::.jnew("DualBrothersForR")
   out <- rJava::.jcall(dbs, "V", "main", c(seed,paste(basename,".cmdfile",sep=""),paste(basename,".phy",sep=""),paste(basename,".post",sep=""))) 
   #system(paste("java DualBrothers ",seed," ",basename,".cmdfile ",basename, ".phy ",basename,".post",sep=""))
   topologyprofile(basename)
   epprofile(basename)

   
probs<-read.table(paste(basename,".topoprob",sep=""))
profile<-read.table(paste(basename,".profile",sep=""))

#### creates a vector of all the tip names culled from the .phy file
P<-as.integer(scan(paste(basename,".phy",sep=""),what='raw',nlines=1)[1])
names<-NULL
for(i in 1:P){
names[i]<-(scan(paste(basename,".phy",sep=""),what='raw',nlines=1,skip=i)[1])
}
breaks<-findbreakpoints(basename)

#### read in the trees
treenum<-dim(probs)[2]-1
trees<-ape::read.tree(paste(basename,".tree",sep=""),tree.names=seq(1:treenum),keep.multi=TRUE)

#### relabel the tips with the names from the .phy file
for(i in 1:treenum){
 for(j in 1:P){
  trees[[i]]$tip.label[j]<-names[as.integer(trees[[i]]$tip.label[j])+1]
 }
}

####
#### read the .post file to find the number of topology breakpoints and log likelihoods
####

lineiloglik<-function(linei){return(sum(as.numeric(linei[5+8*(0:(as.numeric(linei[2])-1))])))}

lineinumbrkpnts<-function(linei){
 numchgpnts<-as.numeric(linei[2])
 ts1<-linei[4]
 returnme<-0
 if(numchgpnts>1) for(i in 2:numchgpnts) if(!linei[4+8*(i-2)]==linei[4+8*(i-1)]){returnme<-returnme+1}
 return(returnme)
}

loglikes<-NULL
numbrkpnts<-NULL
ll <- readLines(paste(basename,".post",sep=""))

for(i in 1:length(ll)){
 ii = unlist(strsplit(ll[[i]]," "))
 ii = ii[which(sapply(ii,nchar,USE.NAMES=FALSE)!=0)]
 loglikes[i]<-lineiloglik(ii)
 numbrkpnts[i]<-lineinumbrkpnts(ii)
}

####
####
####

#### read the command file
cmdf<-read.table(paste(basename,".cmdfile",sep=""))
####


tbr<-list(basename,probs,profile,P,breaks,trees,cmdf,loglikes,numbrkpnts)
class(tbr)<-"db"
names(tbr)<-c("basename","TopologyProfile","EPProfile","numberofsequences","breakpoints","trees","command_file","MCMC_log_likelihood","MCMC_number_of_break_points")
return(tbr)

}
vnminin/rbrothers documentation built on May 3, 2019, 6:38 p.m.