R/TreeFunctions.R

Defines functions getBootstraps makeTrees consensus_tree matching_function_parallel splits_func getSubTaxa bootstrapTrees findSwitches downsampleClone getSeq getNodeSeq collapseNodes scaleBranches getTrees rerootTree buildRAxML buildIgphyml buildPML buildPratchet buildPhylo writeLineageFile readLineages reconIgPhyML bootstrapClones readSwitches makeModelFile readModelFile readFasta writeFasta

Documented in bootstrapTrees buildIgphyml buildPhylo buildPML buildPratchet buildRAxML collapseNodes downsampleClone findSwitches getBootstraps getNodeSeq getSeq getSubTaxa getTrees makeModelFile readFasta readLineages readModelFile reconIgPhyML rerootTree scaleBranches writeLineageFile

# Functions for performing discrete trait analysis on B cell lineage trees

# Write a clone's sequence alignment to a fasta file
# 
# \code{writeFasta} write clone sequences as a fasta file
# @param    c            airrClone object
# @param    fastafile    file to be exported
# @param    germid       sequence id of germline
# @param    trait        trait to include in sequence ids
# @param    empty        don't include real sequence information
#
# @return   Name of exported fasta file.
writeFasta <- function(c, fastafile, germid, trait=NULL, empty=FALSE){
  text <- ""
  if(!is.null(trait)){
    c@data$sequence_id <- paste(c@data$sequence_id,c@data[,trait],sep="_")
  }
  for(i in 1:nrow(c@data)){
    text <- paste0(text,">",c@data[i,]$sequence_id,"\n")
    if(!empty){
      if(c@phylo_seq == "sequence"){
        text <- paste0(text,c@data[i,]$sequence,"\n")
      }else if(c@phylo_seq == "lsequence"){
        text <- paste0(text,c@data[i,]$lsequence,"\n")
      }else if(c@phylo_seq == "hlsequence"){
        text <- paste0(text,c@data[i,]$hlsequence,"\n")
      }else{
        stop(paste("phylo_seq not recognized",c@clone))
      }
    }else{
      text <- paste0(text,"ATG\n")
    }
  }
  text <- paste0(text,">",germid,"\n")
  if(!empty){
    if(c@phylo_seq == "sequence"){
      text <- paste0(text,c@germline,"\n")
    }else if(c@phylo_seq == "lsequence"){
      text <- paste0(text,c@lgermline,"\n")
    }else if(c@phylo_seq == "hlsequence"){
      text <- paste0(text,c@hlgermline,"\n")
    }else{
      stop(paste("phylo_seq not recognized",c@clone))
    }
  }else{
    text <- paste0(text,"ATG\n")
  }
  write(text,file=fastafile,append=FALSE)
  return(fastafile)
}


#' Read a fasta file into a list of sequences
#' \code{readFasta} reads a fasta file
#' @param    file      FASTA file
#'
#' @return   List of sequences
#' @export
readFasta <- function(file){
  f <- readLines(file)
  seqs <- list()
  id <- NA
  for(line in f){
    if(grepl("^>",line)){
      id <- gsub(">","",line)
      seqs[[id]] <- ""
    }else{
      if(is.na(id)){
        stop(paste("Error reading",file))
      }
      seqs[[id]] <- paste0(seqs[[id]],line)
    }
  }
  seqs
}

#' Read in a parsimony model file
#' 
#' \code{readModelFile} Filler
#' @param    file         parimony model file.
#' @param    useambig     use ambiguous naming as specified in the file?
#'
#' @return   A named vector containing the states of the model
#'
#' @seealso \link{makeModelFile}, \link{findSwitches}, \link{getTrees}
#'
#' @export
readModelFile <- function(file, useambig=FALSE){
  #set up pallete
  mfile <- readLines(file)
  mstart <- which(mfile == "#STATES")
  mend <- which(mfile == "")
  mend <- min(mend[which(mfile == "") > mstart])
  states <- mfile[(mstart+1):(mend-1)]
  names(states) <- states
  
  if(useambig){
    stop("Ambiguous states not currently supported, please useambig=FALSE")
    astart <- which(mfile == "#AMBIGUOUS")
    ambigs <- mfile[(astart+1):length(mfile)]
    asplit <- strsplit(ambigs, split=" ")
    ambig <- unlist(lapply(asplit, function(x)x[1]))
    names(ambig) <- unlist(lapply(asplit, function(x)x[2]))
    ambig <- ambig[ambig != "GERM"]
    nambig <- states[!states %in% names(ambig)]
    names(nambig) <- nambig
    ambig <- c(ambig, states)
  }else{
    ambig <- states
  }
  return(ambig)
}

#' Make a parsimony model file
#' 
#' \code{makeModelFile} Filler
#' @param    file          model file name to write.
#' @param    states        vector of states to include in model.
#' @param    constraints   constraints to add to model.
#' @param    exceptions    vector of comma-separated states that are 
#'                         exceptions to constraints
#'
#'
#' @return   Name of model file
#'
#' @details
#' Currently the only option for \code{constraints} is "irrev", which
#' forbids switches moving from left to right in the \code{states} vector.
#'  
#' @seealso \link{readModelFile}, \link{getTrees}, \link{findSwitches}
#'
#' @export
makeModelFile <- function(file, states, constraints=NULL, exceptions=NULL){
  write("#BEGIN", file=file)
  write(length(states), file=file, append=TRUE)
  write("", file=file, append=TRUE)
  write("#STATES", file=file, append=TRUE)
  for(a in states){
    write(paste(a), file=file, append=TRUE)
  }
  write("", file=file, append=TRUE)
  write("#CONSTRAINTS", file=file, append=TRUE)
  if(!is.null(constraints)){
    if(constraints=="irrev"){
      for(i in 1:(length(states)-1)){
        for(j in (i+1):length(states)){
          if(paste0(states[j],",",states[i]) %in% exceptions){
            print(paste("Excepting",paste0(states[j],",",states[i]),
                        "from constraints"))
            next
          }
          write(paste(states[j], states[i], "1000"), 
                file=file, append=TRUE)
        }
      }
    }else{
      for(constraint in constraints){
        cons <- strsplit(constraint,split=",")[[1]]
        write(paste(cons[1], cons[2], "1000"), 
              file=file, append=TRUE)
      }
    }
  }
  write("", file=file, append=TRUE)
  write("", file=file, append=TRUE)
  write("#AMBIGUOUS", file=file, append=TRUE)
  for(a in states){
    write(paste("GERM", a), file=file, append=TRUE)
  }
  return(file)
}

# Read in a switches file from IgPhyML
# 
# \code{readSwitches} Filler
# @param    file          IgPhyML output file.
#
# @return   A named vector containing the states of the model
readSwitches <- function(file){
  t <- read.table(file,sep="\t",stringsAsFactors=FALSE)
  names(t) <- c("REP","FROM","TO","SWITCHES")
  t
}

# Make bootstrap replicate of clonal alignment
# 
# \code{lones} Filler
# @param    clone     \code{airrClone} object
# @param    reps      Number of bootstrap replicates
# @param    partition If "locus" Bootstrap heavy/lights separately
#
# @return   A list of \code{airrClone} objects with 
# bootstrapped sequences
bootstrapClones  <- function(clone, reps=100, partition="locus", by_codon = TRUE){
  if(clone@phylo_seq == "hlsequence"){
    sarray <- strsplit(clone@data$hlsequence,split="")
    garray <- strsplit(clone@hlgermline,split="")[[1]]
    index <- 1:stats::median(nchar(clone@data$hlsequence))
  }else if(clone@phylo_seq == "sequence"){
    sarray <- strsplit(clone@data$sequence,split="")
    garray <- strsplit(clone@germline,split="")[[1]]
    index <- 1:stats::median(nchar(clone@data$sequence))
  }else if(clone@phylo_seq == "lsequence"){
    sarray <- strsplit(clone@data$lsequence,split="")
    garray <- strsplit(clone@lgermline,split="")[[1]]
    index <- 1:stats::median(nchar(clone@data$lsequence))
  }else{
    stop(paste("phyloseq option",clone@phylo_seq,"not recognized"))
  }
  bootstraps <- list()
  for(i in 1:reps){
    clone_copy <- clone
    if(by_codon == TRUE){
      if(partition == "locus"){
        sindex <- unlist(lapply(unique(clone@locus), function(x){
          locus = which(clone@locus == x)
          seq = seq(1,length(locus),by=3)
          codons=lapply(seq, function(x)c(locus[x],locus[x+1],locus[x+2]))
          codons = sample(codons, replace=TRUE)
          return(codons)}))
        
      }else{
        seq = seq(1,length(index),by=3)
        codons=lapply(seq, function(x)c(index[x],index[x+1],index[x+2]))
        codons = sample(codons, replace=TRUE)
        sindex <- unlist(codons)
      }
    }else{
      if(partition == "locus"){
        sindex <- unlist(lapply(unique(clone@locus), function(x){
          locus = which(clone@locus == x)
          seq = seq(1,length(locus),by=1)
          codons=lapply(seq, function(x)c(locus[x]))
          codons = sample(codons, replace=TRUE)
          return(codons)}))
      }else{
        sindex <- sample(index,length(index),replace=TRUE)
      }
    }
    if(clone@phylo_seq == "sequence"){

      clone_copy@data$sequence <- unlist(lapply(sarray,
                                                function(x)paste(x[sindex],collapse="")))
      clone_copy@germline <- paste(garray[sindex],collapse="")
      if(by_codon == TRUE){
        ## add check step where you translate it and search for \\*
        check <- grepl("\\*", alakazam::translateDNA(clone_copy@data$sequence))
        if(TRUE %in% check){
          trues <- c()
          for(i in 1:length(check)){
            if(check[i] == TRUE){
              trues <- append(trues, clone_copy@data$sequence_id[i])
            }
          }
          stop(paste0("Stop codon(s) have been found in sequence(s) ", trues, " of clone ", clone_copy@clone))
        }
      }
    }else if(clone@phylo_seq == "hlsequence"){
      clone_copy@data$hlsequence <- unlist(lapply(sarray,
                                                  function(x)paste(x[sindex],collapse="")))
      clone_copy@hlgermline <- paste(garray[sindex],collapse="")
      if(by_codon == TRUE){
        ## add check step where you translate it and search for \\*
        check <- grepl("\\*", alakazam::translateDNA(clone_copy@data$hlsequence))
        if(TRUE %in% check){
          trues <- c()
          for(i in 1:length(check)){
            if(check[i] == TRUE){
              trues <- append(trues, clone_copy@data$sequence_id[i])
            }
          }
          stop(paste0("Stop codon(s) have been found in sequence(s) ", trues, " of clone ", clone_copy@clone))
        }
      }
    }else if(clone@phylo_seq == "lsequence"){
      clone_copy@data$lsequence <- unlist(lapply(sarray,
                                                 function(x)paste(x[sindex],collapse="")))
      clone_copy@lgermline <- paste(garray[sindex],collapse="")
      if(by_codon == TRUE){
        ## add check step where you translate it and search for \\*
        check <- grepl("\\*", alakazam::translateDNA(clone_copy@data$lsequence))
        if(TRUE %in% check){
          trues <- c()
          for(i in 1:length(check)){
            if(check[i] == TRUE){
              trues <- append(trues, clone_copy@data$sequence_id[i])
            }
          }
          stop(paste0("Stop codon(s) have been found in sequence(s) ", trues, " of clone ", clone_copy@clone))
        }
      }
    }else{
      stop(paste("phylo_seq",clone@phylo_seq,"not recognized"))
    }
    bootstraps[[i]] <- clone_copy
  }
  bootstraps
}

#' Do IgPhyML maximum parsimony reconstruction
#' 
#' \code{reconIgPhyML} IgPhyML parsimony reconstruction function
#' @param    file          IgPhyML lineage file (see writeLineageFile)
#' @param    modelfile      File specifying parsimony model
#' @param    id            id for IgPhyML run
#' @param    igphyml       location of igphyml executable
#' @param    mode          return trees or count switches? (switches or trees)
#' @param    type          get observed switches or permuted switches?
#' @param    nproc         cores to use for parallelization
#' @param    quiet         amount of rubbish to print
#' @param    rm_files      remove temporary files?
#' @param    rm_dir        remove temporary directory?
#' @param    states        states in parsimony model
#' @param    palette       deprecated
#' @param    resolve       level of polytomy resolution. 0=none, 
#'                         1=maximum parsimony, 2=maximum ambiguity
#' @param    rseed         random number seed if desired
#' @param    force_resolve continue even if polytomy resolution fails?
#' @param    ...           additional arguments
#'
#' @return   Either a tibble of switch counts or a list
#'           of trees with internal nodes predicted by parsimony.
#' @export
reconIgPhyML <- function(file, modelfile, id, 
                         igphyml="igphyml", mode="switches", type="recon",
                         nproc=1, quiet=0, rm_files=FALSE, rm_dir=NULL, 
                         states=NULL, palette=NULL, resolve=2, rseed=NULL,
                         force_resolve=FALSE, ...){
  
  #args <- list(...)
  igphyml <- path.expand(igphyml)
  if(file.access(igphyml, mode=1) == -1) {
    stop("The file ", igphyml, " cannot be executed.")
  }
  if(!file.exists(file)) {
    stop("The repertoire file ", file, " cannot be found.")
  }
  if(!file.exists(modelfile)) {
    stop("The model file ", modelfile, " cannot be found.")
  }
  if(!mode %in% c("switches","trees")){
    stop(paste("mode must be either switches or trees"))
  }
  if(!type %in% c("recon","permute","permuteAll")){
    stop(paste("type must be either recon, permute, or permuteAll"))
  }
  
  if(quiet > 0){
    print(paste("Resolve:",resolve,"Force resolve?",force_resolve))
  }
  recon <- paste0(file,"_igphyml_parstats_",type,".txt")
  logfile <- paste0(file,".log")
  log <- paste(">>",logfile)
  permute <- ""
  force_resolve_option <- ""
  if(type == "permute"){
    permute <- "--permute"
  }
  if(type == "permuteAll"){
    permute <- "--permuteAll"
  }
  if(force_resolve){
    force_resolve_option <- "--force_resolve"
  }
  if(is.null(rseed)){
    rseed <- ""
  }else{
    rseed <- paste("--r_seed",rseed)
  }
  command <- paste("--repfile",file,
                   "--recon",modelfile,"--threads",nproc,"--polyresolve",resolve,
                   "-m HLP -o n --motifs WRC_2:0 --hotness 0 --run_id",type,permute,
                   force_resolve_option,rseed,log)
  params <- list(igphyml,command,stdout=TRUE,stderr=TRUE)
  if(quiet > 2){
    print(paste(params,collapse=" "))
  }
  status <- tryCatch(do.call(base::system2, params), error=function(e){
    print(paste("igphyml error, trying again: ",e));
    cat(paste(readLines(logfile),"\n"))
    return(e)
  }, warning=function(w){
    print(paste("igphyml warnings, trying again: ",w));
    cat(paste(readLines(logfile),"\n"))
    return(w)
  })
  if(length(status) != 0){
    status <- tryCatch(do.call(base::system2, params), error=function(e){
      print(paste("igphyml error, again! quitting: ",e));
      cat(paste(readLines(logfile),"\n"))
      stop()
    }, warning=function(w){
      print(paste("igphyml warnings, again! quitting: ",w));
      cat(paste(readLines(logfile),"\n"))
      stop()
    })
  }
  
  if(quiet > 2){
    cat(paste(readLines(logfile),"\n"))
  }
  if(mode == "switches"){
    recons <- readSwitches(recon)
    recons$CLONE <- id
    recons$TYPE <- toupper(type)
    results <- dplyr::as_tibble(recons)
  }else{
    if(is.null(states)){
      states <- readModelFile(modelfile)
    }
    if(!is.null(palette)){
      warning("palette option is deprecated in reconIgPhyML")
      palette <- NULL
    }
    results <- readLineages(file,states,palette,"recon",quiet)
    results <- lapply(results,function(x){
      x$pars_recon="igphyml";
      x})
    results <- lapply(results,function(x){
      x$tip.label;
      x})
  }
  if(rm_files){
    lines <- readLines(file)
    for(i in 2:length(lines)){
      temp <- strsplit(lines[i],split="\t")[[1]]
      unlink(paste0(temp[1],"*"))
      unlink(paste0(temp[2],"*"))
    }
    unlink(paste0(file,"*"))
  }
  if(!is.null(rm_dir)){
    if(quiet > 1){
      print(paste("rming dir",rm_dir))
    }
    unlink(rm_dir,recursive=TRUE)
  }
  return(results)
}

#' Read in all trees from a lineages file
#' 
#' @param    file    IgPhyML lineage file
#' @param    states  states in parsimony model
#' @param    palette deprecated
#' @param    run_id  id used for IgPhyML run
#' @param    quiet   avoid printing rubbish on screen?
#' @param    append  string appended to fasta files
#' @param    format  format of input file with trees
#' @param    type    Read in parsimony reconstructions or ancestral sequence
#'                   reconstructions? "jointpars" reads in parsimony states, 
#'                   others read in sequences in internal nodes
#'
#' @return   A list of phylo objects from \code{file}.
#' @export
readLineages <- function(file, states=NULL, palette=NULL,
                         run_id="", quiet=TRUE, append=NULL, format="nexus", 
                         type="jointpars"){
  if(!is.null(palette)){
    warning("palette is deprecated in readLineages")
    palette <- NULL
  }
  trees <- list()
  t <- readLines(file)
  if(length(t) == 0){
    return(list())
  }
  for(i in 2:length(t)){
    fasta <- strsplit(t[i],split="\t")[[1]][1]
    if(quiet > 0){print(fasta)}
    if(is.null(append)){
      if(run_id == ""){
        tf <- suppressWarnings(phylotate::read_annotated(
          paste0(fasta,"_igphyml_",type,".nex"),
          format=format))
      }else{
        tf <- suppressWarnings(phylotate::read_annotated(
          paste0(fasta,"_igphyml_",type,"_",run_id,".nex"),
          format=format))
      }
    }else{
      tf <- suppressWarnings(read_annotated(paste0(fasta,append),
                                            format=format))
    }
    if(!is.null(states)){
      tf <- condenseTrees(tf,states,palette)
    }
    germ <- tf$tip.label[grep("_GERM",tf$tip.label)]
    tf$name <- gsub("_GERM$","",germ)
    tf$tip.label[which(tf$tip.label == germ)] <- "Germline"
    nnodes <- length(unique(c(tf$edge[,1],tf$edge[,2])))
    tf$nodes <- rep(list(sequence=NULL),times=nnodes)
    if(type=="jointpars"){
      tf <- rerootTree(tf,"Germline",verbose=0)
    }else{
      tf$nodes <- lapply(1:length(tf$nodes),function(x){
        tf$nodes[[x]]$sequence <- tf$node.comment[x]
        tf$nodes[[x]]
      })
    }
    trees[[i-1]] <- tf
  }
  return(trees)
}


#' Write lineage file for IgPhyML use
#' 
#' @param    data      list of \code{airrClone} objects
#' @param    trees     list of \code{phylo} objects corresponding to \code{data}
#' @param    dir       directory to write file
#' @param    id        id used for IgPhyML run
#' @param    rep       bootstrap replicate
#' @param    trait     string appended to sequence id in fasta files
#' @param    partition how to partition omegas
#' @param    heavy     name of heavy chain locus
#' @param    empty     output uninformative sequences?
#' @return   Name of created lineage file.
#' @export
writeLineageFile <- function(data, trees=NULL, dir=".", id="N", rep=NULL, 
                             trait=NULL, empty=TRUE, partition="single", heavy="IGH"){
  
  file <- file.path(dir,paste0(id,"_lineages_pars.tsv"))
  if(!is.null(rep)){
    file <- file.path(dir,paste0(id,"_lineages_",rep,"_pars.tsv"))
  }
  outdir <- file.path(dir,paste0(id,"_recon_",rep))
  dir.create(dir,showWarnings=FALSE)
  dir.create(outdir,showWarnings=FALSE)
  
  dnames <- unlist(lapply(data,function(x)x@clone))
  tnames <- unlist(lapply(trees,function(x)x$name))
  if(sum(tnames != dnames) != 0){
    trees <- trees[order(match(tnames,dnames))]
  }
  write(length(data),file=file)
  for(i in 1:length(data)){
    tree <- trees[[i]]
    fastafile <- file.path(outdir,paste0(data[[i]]@clone,".fasta"))
    treefile <- file.path(outdir,paste0(data[[i]]@clone,".tree"))
    germid <- paste0(data[[i]]@clone,"_GERM")
    writeFasta(data[[i]],fastafile,germid,trait,empty=empty)
    if(data[[i]]@phylo_seq == "sequence"){
      g <- data[[i]]@germline
    }else if(data[[i]]@phylo_seq == "lsequence"){
      g <- data[[i]]@lgermline
    }else if(data[[i]]@phylo_seq == "hlsequence"){
      g <- data[[i]]@hlgermline
    }else{
      stop(paste("phylo_seq not recognized",c@clone))
    }
    
    if(!partition %in% c("single","hl")){
      acceptable <- c("fwr1","fwr2","fwr3","fwr4","cdr1","cdr2","cdr3")
      unacceptable <- unlist(lapply(data, function(x)sum(!x@region %in% acceptable) > 0))
      exclude_clones <- unlist(lapply(data[unacceptable], function(x)x@clone))
      if(length(exclude_clones) > 0){
        stop(paste("non-standard regions found in these clones,",
                   "either remove or set partition='single':",paste(exclude_clones, collapse=","),
                   "\nAllowable regions:",paste(acceptable,collapse=",")))
      }
    }
    
    if(partition == "cf"){ #make file specifying sequence regions
      nomega <- 2
      regions <- data[[i]]@region
      if(dplyr::n_distinct(regions) == 1){
        warning(paste("Only one region found in clone",data[[i]]@clone))
      }
      cdrs <- rep(0,length(regions))
      cdrs[regions == "fwr1"] <- 13
      cdrs[regions == "cdr1"] <- 30
      cdrs[regions == "fwr2"] <- 45
      cdrs[regions == "cdr2"] <- 60
      cdrs[regions == "fwr3"] <- 80
      cdrs[regions == "cdr3"] <- 108
      cdrs[regions == "fwr4"] <- 120
    }else if(partition == "hl"){
      nomega <- 2
      chains <- data[[i]]@locus
      if(dplyr::n_distinct(chains) == 1){
        warning(paste("Only one chain found in clone",data[[i]]@clone))
      }
      cdrs <- rep(0,length(chains))
      cdrs[chains == heavy] <- 13
      cdrs[chains != heavy] <- 30
    }else if(partition == "hlc"){
      nomega <- 3
      chains <- data[[i]]@locus
      regions <- data[[i]]@region
      if(dplyr::n_distinct(regions) == 1){
        warning(paste("Only one region found in clone",data[[i]]@clone))
      }
      if(dplyr::n_distinct(chains) == 1){
        warning(paste("Only one chain found in clone",data[[i]]@clone))
      }
      cdrs <- rep(0,length(chains))
      cdrs[grepl("fwr", regions)] <- 13
      cdrs[chains == heavy & grepl("cdr", regions)] <- 30  #heavy cdr
      cdrs[chains != heavy & grepl("cdr", regions)] <- 200 #light cdr
    }else if(partition == "hlf"){
      nomega <- 3
      chains <- data[[i]]@locus
      regions <- data[[i]]@region
      if(dplyr::n_distinct(regions) == 1){
        warning(paste("Only one region found in clone",data[[i]]@clone))
      }
      if(dplyr::n_distinct(chains) == 1){
        warning(paste("Only one chain found in clone",data[[i]]@clone))
      }
      cdrs <- rep(0,length(chains))
      cdrs[chains == heavy & grepl("fwr", regions)] <- 13 #heavy fwr
      cdrs[grepl("cdr", regions)] <- 30
      cdrs[chains != heavy & grepl("fwr", regions)] <- 200 #light fwr
    }else if(partition == "hlcf"){
      nomega <- 4
      chains <- data[[i]]@locus
      regions <- data[[i]]@region
      if(dplyr::n_distinct(regions) == 1){
        warning(paste("Only one region found in clone",data[[i]]@clone))
      }
      if(dplyr::n_distinct(chains) == 1){
        warning(paste("Only one chain found in clone",data[[i]]@clone))
      }
      cdrs <- rep(0,length(chains))
      cdrs[chains == heavy & grepl("fwr", regions)] <- 13
      cdrs[chains == heavy & grepl("cdr", regions)] <- 30
      cdrs[chains != heavy & grepl("fwr", regions)] <- 200 #light fwr
      cdrs[chains != heavy & grepl("cdr", regions)] <- 300 #light cdr
    }else if(partition != "single"){
      stop(paste("Partition",partition,"not recognized"))
    }
    
    if(partition != "single"){
      partfile <- file.path(outdir,paste0(data[[i]]@clone,".part.txt"))
      write(paste(nomega,nchar(g)/3), file=partfile)
      write("FWR:IMGT\nCDR:IMGT", file=partfile, append=TRUE)
      if(partition == "hlf" || partition == "hlcf"){
        write("FWRL:IMGT", file=partfile, append=TRUE)
      }
      if(partition == "hlc" || partition == "hlcf"){
        write("CDRL:IMGT", file=partfile, append=TRUE)
      }
      write(paste(data[[i]]@v_gene,data[[i]]@j_gene,sep="\n"), file=partfile, append=TRUE)
      write(paste(cdrs[1:length(cdrs) %% 3 == 0],collapse=","), file=partfile, append=TRUE)
    }else{
      partfile <- "N"
    }
    
    if(!is.null(trees)){
      if("node.label" %in% names(tree)){
        tree$node.label <- NULL
      }
      tree$tip.label[which(tree$tip.label == "Germline")] <- germid
      tree <- ape::multi2di(tree)
      ape::write.tree(tree,file=treefile)
      write(paste(fastafile,treefile,germid,partfile,sep="\t"), file=file,
            append=TRUE)
    }else{
      write(paste(fastafile,"N",germid,partfile,sep="\t"), file=file,
            append=TRUE)
    }
  }
  return(file)
}

#' Wrapper for alakazam::buildPhylipLineage
#' 
#' @param    clone      \code{airrClone} object
#' @param    exec       dnapars or dnaml executable
#' @param    temp_path  path to temporary directory
#' @param    verbose    amount of rubbish to print
#' @param    rm_temp    remove temporary files?
#' @param    seq        sequece column in \code{airrClone} object
#' @param    tree       fixed tree topology if desired (currently does nothing
#'                      if specified)
#' @param    onetree    Only sample one tree if multiple found.
#'
#' @return  \code{phylo} object created by dnapars or dnaml with nodes attribute
#'          containing reconstructed sequences.
#' @export
buildPhylo <- function(clone, exec, temp_path=NULL, verbose=0,
                       rm_temp=TRUE, seq="sequence", tree=NULL, onetree=TRUE){
  
  if(grepl("dnaml$",exec) | grepl("dnaml\\.exe$",exec)){
    method <- "dnaml"
  }else if(grepl("dnapars$",exec) | grepl("dnapars\\.exe$",exec)){
    method <- "dnapars"
  }else{
    stop("executable not recognized! Must end with dnapars or dnaml")
  }
  if(seq != "sequence"){
    clone@data$sequence <- clone@data[[seq]]
    if(seq == "hlsequence"){
      clone@germline <- clone@hlgermline
    }else if(seq == "lsequence"){
      clone@germline <- clone@lgermline
    }
  }
  if(nrow(clone@data) < 2){
    stop("Clone ",paste0(clone@clone," has only one sequence, skipping"))
  }
    if(is.null(tree)){
        tree <- tryCatch(
            alakazam::buildPhylipLineage(clone, exec, rm_temp=rm_temp,
                branch_length="distance", verbose=verbose>0,
                temp_path=temp_path,onetree=onetree),
            error=function(e)e)
        if(is.null(tree)){
            stop(paste0("buildPhylipLineage failed for clone ",clone@clone))
        }
        if(inherits(tree, "error")){
            stop(tree)
        }
    tree <- alakazam::graphToPhylo(tree)
    tree <- rerootTree(tree, germline="Germline",verbose=0)
    tree <- ape::ladderize(tree,right=FALSE)
    tree$name <- clone@clone
    tree$tree_method <- paste0("phylip::",method)
    tree$edge_type <- "genetic_distance"
    tree$seq <- seq
  }
  tree
}

#' Wrapper for phangorn::pratchet
#' 
#' @param    clone           \code{airrClone} object
#' @param    seq             sequece column in \code{airrClone} object
#' @param    asr             return sequence or probability matrix?
#' @param    asr_thresh      threshold for including a nucleotide as an alternative
#' @param    tree            fixed tree topology if desired.
#' @param    asr_type        MPR or ACCTRAN
#' @param    verbose         amount of rubbish to print
#' @param    resolve_random  randomly resolve polytomies?
#' @param    data_type       Are sequences DNA or AA?
#' @return  \code{phylo} object created by phangorn::pratchet with nodes
#'          attribute containing reconstructed sequences.
#' @export
buildPratchet <- function(clone, seq="sequence", asr="seq", asr_thresh=0.05, 
                          tree=NULL, asr_type="MPR", verbose=0, resolve_random=TRUE,
                          data_type="DNA"){
  seqs <- clone@data[[seq]]
  names <- clone@data$sequence_id
  if(verbose > 0){
    print(clone@clone)
  }
  if(length(seqs) < 2){
    stop(paste0(clone@clone," has only one sequence, skipping"))
  }
  if(seq == "hlsequence"){
    germline <- clone@hlgermline
  }else if(seq == "lsequence"){
    germline <- clone@lgermline
  }else{
    germline <- clone@germline
  }
  seqs <- base::append(seqs,germline)
  names <- c(names,"Germline")
  seqs <- strsplit(seqs,split="")
  names(seqs) <- names
  lengths = unlist(lapply(seqs,function(x)length(x)))
  if(any(lengths != lengths[1])){
    stop(paste0("Sequence and/or germline lengths of clone ",
                clone@clone," are not equal"))
  }
  if(data_type=="DNA"){
    data <- phangorn::phyDat(ape::as.DNAbin(t(as.matrix(dplyr::bind_rows(seqs)))))
  }else{
    data <- phangorn::phyDat(ape::as.AAbin(t(as.matrix(dplyr::bind_rows(seqs)))),
                             type="AA")
  }
  if(is.null(tree)){
    tree <- tryCatch(phangorn::pratchet(data,trace=FALSE),warning=function(w)w)
    tree <- phangorn::acctran(ape::multi2di(tree,random=resolve_random),data)
    tree <- ape::unroot(tree)
    tree$edge.length <- tree$edge.length/nchar(germline)
    tree$tree_method <- "phangorn::prachet"
    tree$edge_type <- "genetic_distance"
    nnodes <- length(unique(c(tree$edge[,1],tree$edge[,2])))
    tree$nodes <- rep(list(sequence=NULL),times=nnodes)
    tree$node.label <- NULL
  }else if(is.null(tree$nodes)){
    nnodes <- length(unique(c(tree$edge[,1],tree$edge[,2])))
    tree$nodes <- rep(list(sequence=NULL),times=nnodes)
  }
  tree$name <- clone@clone
  tree$seq <- seq
  if(asr != "none" && data_type=="DNA"){
    seqs_pars <- phangorn::ancestral.pars(tree, data, 
                                          type=asr_type, cost=NULL, return="prob")
    ASR <- list()
    for(i in 1:max(tree$edge)){
      patterns <- t(subset(seqs_pars, i)[[1]])
      pat <- patterns[,attr(seqs_pars,"index")]
      if(asr == "seq"){
        thresh <- pat > asr_thresh
        acgt <- c("A","C","G","T")
        seq_ar <- unlist(lapply(1:ncol(pat),function(x){
          site <- acgt[thresh[,x]]
          site <- alakazam::DNA_IUPAC[[paste(sort(site),collapse="")]]
          if(length(site) == 0){
            site <- "N"
          }
          site}))
        ASR[[as.character(i)]] <- paste(seq_ar,collapse="")
      }else{
        ASR[[as.character(i)]] <- pat
      }
    }
    tree$nodes <- lapply(1:length(tree$nodes),function(x){
      tree$nodes[[x]]$sequence <- ASR[[x]]
      tree$nodes[[x]]
    })
  }
  opars <- phangorn::parsimony(ape::di2multi(tree),data)
  tree <- rerootTree(tree,"Germline",verbose=0)
  npars <- phangorn::parsimony(ape::di2multi(tree),data)
  if(npars != opars){
    stop(paste("Error in rerooting tree",tree$name,
               "parsimony score not consistent:",opars,npars))
  }
  return(tree)
}


#' Wrapper for phangorn::optim.pml
#' 
#' @param    clone      \code{airrClone} object
#' @param    seq        sequece column in \code{airrClone} object
#' @param    sub_model  substitution model to use
#' @param    gamma      gamma site rate variation?
#' @param    asr        return sequence or probability matrix?
#' @param    asr_thresh threshold for including a nucleotide as an alternative
#' @param    tree       fixed tree topology if desired.
#' @param    data_type  Are sequences DNA or AA?
#' @param    verbose    Print error messages as they happen?
#' @param    optNni     Optimize tree topology
#' @param    optQ       Optimize Q matrix
#' @param    resolve_random  randomly resolve polytomies?
#' @param    quiet           amount of rubbish to print to console
#' @param    rep             current bootstrap replicate (experimental)
#'
#' @return  \code{phylo} object created by phangorn::optim.pml with nodes
#'          attribute containing reconstructed sequences.
#' @export
buildPML <- function(clone, seq="sequence", sub_model="GTR", gamma=FALSE, asr="seq", 
                     asr_thresh=0.05, tree=NULL, data_type="DNA", optNni=TRUE, optQ=TRUE, 
                     verbose=FALSE, resolve_random=TRUE, quiet=0, rep=NULL){
  seqs <- clone@data[[seq]]
  names <- clone@data$sequence_id
  if(length(seqs) < 2){
    stop(paste0(clone@clone," has only one sequence, skipping"))
  }
  if(seq == "hlsequence"){
    germline <- clone@hlgermline
  }else if(seq == "lsequence"){
    germline <- clone@lgermline
  }else{
    germline <- clone@germline
  }
  seqs <- base::append(seqs,germline)
  names <- c(names,"Germline")
  seqs <- strsplit(seqs,split="")
  names(seqs) <- names
  if(data_type=="DNA"){
    data <- phangorn::phyDat(ape::as.DNAbin(t(as.matrix(dplyr::bind_rows(seqs)))))
  }else{
    data <- phangorn::phyDat(ape::as.AAbin(t(as.matrix(dplyr::bind_rows( seqs)))),
                             type="AA")
    if(sub_model == "GTR"){
      warning("GTR model shouldn't be used for AA.")
    }
  }
  if(is.null(tree)){
    dm  <- phangorn::dist.ml(data)
    treeNJ  <- ape::multi2di(phangorn::NJ(dm), random=resolve_random)
    treeNJ$edge.length[treeNJ$edge.length < 0] <- 0 #change negative edge lengths to zero
    pml <- phangorn::pml(ape::unroot(treeNJ),data=data)
    fit <- tryCatch(phangorn::optim.pml(pml, model=sub_model, optNni=optNni, optQ=optQ,
                                        optGamma=gamma, rearrangement="NNI",control=phangorn::pml.control(epsilon=1e-08,
                                        maxit=10, trace=0)), error=function(e)e)
    if("error" %in% class(fit)){
      if(verbose){
        print(fit)
      }
      return(fit)
    }
    #tree <- ape::unroot(ape::multi2di(fit$tree)) #CGJ 4/5/23
    # this assumes we can change tree object without affecting ASR
    fit$tree <- ape::unroot(ape::multi2di(fit$tree))
    tree <- fit$tree
    # test if the tree is binary 
    if(!ape::is.binary(tree)){
      stop(paste("Tree may not be full resolved at", clone@clone))
    }
    tree$tree_method <- paste("phangorn::optim.pml::",sub_model)
    tree$edge_type <- "genetic_distance"
    nnodes <- length(unique(c(tree$edge[,1],tree$edge[,2])))
    tree$nodes <- rep(list(sequence=NULL),times=nnodes)
  }
  tree$name <- clone@clone
  tree$seq <- seq

  if(asr != "none" && data_type=="DNA"){
    seqs_ml <- phangorn::ancestral.pml(fit,
                                       type="marginal",return="prob")
    ASR <- list()
    for(i in 1:max(tree$edge)){
      patterns <- t(subset(seqs_ml, i)[[1]])
      pat <- patterns[,attr(seqs_ml,"index")]
      if(asr == "seq"){
        thresh <- pat > asr_thresh
        acgt <- c("A","C","G","T")
        seq_ar <- unlist(lapply(1:ncol(pat),function(x){
          site <- acgt[thresh[,x]]
          site <- alakazam::DNA_IUPAC[[paste(sort(site),collapse="")]]
          if(length(site) == 0){
            site <- "N"
          }
          site}))
        ASR[[as.character(i)]] <- paste(seq_ar,collapse="")
      }else{
        ASR[[as.character(i)]] <- pat
      }
    }
    tree$nodes <- lapply(1:length(tree$nodes),function(x){
      tree$nodes[[x]]$sequence <- ASR[[x]]
      tree$nodes[[x]]
    })
  }
  tree <- rerootTree(tree,"Germline",verbose=0)
  return(tree)
}




#' Wrapper to build IgPhyML trees and infer intermediate nodes
#' 
#' @param    clone      list of \code{airrClone} objects
#' @param    igphyml    igphyml executable
#' @param    trees      list of tree topologies if desired
#' @param    nproc      number of cores for parallelization
#' @param    temp_path  path to temporary directory
#' @param    id         IgPhyML run id
#' @param    rseed      random number seed if desired
#' @param    quiet      amount of rubbish to print
#' @param    rm_files   remove temporary files?
#' @param    rm_dir     remove temporary directory?
#' @param    partition  How to partition omegas along sequences (see details)
#' @param    omega      omega parameters to estimate (see IgPhyML docs)
#' @param    optimize   optimize HLP rates (r), lengths (l), topology (t)
#' @param    motifs     motifs to consider (see IgPhyML docs)
#' @param    hotness    hotness parameters to estimate (see IgPhyML docs)
#' @param    rates      comma delimited list showing which omega-defined partitions
#'                      get a separate rate (e.g. omega=e,e rates=0,1).
#' @param    asrc       Intermediate sequence cutoff probability
#' @param    splitfreqs Calculate codon frequencies on each partition separately?
#' @param    ...        Additional arguments (not currently used)
#'
#' @details Partition options in rate order:
#' \itemize{
#'   \item  \code{single}: 1 omega for whole sequence
#'   \item  \code{cf}: 2 omegas, 1 for all CDRs and 1 for all FWRs
#'   \item  \code{hl}: 2 omegas, 1 for heavy and 1 for light chain
#'   \item  \code{hlf}: 3 omegas, 1 for heavy FWR, 1 for all CDRs, and 1 for light FWRs
#'   \item  \code{hlc}: 3 omegas, 1 for all FWRs, 1 for heavy CDRs, and 1 for light CDRs
#'   \item  \code{hlcf}: 4 omegas, 1 for each heavy FWR, 1 for heavy CDR, 1 for light FWR, and 1 for light CDR
#' }
#'
#' @return  \code{phylo} object created by igphyml with nodes attribute
#'          containing reconstructed sequences.
#' @export
buildIgphyml <- function(clone, igphyml, trees=NULL, nproc=1, temp_path=NULL, 
                         id=NULL, rseed=NULL, quiet=0, rm_files=TRUE, rm_dir=NULL, 
                         partition=c("single", "cf", "hl", "hlf", "hlc", "hlcf"),
                         omega=NULL, optimize="lr", motifs="FCH", hotness="e,e,e,e,e,e", 
                         rates=NULL, asrc=0.95, splitfreqs=FALSE, ...){
  
  warning("Dowser igphyml doesn't mask split codons!")
  partition <- match.arg(partition)
  
  valid_o <- c("r","lr","tlr")
  if(!optimize %in% valid_o){
    stop(paste("Invalid optimize specification, must be one of:",valid_o))
  }
  na_regions <- unlist(lapply(clone, function(x)sum(is.na(x@region)) > 0))
  if(sum(na_regions) > 0){
    exclude_clones <- unlist(lapply(clone[na_regions], function(x)x@clone))
    stop(paste("NA regions found in clones",paste(exclude_clones, collapse=","), 
               "remove before continuing"))
  }
  
  if(!is.null(omega)){
    os <- strsplit(omega,split=",")[[1]]
  } else {
    os <- NULL
  }
  file <- writeLineageFile(clone,trees,dir=temp_path,id=id,rep=id,empty=FALSE,
                           partition=partition, ...)
  if(length(os) != 2 && (partition == "cf" | partition == "hl")){
    warning("Omega parameter incompatible with partition, setting to e,e")
    omega = "e,e"
    os <- strsplit(omega,split=",")[[1]]
    if(partition == "hl" && is.null(rates)){
      rates = "0,1"
    }
  }
  if(length(os) != 3 && (partition == "hlc" | partition == "hlf")){
    warning("Omega parameter incompatible with partition, setting to e,e,e")
    omega = "e,e,e"
    os <- strsplit(omega,split=",")[[1]]
  }
  if(length(os) != 4 && (partition == "hlcf")){
    warning("Omega parameter incompatible with partition, setting to e,e,e,e")
    omega = "e,e,e,e"
    os <- strsplit(omega,split=",")[[1]]
    if(is.null(rates)){
      rates = "0,0,1,1"
    }
  }
  if(length(os) != 1 && (partition == "single")){
    warning("Omega parameter incompatible with partition, setting to e")
    omega = "e"
    os <- strsplit(omega,split=",")[[1]]
    #stop("Specified partition model not compatible with multiple omegas or rates")
  }
  if (!is.null(rates)) {
    if (length(os) != length(strsplit(rates, ",")[[1]])) {
      stop("Number of rates needs to equal the number of omegas")
    }
  }
  igphyml <- path.expand(igphyml)
  if(file.access(igphyml, mode=1) == -1) {
    stop("The file ", igphyml, " cannot be executed.")
  }
  logfile <- paste0(file,".log")
  log <- paste(">>",logfile)
  if(is.null(rseed)){
    rseed <- ""
  }else{
    rseed <- paste("--r_seed",rseed)
  }
  gyrep <- paste0(file,"_gyrep")
  if(!is.null(trees)){
    command <- paste("--repfile",file,"--outrep",gyrep,
                     "--threads",nproc,"-o lr -m GY --run_id gy",rseed,log)
  }else{
    command <- paste("--repfile",file,"--outrep",gyrep,
                     "--threads",nproc,"-o tlr -m GY --run_id gy",rseed,log)
  }
  params <- list(igphyml,command,stdout=TRUE,stderr=TRUE)
  if(quiet > 2){
    print(paste(params,collapse=" "))
  }
  status <- tryCatch(do.call(base::system2, params), error=function(e){
    print(paste("igphyml error, trying again: ",e));
    cat(paste(readLines(logfile),"\n"))
    return(e)
  }, warning=function(w){
    print(paste("igphyml warnings, trying again: ",w));
    cat(paste(readLines(logfile),"\n"))
    return(w)
  })
  if(length(status) != 0){
    status <- tryCatch(do.call(base::system2, params), error=function(e){
      print(paste("igphyml error, again! quitting: ",e));
      cat(paste(readLines(logfile),"\n"))
      stop()
    }, warning=function(w){
      print(paste("igphyml warnings, again! quitting: ",w));
      cat(paste(readLines(logfile),"\n"))
      stop()
    })
  }
  if(splitfreqs){
    splitf = "--splitfreqs"
  }else{
    splitf = ""
  }
  if(!is.null(rates)){
    ratestring = paste0("--rates ",rates)
  }else{
    ratestring = ""
  }
  command <- paste("--repfile",gyrep,
                   "--threads",nproc,"--omega",omega,"-o",optimize,"--motifs",motifs,
                   "--hotness",hotness,"-m HLP --run_id hlp --oformat tab --ASRc",asrc,
                   ratestring,splitf,rseed,log)
  params <- list(igphyml,command,stdout=TRUE,stderr=TRUE)
  if(quiet > 2){
    print(paste(params,collapse=" "))
  }
  status <- tryCatch(do.call(base::system2, params), error=function(e){
    print(paste("igphyml error, trying again: ",e));
    cat(paste(readLines(logfile),"\n"))
    return(e)
  }, warning=function(w){
    print(paste("igphyml warnings, trying again: ",w));
    cat(paste(readLines(logfile),"\n"))
    return(w)
  })
  if(length(status) != 0){
    status <- tryCatch(do.call(base::system2, params), error=function(e){
      print(paste("igphyml error, again! quitting: ",e));
      cat(paste(readLines(logfile),"\n") )
      stop()
    }, warning=function(w){
      print(paste("igphyml warnings, again! quitting: ",w));
      cat(paste(readLines(logfile),"\n"))
      stop()
    })
  }
  #trees <- readLineages(file=gyrep,run_id="hlp",type="asr")
  ofile <- file.path(temp_path,paste0(id,"_lineages_",id,
                                      "_pars.tsv_gyrep_igphyml_stats_hlp.tab"))
  results <- alakazam::readIgphyml(ofile,format="phylo",
                                   branches="distance")
  if(partition == "hl"){
    names(results$param) = gsub("fwr","heavy",names(results$param))
    names(results$param) = gsub("cdr","light",names(results$param))
  }else if(partition == "hlf"){
    names(results$param) = gsub("omega_1","omega_heavyfwr",names(results$param))
    names(results$param) = gsub("omega_2","omega_cdr",names(results$param))
    names(results$param) = gsub("omega_3","omega_lightfwr",names(results$param))
  }else if(partition == "hlc"){
    names(results$param) = gsub("omega_1","omega_fwr",names(results$param))
    names(results$param) = gsub("omega_2","omega_heavycdr",names(results$param))
    names(results$param) = gsub("omega_3","omega_lightcdr",names(results$param))
  }else if(partition == "hlcf"){
    names(results$param) = gsub("omega_1","omega_heavyfwr",names(results$param))
    names(results$param) = gsub("omega_2","omega_heavycdr",names(results$param))
    names(results$param) = gsub("omega_3","omega_lightfwr",names(results$param))
    names(results$param) = gsub("omega_4","omega_lightcdr",names(results$param))
  }
  ASR <- readFasta(file.path(temp_path, paste0(id,"_lineages_",id,"_pars_hlp_asr.fasta")))
  trees <- results$trees
  params <- results$param[-1,]
  for(i in 1:nrow(params)){
    clone_id <- params[i,]$clone
    trees[[i]]$name <- clone_id
    trees[[i]]$tip.label[which(trees[[i]]$tip.label 
                               == paste0(clone_id,"_GERM"))] <- "Germline"
    trees[[i]]$tree_method <- paste("igphyml::gy94,hlp19")
    trees[[i]]$edge_type <- "genetic_distance_codon"
    trees[[i]]$seq <- "sequence"
    trees[[i]]$parameters <- c(params[i,])
    # add sequences to internal nodes
    gline_id <- paste0(clone_id,"_GERM")
    ntips <- length(trees[[i]]$tip.label)
    nint  <- dplyr::n_distinct(trees[[i]]$edge[,1])
    nnodes <- ntips + nint
    if(nnodes != length(unique(c(trees[[i]]$edge[,1],
                                 trees[[i]]$edge[,2])))){
      stop(paste("Internal node count error, clone ",clone_id))
    }

    if("rate_light_mle" %in% names(params)){
      #re-scale branches if separate rate estimated
      clone_index <- which(sapply(clone, function(x)x@clone == clone_id))
      if(length(clone_index) != 1){
        stop(paste0("Clone index error: ",clone_id))
      }
      clone_obj <- clone[[clone_index]]
      lrate <- params[i,]$rate_light_mle
      hrate <- params[i,]$rate_heavy_mle
      heavy_sites <- sum(clone_obj@locus == "IGH")/3
      light_sites <- sum(clone_obj@locus != "IGH")/3
      l <- trees[[i]]$edge.length
      trees[[i]]$edge.length <- (l*hrate*heavy_sites + l*lrate*light_sites)/
      (heavy_sites + light_sites)
    }

    trees[[i]]$nodes <- rep(list(sequence=NULL),times=nnodes)
    # blank node should be MRCA, and all labels should be in ASR file
    labs <- trees[[i]]$node.label
    if(sum(labs == "") != 1 || sum(labs == gline_id) > 0){
      stop(paste("Error in reading node labels, clone",clone_id))
    }
    labs[labs == ""] <- gline_id
    if(sum(!labs %in% names(ASR)) != 0){
      stop(paste("Labels not in reconstructed clone",clone_id,":",
                 paste(labs[!labs %in% names(ASR)],collapse=", ")))
    }
    # also add tip sequences to tip nodes
    tipseqs <- clone[[i]]@data[[clone[[i]]@phylo_seq]]
    names(tipseqs) <- clone[[i]]@data$sequence_id
    if(clone[[i]]@phylo_seq == "sequence"){
      tipseqs <- c(tipseqs,"Germline"=clone[[i]]@germline)
    }else if(clone[[i]]@phylo_seq == "lsequence"){
      tipseqs <- c(tipseqs,"Germline"=clone[[i]]@lgermline)
    }else if(clone[[i]]@phylo_seq == "hlsequence"){
      tipseqs <- c(tipseqs,"Germline"=clone[[i]]@hlgermline)
    }else{
      stop(paste("phylo_seq not recognized",clone_id))
    }
    if(sum(!trees[[i]]$tip.label %in% names(tipseqs)) != 0 ||
       sum(!names(tipseqs) %in% trees[[i]]$tip.label) != 0){
      stop(paste("Tip sequences do not match in clone",clone_id))
    }
    trees[[i]]$nodes <- lapply(1:nnodes,function(x){
      if(x <= ntips){ # if a tip, just add sequence
        trees[[i]]$nodes[[x]]$sequence <- tipseqs[trees[[i]]$tip.label[x]]
        trees[[i]]$nodes[[x]]$id <- trees[[i]]$tip.label[x]
      }else{ # if internal node, add reconstructed sequence
        trees[[i]]$nodes[[x]]$sequence <- ASR[[labs[x-ntips]]]
        trees[[i]]$nodes[[x]]$id <- labs[x-ntips]
        if(labs[x-ntips] == gline_id){trees[[i]]$nodes[[x]]$id <- "Germline_Inferred"}
      }
      trees[[i]]$nodes[[x]]
    })
  }

  
  if(rm_files){
    lines <- readLines(file)
    for(i in 2:length(lines)){
      temp <- strsplit(lines[i],split="\t")[[1]]
      unlink(paste0(temp[1],"*"))
      unlink(paste0(temp[2],"*"))
      if(temp[3] != "N"){
        unlink(paste0(temp[3],"*"))
      }
    }
    unlink(paste0(file,"*"))
    unlink(file.path(temp_path,paste0(id,"_lineages_",id,
                                      "_pars_hlp_asr.fasta")))
  }
  if(!is.null(rm_dir)){
    if(quiet > 1){
      print(paste("rming dir",rm_dir))
    }
    unlink(rm_dir,recursive=TRUE)
  }
  return(trees)
}


#' Wrapper to build RAxML-ng trees and infer intermediate nodes
#' 
#' @param    clone      list of \code{airrClone} objects
#' @param    seq        the phylo_seq option does this clone uses. Possible options are "sequence", "hlsequence", or "lsequence"
#' @param    exec       RAxML-ng executable
#' @param    model      The DNA model to be used. GTR is the default.
#' @param    partition  A parameter that determines how branches are reported when partitioning. Options include NULL (default), 
#'                      scaled, unlinked, and linked
#' @param    rseed      The random seed used for the parsimony inferences. This allows you to reproduce your results.
#' @param    name       specifies the name of the output file
#' @param    starting_tree specifies a user starting tree file name and path in Newick format
#' @param    from_getTrees A logical that indicates if the desired starting tree is from getTrees and not a newick file
#' @param    rm_files   remove temporary files?
#' @param    asr        computes the marginal ancestral states of a tree
#' @param    rep        Which repetition of the tree building is currently being run. Mainly for getBootstraps. 
#' @param    dir        Where the output files are to be made. 
#' @param    ...        Additional arguments (not currently used)
#'
#'
#' @return  \code{phylo} object created by RAxML-ng with nodes attribute
#'          containing reconstructed sequences.
#' @export
buildRAxML <- function(clone, seq = "sequence", exec, model = 'GTR', partition = NULL, 
                       rseed = 28, name = "run", starting_tree = NULL, 
                       from_getTrees = FALSE, rm_files = TRUE, asr = TRUE, rep = 1, dir = NULL, ...){
  exec <- path.expand(exec)
  if(file.access(exec, mode=1) == -1) {
    stop("The file ", exec, " cannot be executed.")
  }
  version_test <- grepl("PTHREADS", exec)
  if(version_test){
    stop("Dowser currently only supports the raxmlHPC based options. Please reinitate the function using the 'raxmlHPC' based executable.")
  }
  version_test <- grepl("-ng", exec)
  if(!version_test){
    stop("Please use raxml-ng, not an older version of RAxML.")
  }
  if(!is.null(partition)){
    if(seq != "hlsequence"){
      stop("RAxML partition models are currently only supported for heavy and light chain partitions. Please include clones that contain both.")
    }
  }
  if(!is.null(dir)){
    dir <- path.expand(dir)
    if(!dir.exists(dir)){
      dir.create(dir)
    }
  }else{
    dir <- alakazam::makeTempDir(name)
  }
  clone_seqids <- clone@data$sequence_id
  clone_seqids[length(clone_seqids)+1] <- "Germline"
  if(seq == "hlsequence"){
    clone_seqs <- clone@data$hlsequence
    g <- clone@hlgermline
  }else if(seq == "sequence"){
    clone_seqs <- clone@data$sequence
    g <- clone@germline
  }else if(seq == "lsequence"){
    clone_seqs <- clone@data$lsequence
    g <- clone@lgermline
  }else{
    stop(paste(seq, "not a recognized sequence type"))
  }
  clone_seqs[length(clone_seqs)+1] <- g
  
  # check to make sure that the adjusted sequences are the same length
  nchar_seqs <- nchar(clone_seqs)
  if(nrow(data.frame(table(nchar_seqs))) > 1){
    stop("Sequence lengths do not match.")
  }
  if(length(clone_seqids) != length(clone_seqs)){
    stop("The number of sequences does not match the number of sequence ids.")
  }
  if(!is.numeric(rseed)){
    stop("The random seed needs to be a numeric value")
  }
  
  name <- paste0(name, "_", clone@clone, "_", rep)
  # create the data file for raxml
  fileConn<-file(file.path(dir, paste0(name, "_input_data.phy")))
  writeLines(c(paste0(as.character(length(clone_seqs)), " ", 
                      as.character(nchar(clone_seqs[1])))), fileConn)
  for(i in 1:length(clone_seqs)){
    write(paste0(clone_seqids[i], "    ", clone_seqs[i]), file=file.path(dir, paste0(name, "_input_data.phy")), 
          append = TRUE)
  }
  close(fileConn)
  input_data <- file.path(dir, paste0(name, "_input_data.phy"))
  
  command <- paste("--model", model, "--seed", rseed, "-msa", 
                   input_data, "-prefix", paste0(dir,"/", name), "--threads 1",
                   "--force msa")
  if(!is.null(starting_tree)){
    if(from_getTrees){
      ape::write.tree(starting_tree, file.path(dir, paste0(name, "_og_starting_tree.tree")))
      starting_tree <- ape::read.tree(file.path(dir, paste0(name, "_og_starting_tree.tree")))
    }
    command <- paste(command, "--tree", starting_tree)
  }
  if(!is.null(partition)){
    heavy_index <- clone@locus == "IGH"
    end_heavy <- paste(strsplit(clone_seqs,split="")[[1]][heavy_index], collapse = "")
    fileConn<-file(file.path(dir, paste0(name, "_partition.txt")))
    write(paste0(model, ", p1 = 1-", nchar(end_heavy)), file=file.path(dir, paste0(name, "_partition.txt")), 
          append = TRUE)
    write(paste0(model, ", p2 = ", nchar(end_heavy)+1, "-", nchar(clone_seqs[1])), 
          file=file.path(dir, paste0(name, "_partition.txt")), 
          append = TRUE)
    close(fileConn)
    old_command <- strsplit(command, "--seed")[[1]][2]
    new_model <- paste("--model", file.path(dir, paste0(name, "_partition.txt")), "--seed")
    command <- paste0(new_model, old_command, " --brlen ", partition)
  }
  
  params <- list(exec, command, stdout=TRUE, stderr=TRUE)
  
  status <- tryCatch(do.call(base::system2, params), error=function(e){
    print(paste("RAxML error, trying again: ",e));
    return(e)
  }, warning=function(w){
    print(paste("RAxML warnings, trying again: ",w));
    return(w)
  })
  
  if(length(status) == 0){
    stop("RAxML was not run. Check input data.")
  }
  
  # check if there is a reduced file -- if so the file creation step failed
  if(file.exists(file.path(dir,paste0(name, "_input_data.phy.reduced")))){
    stop("Preprocessing broke. Not all noninformative sites by RAxML's definition were removed.")
  }
  
  if(asr){
    tree <- rerootTree(ape::unroot(ape::read.tree(file.path(dir,paste0(name, ".raxml.bestTree")))), "Germline", verbose = 0)
    starting_tree <- file.path(dir, paste0(name, "_rerooted.tree"))
    ape::write.tree(tree, starting_tree)
    command <- paste("--model", model, "--seed", rseed, "-msa", 
                     input_data, "-prefix", paste0(dir,"/", name, "_asr"), "--threads 1",
                     "--force msa --tree", starting_tree, "--ancestral")
    if(!is.null(partition)){
      old_command <- strsplit(command, "--seed")[[1]][2]
      new_model <- paste("--model", file.path(dir, paste0(name, "_partition.txt")), "--seed")
      command <- paste0(new_model, old_command, " --brlen ", partition)
    }
    
    params <- list(exec, command, stdout=TRUE, stderr=TRUE)
    status <- tryCatch(do.call(base::system2, params), error=function(e){
      print(paste("RAxML error, trying again: ",e));
      return(e)
    }, warning=function(w){
      print(paste("RAxML warnings, trying again: ",w));
      return(w)
    })
    
    # check that topology is the same
    asr_tree <- ape::read.tree(file.path(dir, paste0(name, "_asr.raxml.ancestralTree")))
    difference_check <- phangorn::RF.dist(asr_tree, tree)
    if(difference_check > 0){
      stop("ASR failed. Retry")
    }
    
    # use the asr_tree from here on out
    tree <- asr_tree
    results <- list()
    results$clone <- clone@clone
    results$nseq <- length(clone@data[[seq]]) 
    results$nsite <- nchar(clone@data[[seq]][1]) 
    results$tree_length <- sum(tree$edge.length)
    if(!is.null(partition) && partition == "unlinked"){
      p_trees <- ape::read.tree(file.path(dir, paste0(name, ".raxml.bestPartitionTrees")))
      p1 <- sum(p_trees[[1]]$edge.length)
      p2 <- sum(p_trees[[2]]$edge.length)
      tree_length <- c(paste("Best Tree:", results$tree_length), paste("Heavy Chain Tree:", p1),
                       paste("Light Chain Tree:", p2))
      results$tree_length <- tree_length
    }
    bestmodel <- readLines(file.path(dir, paste0(name, ".raxml.bestModel")))
    likelihood <- readLines(file.path(dir, paste0(name, "_asr.raxml.log")))
    results$likelihood <- as.numeric(strsplit(likelihood[grep("final logLikelihood:", likelihood)],
                                              "final logLikelihood: ")[[1]][2])
    results$model <- bestmodel
    tree$parameters <- results
    # get the ASR for the nodes
    nnodes <- length(unique(c(tree$edge[,1],tree$edge[,2])))
    tree$nodes <- rep(list(sequence=NULL),times=nnodes)
    tipseqs <- clone@data[[clone@phylo_seq]]
    names(tipseqs) <- clone@data$sequence_id
    if(clone@phylo_seq == "sequence"){
      tipseqs <- c(tipseqs,"Germline"=clone@germline)
    }else if(clone@phylo_seq == "lsequence"){
      tipseqs <- c(tipseqs,"Germline"=clone@lgermline)
    }else if(clone@phylo_seq == "hlsequence"){
      tipseqs <- c(tipseqs,"Germline"=clone@hlgermline)
    }else{
      stop(paste("phylo_seq not recognized",clone@clone))
    }
    if(sum(!tree$tip.label %in% names(tipseqs)) != 0 ||
       sum(!names(tipseqs) %in% tree$tip.label) != 0){
      stop(paste("Tip sequences do not match in clone",clone))
    }
    # add in the ASR sequences 
    # get the tree
    written_tree <- readLines(file.path(dir, paste0(name, "_asr.raxml.ancestralTree")))
    # split the tree by node input to get the order for internal nodes
    node_order <- strsplit(written_tree, "Node")
    full_order <- c()
    for(i in 2:length(node_order[[1]])){
      tobind <- gsub("\\:.*", "", node_order[[1]][i])
      tobind <- gsub("\\;", "", tobind)
      full_order <- append(full_order, tobind)
    }
    asr_seqs <- readLines(file.path(dir, paste0(name, "_asr.raxml.ancestralStates")))
    asr_order <- strsplit(asr_seqs, "Node")
    full_asr <- c()
    for(i in 1:length(asr_order)){
      tobind <- gsub("\\:.*", "", asr_order[[i]][2])
      num_asr <- strsplit(tobind, "\t")
      num_asr <- data.frame(node_num = num_asr[[1]][1], asr_seq = num_asr[[1]][2])
      full_asr <- rbind(full_asr, num_asr)
    }
    # add the ASR seq by whichever node is called in the tree
    for(i in 1:length(full_order)){
      node_num <- full_order[i]
      asr_seq <- full_asr$asr_seq[full_asr$node_num == node_num]
      tree$nodes[[(ape::Ntip(tree)+i)]]$sequence <- asr_seq
    }
    
    for(i in 1:length(tree$tip.label)){
      tip <- tree$tip.label[i]
      if(tip != "Germline"){
        seq_num <- which(clone@data$sequence_id == tip)
        if(seq == "hlsequence"){
          tree$nodes[i]$sequence <- clone@data$hlsequence[seq_num]
        } else if(seq == "sequence"){
          tree$nodes[i]$sequence <- clone@data$sequence[seq_num]
        }else{
          tree$nodes[i]$sequence <- clone@data$lsequence[seq_num]
        }
      }else{
        if(seq == "hlsequence"){
          tree$nodes[i]$sequence <- clone@hlgermline
        } else if(seq == "sequence"){
          tree$nodes[i]$sequence <- clone@germline
        }else{
          tree$nodes[i]$sequence <- clone@lgermline
        }
      }
    }
    tree <- rerootTree(tree, "Germline", verbose=0)
  }else {
    tree <- rerootTree(ape::unroot(ape::read.tree(file.path(dir,paste0(name, ".raxml.bestTree")))), "Germline", verbose = 0)
    tree$germid <- paste0(clone@clone, "_GERM")
    results <- list()
    results$clone <- clone@clone
    results$nseq <- length(clone@data[[seq]]) 
    results$nsite <- nchar(clone@data[[seq]][1])
    results$tree_length <- sum(tree$edge.length) 
    nnodes <- length(unique(c(tree$edge[,1],tree$edge[,2])))
    tree$nodes <- rep(list(sequence=NULL),times=nnodes)
    if(!is.null(partition) && partition == "unlinked"){
      p_trees <- ape::read.tree(file.path(dir, paste0(name, ".raxml.bestPartitionTrees")))
      p1 <- sum(p_trees[[1]]$edge.length)
      p2 <- sum(p_trees[[2]]$edge.length)
      tree_length <- c(paste("Best Tree:", results$tree_length), paste("Heavy Chain Tree:", p1),
                       paste("Light Chain Tree:", p2))
      results$tree_length <- tree_length
      for(i in 1:length(tree$tip.label)){
        tip <- tree$tip.label[i]
        if(tip != "Germline"){
          seq_num <- which(clone@data$sequence_id == tip)
          if(seq == "hlsequence"){
            tree$nodes[i]$sequence <- clone@data$hlsequence[seq_num]
          } else if(seq == "sequence"){
            tree$nodes[i]$sequence <- clone@data$sequence[seq_num]
          }else{
            tree$nodes[i]$sequence <- clone@data$lsequence[seq_num]
          }
        }else{
          if(seq == "hlsequence"){
            tree$nodes[i]$sequence <- clone@hlgermline
          } else if(seq == "sequence"){
            tree$nodes[i]$sequence <- clone@germline
          }else{
            tree$nodes[i]$sequence <- clone@lgermline
          }
        }
      }
    }
    bestmodel <- readLines(file.path(dir, paste0(name, ".raxml.bestModel")))
    likelihood <- readLines(file.path(dir, paste0(name, ".raxml.log")))
    results$likelihood <- as.numeric(strsplit(likelihood[grep("Final LogLikelihood:", likelihood)],
                                              "Final LogLikelihood: ")[[1]][2])
    results$model <- bestmodel
    tree$parameters <- results
    tipseqs <- clone@data[[seq]]
    names(tipseqs) <- clone@data$sequence_id
    if(seq == "sequence"){
      tipseqs <- c(tipseqs,"Germline"=clone@germline)
    }else if(seq == "lsequence"){
      tipseqs <- c(tipseqs,"Germline"=clone@lgermline)
    }else if(seq == "hlsequence"){
      tipseqs <- c(tipseqs,"Germline"=clone@hlgermline)
    }else{
      stop(paste("phylo_seq not recognized",clone))
    }
    if(sum(!tree$tip.label %in% names(tipseqs)) != 0 ||
       sum(!names(tipseqs) %in% tree$tip.label) != 0){
      stop(paste("Tip sequences do not match in clone",clone@clone))
    }
    for(i in 1:nnodes){
      if(i <= length(tree$tip.label)){
        tree$nodes[i]$sequence <- tipseqs[tree$tip.label[i]]
      }
    }
  }
  tree$name <- clone@clone
  tree$seq <- clone@phylo_seq
  tree_method <- readLines(file.path(dir, paste0(name, ".raxml.log")))
  tree$tree_method <- paste0("RAxML:", strsplit(tree_method[grep("Model: ", tree_method)],
                               "Model: ")[[1]][2])
  tree$edge_type <- "genetic_distance"
  if(rm_files){
    unlink(file.path(dir, paste0(name,"*")))
  }
  return(tree)
}


#' Reroot phylogenetic tree to have its germline sequence at a zero-length branch
#' to a node which is the direct ancestor of the tree's UCA. Assigns \code{uca}
#' to be the ancestral node to the tree's germline sequence, as \code{germid} as
#' the tree's germline sequence ID.
#'
#' @param   tree      An ape \code{phylo} object
#' @param   germline  ID of the tree's predicted germline sequence
#' @param   min       Maximum allowed branch length from germline to root
#' @param   verbose    amount of rubbish to print
#' @return  \code{phylo} object rooted at the specified germline
#' 
#' @export
rerootTree <- function(tree, germline, min=0.001, verbose=1){
  ntip <- length(tree$tip.label)
  uca <- ntip+1
  if(!germline %in% tree$tip.label){
    stop(paste(germline,"not found in tip labels!"))
  }
  olength <- sum(tree$edge.length)
  odiv <- ape::cophenetic.phylo(tree)[germline,]
  if(ape::is.rooted(tree)){
    if(verbose > 0){
      print("unrooting tree!")
    }
    root <- ape::getMRCA(tree,
                         tip=tree$tip.label)
    max <- max(tree$edge)
    rindex <- tree$edge[,1] == root
    redge <- tree$edge[rindex,]
    parent <- redge[1,2]
    child <- redge[2,2]
    if(parent <= length(tree$tip.label)){
      parent <- redge[2,2]
      child <- redge[1,2]
    }
    if(tree$tip.label[child] == germline &&
       tree$edge.length[tree$edge[,1] == root &
                        tree$edge[,2] == child] <= min){
      if(verbose > 0){
        print("tree already rooted at germline!")
      }
      return(tree)
    }
    warning("Rooting already rooted trees not fully supported.")
    # if tree rooted somewhere besides the germline
    # cut out the root node and directly attach
    # its former descendants to each other.
    tree$edge <- tree$edge[!rindex,]
    tree$edge <- rbind(tree$edge,c(parent,child))
    sumedge <- sum(tree$edge.length[rindex])
    tree$edge.length <- tree$edge.length[!rindex]
    tree$edge.length[length(tree$edge.length)+1] <- sumedge
    tree$Nnode <- tree$Nnode - 1
    if(parent != uca){
      #if new parent doesn't have correct uca number, swap it
      #uca node should have been the node that was cut out
      #if not the parent
      if(uca %in% tree$edge){
        stop("something weird happened during unrooting")
      }
      root <- parent
      tree$edge[tree$edge[,1]==parent,1] <- uca
      tree$edge[tree$edge[,2]==parent,2] <- uca
      if(!is.null(tree$nodes)){
        s <- tree$nodes[[uca]]
        tree$nodes[[uca]] <- tree$nodes[[parent]]
        tree$nodes[[parent]] <- s
      }
    }
    # set the node information for the appropriate root
    tree$edge[tree$edge[,1]==max,1] <- root
    tree$edge[tree$edge[,2]==max,2] <- root
    if(!is.null(tree$nodes)){
      tree$nodes[[max]] <- tree$nodes[[root]]
      tree$nodes[[max]] <- NULL
    }
  }
  #pairwise patristic distance among all nodes
  odist <- ape::dist.nodes(tree)
  
  edge <- tree$edge
  germid <- which(tree$tip.label == germline)
  
  max <- max(edge)
  nnode <- max+1 #new node number
  uca <- ntip+1 #uca needs to be first internal node
  
  # replace current uca (hereafter mrca) with new node number in edge list
  edge[edge[,1]==uca,1] <- nnode
  edge[edge[,2]==uca,2] <- nnode
  
  # make MRCA connect to new UCA node instead of germline
  edge[edge[,2] == germid,2] <- uca
  
  #add 0 length edge from UCA to germline
  edge <- rbind(edge,c(uca,germid)) 
  tree$edge.length[length(tree$edge.length)+1] <- 0

  # recursive function to swap nodes while
  # preserving internal node ids
  # this is what actually reroots the trees
  swap <- function(tnode, edge, checked){
    if(tnode %in% checked){
      print("r edge")
      return(edge)
    }
    checked <- c(checked,tnode)
    children <- edge[edge[,1] == tnode,2] #get children of this node
    parent <- edge[edge[,2] == tnode,1] #get parent of this node
    # only one child node, or if parent hasn't been checked, swap target and parent
    if(length(children) < 2 || sum(!parent %in% checked) > 0){
      parent <- edge[edge[,2] == tnode,1]
      parent <- parent[!parent %in% checked]
      edge[edge[,1] == parent & edge[,2] == tnode,] <- c(tnode,parent)
      children <- edge[edge[,1] == tnode,2]
    }
    #make sure descendant branches are facing the correct direction
    for(tnode in children){
      if(!tnode %in% checked){
        edge <- swap(tnode,edge,checked)
      }
    }
    return(edge)
  }
  
  #place new UCA node at root of tree
  edge <- swap(uca, edge, checked=c(1:ntip))
  
  tree$edge <- edge
  tree$Nnode <- length(unique(edge[,1]))
  tree <- ape::reorder.phylo(tree,"postorder")
  
  # swap node information between mrca and old uca
  # uca gets same info as germline
  if(!is.null(tree$nodes)){
    tree$nodes[[nnode]] <- tree$nodes[[uca]]
    tree$nodes[[uca]] <- tree$nodes[[germid]]
  }
  
  # reset and re-order tree
  attr(tree, "order") <- NULL
  tree <- ape::ladderize(tree, right=FALSE)
  
  # sanity check tree length, divergence, and internal node distances
  nlength <- sum(tree$edge.length)
  ndiv <- ape::cophenetic.phylo(tree)[germline,]
  if(abs(nlength - olength) > 0.001){
    stop(paste("Error in rerooting tree",tree$name,
               "tree length not consistent"))
  }
  divergence_diff <- odiv - ndiv[names(odiv)]
  if(max(abs(divergence_diff)) > 0.001){
    stop(paste("Error in rerooting tree",tree$name,
               "germline divergences not consistent"))
  }
  ndist <- ape::dist.nodes(tree)
  ndist[uca,] <- ndist[nnode,]
  ndist[,uca] <- ndist[,nnode]
  ndist <- ndist[,-nnode]
  ndist <- ndist[-nnode,]
  max_diff <- max(odist - ndist)
  if(abs(max_diff) > 0.001){
    stop(paste("Error in rerooting tree",tree$name,
               "node distances not consistent"))
  }
  return(tree)
}

#' Estimate lineage tree topologies, branch lengths,
#' and internal node states if desired
#'
#' \code{getTrees} Tree building function.
#' @param    clones     a tibble of \code{airrClone} objects, the output of
#'                      \link{formatClones}
#' @param    trait      trait to use for parsimony models (required if
#'                      \code{igphyml} specified)
#' @param    build      program to use for tree building (pratchet, pml,
#'                      dnapars, dnaml, igphyml)
#' @param    exec       location of desired phylogenetic executable
#' @param    igphyml    optional location of igphyml executible for parsimony
#' @param    id         unique identifer for this analysis (required if
#'                      \code{igphyml} or \code{dnapars} specified)
#' @param    dir        directory where temporary files will be placed.
#' @param    modelfile  file specifying parsimony model to use
#' @param    fixtrees   if TRUE, use supplied tree topologies
#' @param    nproc      number of cores to parallelize computations
#' @param    quiet      amount of rubbish to print to console
#' @param    rm_temp    remove temporary files (default=TRUE)
#' @param    palette    deprecated
#' @param    seq        column name containing sequence information
#' @param    collapse   Collapse internal nodes with identical sequences?
#' @param    ...        Additional arguments passed to tree building programs
#'
#' @return   A list of \code{phylo} objects in the same order as \code{data}.
#'
#' @details
#' Estimates phylogenetic tree topologies and branch lengths for a list of
#' \code{airrClone} objects. By default, it will use phangnorn::pratchet to
#' estimate maximum parsimony tree topologies, and ape::acctran to estimate
#' branch lengths. If \code{igpyhml} is specified, internal node \code{trait}
#' values will be predicted by maximum parsimony. In this case, \code{dir} will
#' need to be specified as a temporary directory to place all the intermediate
#' files (will be created if not available). Further, \code{id} will need to
#' specified to serve as a unique identifier for the temporary files. This
#' should be chosen to ensure that multiple \code{getTrees} calls using the same
#' \code{dir} do not overwrite each others files.
#'
#' \code{modelfile} is written automatically if not specified, but doesn't
#' include any constraints. Intermediate files are deleted by default. This can
#' be toggled using (\code{rm_files}).
#'
#' For examples and vignettes, see https://dowser.readthedocs.io
#'
#' @seealso \link{formatClones}, \link{findSwitches}, \link{buildPhylo},
#' \link{buildPratchet}, \link{buildPML}, \link{buildIgphyml}
#' @examples
#' data(ExampleClones)
#'
#' trees <- getTrees(ExampleClones[10,])
#' plotTrees(trees)[[1]]
#'
#' \dontrun{
#' data(ExampleClones)
#'
#' trees <- getTrees(ExampleClones[10,],igphyml="/path/to/igphyml",
#'          id="temp",dir="temp", trait="sample_id")
#' plotTrees(trees)[[1]]
#' }
#' @export
getTrees <- function(clones, trait=NULL, id=NULL, dir=NULL,
                     modelfile=NULL, build="pratchet", exec=NULL, igphyml=NULL,
                     fixtrees=FALSE, nproc=1, quiet=0, rm_temp=TRUE, palette=NULL,
                     seq=NULL, collapse=FALSE, ...){

  if(is.null(exec) && (!build %in% c("pratchet", "pml"))){
    stop("exec must be specified for this build option")
  }
  if(!is.null(dir)){
    dir <- path.expand(dir)
  }

  data <- clones$data
  if(!inherits(data, "list")){
    data <- list(data)
  }
  if(!inherits(data[[1]], "airrClone")){
    stop("Input data must be a list of airrClone objects")
  }
  if(!is.null(palette)){
    warning("palette option is deprecated in getTrees, specify in plotTrees")
    palette <- NULL
  }

  # make sure all sequences and germlines within a clone are the same length
  unlist(lapply(data, function(x){
    if(x@phylo_seq == "hlsequence"){
      germline <- x@hlgermline
      seqs <- x@data$hlsequence
    }else if(x@phylo_seq == "lsequence"){
      germline <- x@lgermline
      seqs <- x@data$lsequence
    }else{
      germline <- x@germline
      seqs <- x@data$sequence
    }
    if(any(nchar(germline) != nchar(seqs))){
      stop(paste0("Sequence and/or germline lengths of clone ",
                  x@clone," are not equal."))
    }
  }))
  if(fixtrees){
    if(!"trees" %in% names(clones)){
      stop("trees column must be specified if fixtrees=TRUE")
    }
    if(!inherits(clones$trees[[1]], "phylo")){
      stop("Trees must be a list of class phylo")
    }
    trees <- clones$trees
  }else{
    trees <- NULL
  }
  if(is.null(id)){
    id <- "sample"
  }
  big <- FALSE
  if(sum(unlist(lapply(data, function(x)nrow(x@data)))) > 10000){
    big <- TRUE
  }
  if(!rm_temp && big){
    warning("Large dataset - best to set rm_temp=TRUE")
  }
  if(!is.null(igphyml)){
    igphyml <- path.expand(igphyml)
    if(!is.null(dir)){
      if(!dir.exists(dir)){
        dir.create(dir)
      }
    }else{
      dir <- alakazam::makeTempDir(id)
      if(big){
        warning("Large dataset - best to set dir and id params")
      }
    }
    if(is.null(trait)){
      stop("trait must be specified when igphyml-based trait reconstruction")
    }
    # remove problematic characters from trait values
    data <- lapply(data,function(x){
      x@data[[trait]] <- gsub(":|;|,|=| |_","-",x@data[[trait]])
      x})
    if(is.null(modelfile)){
      states <- unique(unlist(lapply(data,function(x)x@data[,trait])))
      modelfile <- makeModelFile(states,
                                 file=file.path(dir,paste0(id,"_modelfile.txt")))
    }else{
      states <- readModelFile(modelfile)
    }
    #if igphyml is specified, append trait value to sequence ids
    if(!is.null(trees)){
      indexes <- 1:length(data)
      trees <- lapply(indexes,function(x){
        tree <- trees[[x]]
        datat <- data[[x]]
        for(id in datat@data$sequence_id){
          trait_temp <- filter(datat@data,
                               !!rlang::sym("sequence_id")==id)[[trait]]
          tree$tip.label[tree$tip.label == id] <-
            paste0(id,"_",trait_temp)
        }
        tree})
    }
    data <- lapply(data,function(x){
      x@data$sequence_id <- paste0(x@data$sequence_id,"_",x@data[[trait]])
      x})
  }
  if(build=="dnapars" || build=="igphyml" || build=="dnaml" || !is.null(igphyml) || build=="raxml"){
    if(!is.null(dir)){
      if(!dir.exists(dir)){
        dir.create(dir)
      }
    }else{
      dir <- alakazam::makeTempDir(id)
      if(big){
        warning("Large dataset - best to set dir and id params")
      }
    }
  }

  if(!inherits(data, "list")){
    data <- list(data)
  }
  if(!is.null(dir)){
    if(!dir.exists(dir)){
      dir.create(dir)
    }
  }
  rm_dir <- NULL
  if(rm_temp){
    rm_dir=file.path(dir,paste0(id,"_recon_trees"))
  }

  reps <- as.list(1:length(data))
  if(is.null(seq)){
    seqs <- unlist(lapply(data,function(x)x@phylo_seq))
  }else{
    seqs <- rep(seq,length=length(data))
  }
  if(build=="dnapars" || build=="dnaml"){
    trees <- parallel::mclapply(reps,function(x)
      tryCatch(buildPhylo(data[[x]],
                          exec=exec,
                          temp_path=file.path(dir,paste0(id,"_trees_",x)),
                          rm_temp=rm_temp,seq=seqs[x],tree=trees[[x]],...),
               error=function(e)e),
      mc.cores=nproc)
  }else if(build=="pratchet"){
    trees <- parallel::mclapply(reps,function(x)
      tryCatch(buildPratchet(data[[x]],seq=seqs[x],
                             tree=trees[[x]],...),error=function(e)e),
      mc.cores=nproc)
  }else if(build=="pml"){
    trees <- parallel::mclapply(reps,function(x)
      tryCatch(buildPML(data[[x]],seq=seqs[x],
                        tree=trees[[x]],...),error=function(e)e),
      mc.cores=nproc)
  } else if(build=="igphyml"){
    if(rm_temp){
      rm_dir <- file.path(dir,id)
    }else{
      rm_dir <- NULL
    }
    trees <-
      tryCatch(buildIgphyml(data,
                            igphyml=exec,
                            temp_path=file.path(dir,id),
                            rm_files=rm_temp,
                            rm_dir=rm_dir,
                            trees=trees,nproc=nproc,id=id,...),error=function(e)e)
    if(inherits(trees, "error")){
      stop(trees)
    }
  } else if(build=="raxml"){ # CGJ 2/20/23
    trees <- parallel::mclapply(reps,function(x)
      tryCatch(buildRAxML(data[[x]], 
                          seq=seqs[x],
                          exec = exec,
                          rm_files = rm_temp,
                          dir=dir,
                          starting_tree=trees[[x]],...),error=function(e)e),
      mc.cores=nproc)
  } else{
    stop("build specification ", build, " not recognized")
  }

  # save points for data have been saved as the following
  # trees -> trees_save
  # data -> data_save
  # clones -> clones_save
  #catch any tree inference errors
  if(build != "igphyml"){
    errors <- unlist(lapply(trees, function(x) inherits(x, "error")))
    messages <- trees[errors]
    errorclones <- clones$clone_id[errors]
    trees <- trees[!errors]
    data <- data[!errors]
    clones <- clones[!errors,]
    if(length(errorclones) > 0){
      warning(paste("Tree building failed for clones",
                    paste(errorclones,collapse=", ")))
      me <- lapply(messages, function(x)warning(x$message))
    }
  }
  if(length(trees) == 0){
    stop("No trees left!")
  }

  # make sure trees, data, and clone objects are in same order
  tree_names <- unlist(lapply(trees, function(x)x$name))
  data_names <- unlist(lapply(data, function(x)x@clone))
  m <- match(tree_names, data_names)
  data <- data[m]
  m <- match(tree_names, clones$clone_id)
  clones <- clones[m,]

  if(build == "igphyml" | build == "raxml"){
    clones$parameters <- lapply(trees,function(x)x$parameters)
  }

  if(!is.null(igphyml)){
    file <- writeLineageFile(data=data, trees=trees, dir=dir,
                             id=id, trait=trait, rep="trees")

    mtrees <- reconIgPhyML(file, modelfile, igphyml=igphyml, 
                           mode="trees", id=NULL, quiet=quiet, nproc=nproc,
                           rm_files=rm_temp, rm_dir=rm_dir, states=states, 
                           palette=palette, ...)
    
    # remove trait value from tips
    mtrees <- lapply(mtrees,function(x){
      ids <- strsplit(x$tip.label,split="_")
      x$tip.label <- unlist(lapply(ids,function(t)
        paste(t[1:(length(t)-1)],collapse="_")))
      x$tip.label[x$tip.label == x$name] <- "Germline"
      x
    })
    
  }else{
    mtrees <- trees
  }
  # Sanity checks
  match <- unlist(lapply(1:length(data), function(x){
    data[[x]]@clone == mtrees[[x]]$name
  }))
  if(sum(!match) > 0){
    stop("Clone and tree names not in proper order!")
  }
  clones$trees <- mtrees
  if(collapse){
    clones <- collapseNodes(clones)
  }
  if(sum(clones$clone_id != 
         unlist(lapply(trees,function(x)x$name))) > 0){
    stop("Tree column names don't match clone IDs")
  }
  
  clones
}



#' Scale branch lengths to represent either mutations or mutations per site.
#' 
#' \code{scaleBranches} Branch length scaling function.
#' @param    clones      a tibble of \code{airrClone} and \code{phylo} objects,
#'                      the output of \link{getTrees}.
#' @param    edge_type  Either \code{genetic_distance} (mutations per site) or 
#'                      \code{mutations}
#' 
#' @return   A tibble with \code{phylo} objects that have had branch lengths 
#'           rescaled as specified.
#'
#' @details
#' Uses clones$trees[[1]]$edge_type to determine how branches are currently scaled.
#'  
#' @seealso \link{getTrees}
#' @export
scaleBranches <- function(clones, edge_type="mutations"){
  if(!"tbl" %in% class(clones)){
    print(paste("clones is of class",class(clones)))
    stop("clones must be a tibble of airrClone objects!")
  }else{
    if(!inherits(clones$data[[1]], "airrClone")){
      print(paste("clones is list of class",class(clones$data[[1]])))
      stop("clones must be a list of airrClone objects!")
    }
  }
  if(!"trees" %in% names(clones)){
    stop("clones must have trees column!")
  }
  lengths <- unlist(lapply(1:length(clones$trees),
                           function(x){
                             if(clones$data[[x]]@phylo_seq == "hlsequence"){
                               return(nchar(clones$data[[x]]@hlgermline))
                             }else if(clones$data[[x]]@phylo_seq == "lsequence"){
                               return(nchar(clones$data[[x]]@lgermline))
                             }else{
                               return(nchar(clones$data[[x]]@germline))
                             }}))
  
  trees <- lapply(1:length(clones$trees),function(x){
    if(clones$trees[[x]]$edge_type == "mutations" && 
       edge_type == "genetic_distance"){
      clones$trees[[x]]$edge.length <- 
        clones$trees[[x]]$edge.length/lengths[x]
      clones$trees[[x]]$edge_type <- "genetic_distance"
      clones$trees[[x]]
    }else if(clones$trees[[x]]$edge_type == "genetic_distance" &&
             edge_type == "mutations"){
      clones$trees[[x]]$edge.length <- 
        clones$trees[[x]]$edge.length*lengths[x]
      clones$trees[[x]]$edge_type <- "mutations"
      clones$trees[[x]]
        }else if(clones$trees[[x]]$edge_type == "genetic_distance_codon" &&
             edge_type == "mutations"){
      clones$trees[[x]]$edge.length <-
        clones$trees[[x]]$edge.length*lengths[x]/3
      clones$trees[[x]]$edge_type <- "mutations"
      clones$trees[[x]]
    }else{
      warning("edge conversion type not yet supported")
      clones$trees[[x]]
    }})
  
  clones$trees <- trees
  clones
}

#' Collapse internal nodes with the same predicted sequence
#' 
#' \code{collapseNodes} Node collapsing function.
#' @param    trees    a tibble of \code{airrClone} objects, the output of \link{getTrees}
#' @param    tips     collapse tips to internal nodes? (experimental)  
#' @param    check    check that collapsed nodes are consistent with original tree
#' 
#' @return   A tibble with \code{phylo} objects that have had internal nodes collapsed.
#'
#' @details
#' Use plotTrees(trees)[[1]] + geom_label(aes(label=node)) + geom_tippoint() to show
#' node labels, and getSeq to return internal node sequences
#'  
#' @seealso \link{getTrees}
#' @export
collapseNodes <- function(trees, tips=FALSE, check=TRUE){
  if(!"phylo" %in% class(trees)){
    trees$trees <- lapply(trees$trees,function(x)
      collapseNodes(x,tips))
    return(trees)
  }
  otrees <- trees
  edges <- trees$edge
  ttips <- trees$tip.label
  edge_l <- trees$edge.length
  btip <- length(trees$tip.label)
  if(is.null(trees$node.label)){
    trees$node.label <- rep("",length(unique(edges)))
  }
  maxiter <- 1000
  while(maxiter > 0){
    # edge list, plus whether or not reconstructed sequences are identical
    edges <- cbind(edges[,1:2],unlist(lapply(1:nrow(edges),function(x)
      trees$nodes[[edges[x,1]]]$sequence==trees$nodes[[edges[x,2]]]$sequence)))
    # get list of edges to collapse
    if(!tips){
      zedges <- edges[edges[,2] > btip & edges[,3] == 1,]
    }else{
      mrca <- ape::getMRCA(trees,tip=trees$tip.label)
      zedges <- edges[edges[,1] != mrca & edges[,3] == 1,]
    }
    if(length(zedges) > 0){
      if(!is.matrix(zedges)){
        dim(zedges) <- c(1,3)
      }
    }else{
      break
    }
    # replace identical nodes with smaller of the node numbers
    ms <- c() #smaller node numbers
    ns <- c() #larger node numbers
    for(i in 1:nrow(zedges)){
      m <- min(zedges[i,1:2])
      n <- max(zedges[i,1:2])
      edges[edges[,1] == n,1] <- m
      edges[edges[,2] == n,2] <- m
      ms <- c(ms,m)
      ns <- c(ns,n)
    }
    # remove edges that lead to their own node
    edge_l <- edge_l[edges[,1] != edges[,2]]
    edges <- edges[edges[,1] != edges[,2],]
    maxiter <- maxiter - 1
  }
  if(maxiter == 0){
    stop("Exceeded maximum node collapse iterations")
  }
  # make collapsed tips into internal nodes
  if(tips){
    tnodes <- unique(edges[edges[,1] <= btip,1])
    m <- max(edges)+1
    for(n in tnodes){ #replace tip number with new number
      edges[edges[,1] == n,1] <- m
      edges[edges[,2] == n,2] <- m
      trees$nodes[[m]] <- trees$nodes[[n]]
      trees$nodes[[m]]$id <- ttips[n]
      m <- m + 1
    }
    ttips <- ttips[-tnodes]
  }
  # make key of new node names
  nodes <- sort(unique(c(edges[,1],edges[,2])))
  nnodes <- 1:length(nodes)
  names(nnodes) <- nodes
  
  # replace old nodes with new node names and info
  nedges <- edges
  nsequences <- as.list(1:length(nodes))
  for(n in names(nnodes)){
    new <- nnodes[n]
    nedges[edges[,1] == as.numeric(n),1] <- new
    nedges[edges[,2] == as.numeric(n),2] <- new
    nsequences[[new]] <- trees$nodes[[as.numeric(n)]]
    if(is.null(nsequences[[new]]$id)){
      nsequences[[new]]$id <- ""
    }
  }
  trees$edge <- nedges[,1:2]
  trees$edge.length <- edge_l
  trees$nodes <- nsequences
  trees$Nnode <- length(nsequences)-length(ttips)
  trees$tip.label <- ttips
  nodelabs <- unlist(lapply(trees$nodes,function(x){
    if(is.null(x$id)){
      ""
    }else{
      x$id
    }}))
  trees$node.label <- nodelabs[(length(ttips)+1):length(nsequences)]
  
  # reset and re-order tree
  attr(trees, "order") <- NULL
  trees <- ape::ladderize(trees, right=FALSE)
  
  if(check){
    if(tips){
      warning("Cannot check nodes while collapsing tips (tips=TRUE)")
    }else{
      subt <- ape::subtrees(otrees)
      for(sub in subt){
        seqs <- sub$tip.label
        node <-  ape::getMRCA(otrees,tip=seqs)
        cnode <- ape::getMRCA(trees,tip=seqs)
        oseq <- otrees$nodes[[node]]$sequence
        nseq <- trees$nodes[[cnode]]$sequence
        if(oseq != nseq){
          stop(paste("Node",node,"in clone tree",trees$name,
                     "does not equal corresponding sequence in collapsed tree"))
        }
      }
    }
  }
  return(trees)
}

#' Return IMGT gapped sequence of specified tree node
#' 
#' \code{getNodeSeq} Sequence retrieval function.
#' @param    data    a tibble of \code{airrClone} objects, the output of 
#'                   \link{getTrees}
#' @param    node    numeric node in tree (see details)
#' @param    tree    a \code{phylo} tree object containing \code{node}
#' @param    clone   if \code{tree} not specified, supply clone ID in \code{data}
#' @param    gaps    add IMGT gaps to output sequences?
#' @return   A vector with sequence for each locus at a specified \code{node}
#'           in \code{tree}.
#'
#' @details
#' Use plotTrees(trees)[[1]] + geom_label(aes(label=node))+geom_tippoint() to show
#' node labels, and getNodeSeq to return internal node sequences
#'  
#' @seealso \link{getTrees}
#' @export
getNodeSeq <- function(data, node, tree=NULL, clone=NULL, gaps=TRUE){
  if(is.null(tree)){
    if(is.null(clone)){
      stop("must provide either tree object or clone ID")
    }
    tree <- dplyr::filter(data,!!rlang::sym("clone_id")==clone)$trees[[1]]
  }
  clone <- dplyr::filter(data,!!rlang::sym("clone_id")==tree$name)$data[[1]]
  seqs <- c()
  seq <- strsplit(tree$nodes[[node]]$sequence,split="")[[1]]
  loci <- unique(clone@locus)
  for(locus in loci){
    if(length(seq) < length(clone@locus)){
      warning("Sequences are shorter than chain vector. Exiting")
    }
    if(length(seq) > length(clone@locus)){
      stop("Sequences are longer than chain vector. Exiting")
    }
    lseq <- seq[clone@locus == locus]
    lseq[is.na(lseq)] <- "N"
    if(gaps){
      nums <- clone@numbers[clone@locus == locus]
      nseq <- rep(".",max(nums))
      nseq[nums] <- lseq
      lseq <- nseq
    }
    seqs <- c(seqs,paste(lseq,collapse=""))
  }
  names(seqs) <- loci
  return(seqs)
}

#' Deprecated! Use getNodeSeq
#' 
#' \code{getSeq} Sequence retrieval function.
#' @param    data    a tibble of \code{airrClone} objects, the output of
#'                   \link{getTrees}
#' @param    node    numeric node in tree (see details)
#' @param    tree    a \code{phylo} tree object containing \code{node}
#' @param    clone   if \code{tree} not specified, supply clone ID in \code{data}
#' @param    gaps    add IMGT gaps to output sequences?
#' @return   A vector with sequence for each locus at a specified \code{node}
#'           in \code{tree}.
#'
#' @seealso \link{getTrees}
#' @export
getSeq <- function(data, node, tree=NULL, clone=NULL, gaps=TRUE){
  
  warning("getSeq is depracated and will be removed. Use getNodeSeq instead.")
  
  return(getNodeSeq(data=data, node=node, tree=tree, clone=clone, gaps=gaps))
}

#' \code{downsampleClone} Down-sample clone to maximum tip/switch ratio
# TODO: Add support for weighting by collapseCount?
#' @param    clone       an \link{airrClone} object
#' @param    trait       trait considered for rarefaction
#'                       \link{getTrees}
#' @param    tree        a \code{phylo} tree object correspond to \code{clone}
#' @param    tip_switch  maximum tip/switch ratio
#' @return   A vector with sequence for each locus at a specified \code{node}
#'           in \code{tree}.
#' @export
downsampleClone <- function(clone, trait, tip_switch=20, tree=NULL){
  
  cdata <- clone@data
  if(!trait %in% names(cdata)){
    stop(paste(trait,"not found in clone data columns"))
  }
  states <- unique(cdata[[trait]])
  
  if(sum(is.na(states) > 0)){
    stop("NA trait values detected, must be removed before trait analysis.")
  }
  
  if(length(states) > 1){
    # if at least one of each state is preserved, there is a minimum
    # of length(states)-1 switches, and a minimum of length(states) tips
    if(tip_switch < length(states)/(length(states)-1)){
      stop(paste("clone",clone@clone,
                 "tip/switch ratio =",tip_switch,"not possible with",
                 length(states),"states"))
    }
    
    ntips <- nrow(cdata)
    
    # randomly select one sequence of each type
    saved <- unlist(lapply(states, function(x)
      sample(cdata[cdata[[trait]] == x,]$sequence_id,size=1)))
    
    # if too many tips, downsample
    if(ntips/(length(states)-1) > tip_switch){
      target <- tip_switch*(length(states)-1)
      tips <- cdata$sequence_id
      tips <- tips[!tips %in% saved]
      rm <- sample(tips, size=ntips-target)
      cdata <- cdata[!cdata$sequence_id %in% rm,]
      
      # check that results are good
      if(nrow(cdata)/(dplyr::n_distinct(cdata[[trait]])-1) != tip_switch){
        stop(paste("clone",clone@clone,"downsampling failed!"))
      }
      # if tree provided, drop selected tips
      if(!is.null(tree)){
        od <- getDivergence(tree)
        # CGJ 9/5/23 found via checkUsagePackage("dowser")
        # otree <- tree
        tree <- ape::drop.tip(tree, tip=rm)
        nd <- getDivergence(tree)
        maxdiff <- max(nd - od[names(nd)])
        meandiff <- mean(nd - od[names(nd)])
        if(maxdiff > 0.001){
          badtip <- names(which.max(nd - od[names(nd)]))
          warning(paste("clone",clone@clone,
                        "downsampling divergences differ by max",
                        signif(maxdiff,digits=2),"mean",
                        signif(meandiff,digits=2), "worst tip", badtip))
        }
        if(sum(!cdata$sequence_id %in% tree$tip.label) == 0 &
           sum(!tree$tip.label %in% c(cdata$sequence_id, "Germline"))){
          stop(paste("clone",clone@clone," tree downsampling failed!")) 
        }
      }
    }
  }
  clone@data <- cdata
  results <- list()
  results$clone <- clone
  results$tree <- tree
  results
}

#' Create a bootstrap distribution for clone sequence alignments, and estimate 
#' trees for each bootstrap replicate.
#' 
#' \code{findSwitches} Phylogenetic bootstrap function.
#' @param clones         tibble \code{airrClone} objects, the output of 
#'                      \link{formatClones}
#' @param permutations    number of bootstrap replicates to perform
#' @param trait         trait to use for parsimony models
#' @param igphyml       location of igphyml executible 
#' @param build         program to use for tree building (phangorn, dnapars)
#' @param exec          location of desired phylogenetic executable
#' @param id            unique identifer for this analysis (required if 
#'                      \code{igphyml} or \code{dnapars} specified)
#' @param dir           directory where temporary files will be placed (required
#'                      if \code{igphyml} or \code{dnapars} specified)
#' @param modelfile     file specifying parsimony model to use
#' @param fixtrees      keep tree topologies fixed?
#'                      (bootstrapping will not be perfomed)
#' @param nproc            number of cores to parallelize computations
#' @param quiet           amount of rubbish to print to console
#' @param rm_temp       remove temporary files (default=TRUE)
#' @param palette       deprecated
#' @param resolve       how should polytomies be resolved? 
#'                       0=none, 1=max parsminy, 2=max ambiguity + polytomy skipping,
#'                       3=max ambiguity
#' @param keeptrees     keep trees estimated from bootstrap replicates? (TRUE)
#' @param lfile         lineage file input to igphyml if desired (experimental)
#' @param rep           current bootstrap replicate (experimental)
#' @param seq           column name containing sequence information
#' @param downsample    downsample clones to have a maximum specified tip/switch ratio?
#' @param tip_switch    maximum allowed tip/switch ratio if downsample=TRUE
#' @param boot_part     is  "locus" bootstrap columns for each locus separately
#' @param force_resolve continue even if polytomy resolution fails?
#' @param ...        additional arguments to be passed to tree building program
#'
#' @return   A list of trees and/or switch counts for each bootstrap replicate.
#'
#' @details
#' Tree building details are the same as \link{getTrees}. 
#' If \code{keeptrees=TRUE} (default) the returned object will contain a list 
#' named "trees" which contains a list of estimated tree objects for each 
#' bootstrap replicate. The object is structured like: 
#' trees[[<replicate>]][[<tree index>]]. If \code{igphyml} is specified 
#' (as well as \code{trait}), the returned object 
#' will contain a \code{tibble} named "switches" containing switch count 
#' information. This object can be passed to \link{testSP} and other functions 
#' to perform parsimony based trait value tests. 
#'
#' Trait values cannot contain values N, UCA, or NTIP. These are reserved for
#' use by test statistic functions. 
#' 
#' @seealso Uses output from \link{formatClones} with similar arguments to 
#' \link{getTrees}. Output can be visualized with \link{plotTrees}, and tested
#' with \link{testPS}, \link{testSC}, and \link{testSP}.
#' 
#' @examples
#' \dontrun{
#' data(ExampleAirr)
#' ExampleAirr$sample_id <- sample(ExampleAirr$sample_id)
#' clones <- formatClones(ExampleAirr, trait="sample_id")
#' 
#' igphyml <- "~/apps/igphyml/src/igphyml"
#' btrees <- findSwitches(clones[1:2,], permutations=10, nproc=1,
#'    igphyml=igphyml, trait="sample_id")
#' plotTrees(btrees$trees[[4]])[[1]]
#' testPS(btrees$switches)
#' }
#' @export
findSwitches <- function(clones, permutations, trait, igphyml, 
                         fixtrees=FALSE, downsample=TRUE, tip_switch=20, nproc=1, 
                         dir=NULL, id=NULL, modelfile=NULL, build="pratchet", exec=NULL, 
                         quiet=0, rm_temp=TRUE, palette=NULL, resolve=2, rep=NULL,
                         keeptrees=FALSE, lfile=NULL, seq=NULL,
                         boot_part="locus", force_resolve=FALSE, ...){
  
  if(is.null(exec) && (!build %in% c("pratchet", "pml"))){
    stop("exec must be specified for this build option")
  }
  if(file.access(igphyml, mode=1) == -1) {
    stop("Igphyml executable at ", igphyml, " cannot be executed.")
  }
  if(downsample & !quiet & is.null(rep)){
    print(paste("Downsampling lineages to a maximum tip-to-switch ratio of",tip_switch))
  }
  if(!fixtrees & !quiet & is.null(rep)){
    print("Re-building bootstrapped trees. Use fixtrees=TRUE to use fixed topologies.")
    if(build=="igphyml"){
      stop("Bootstrapping while build=igphyml not yet supported (bootstraps sample nucleotide sites)")
    }
  }else if(is.null(rep)){
    print("Keeping tree topology constant. Use fixtrees=FALSE to bootstrap topologies.")
  }
  if(!is.null(dir)){
    dir <- path.expand(dir)
  }
  
  data <- clones$data
  if(!inherits(data, "list")){
    data <- list(data)
  }
  if(!inherits(data[[1]], "airrClone")){
    stop("Input data must be a list of airrClone objects")
  }
  if(fixtrees){
    if(!"trees" %in% names(clones)){
      stop("trees column must be included in input if fixtrees=TRUE (use getTrees first)")
    }
    if(!inherits(clones$trees[[1]], "phylo")){
      stop("Trees must be a list of class phylo")
    }
    trees <- clones$trees
  }else{
    trees <- NULL
  }
  if(is.null(id)){
    id <- "sample"
  }
  if(!is.null(dir)){
    if(!dir.exists(dir)){
      dir.create(dir)
    }
  }
  big <- FALSE
  if(sum(unlist(lapply(data, function(x)nrow(x@data)))) > 10000){
    big <- TRUE
  }
  if(!rm_temp && big){
    warning("Large dataset - best to set rm_temp=TRUE")
  }
  if(!is.null(igphyml)){
    igphyml <- path.expand(igphyml)
    if(!is.null(dir)){
      if(!dir.exists(dir)){
        dir.create(dir)
      }
    }else{
      dir <- alakazam::makeTempDir(id)
      if(big){
        warning("Large dataset - best to set dir and id params")
      }
    }
    if(is.null(trait)){
      stop("trait must be specified when running igphyml")
    }
    # remove problematic characters from trait values
    data <- lapply(data,function(x){
      x@data[[trait]] <- gsub(":|;|,|=| |_","-",x@data[[trait]])
      x})
        traits <- unique(unlist(lapply(data, function(x)unique(x@data[[trait]]))))
        if("N" %in% traits || "NTIP" %in% traits || "UCA" %in% traits){
            stop(paste("Forbidden trait values (e.g. N, NTIP, UCA) detected.",
                "Please remove or change, then re-run formatClones."))
        }

    if(is.null(modelfile)){
      states <- unique(unlist(lapply(data,function(x)x@data[,trait])))
      modelfile <- makeModelFile(states,file=file.path(dir,paste0(id,"_modelfile.txt")))
    }else{
      states <- readModelFile(modelfile)
    }
    if(sum(is.na(states) > 0)){
      stop("NA trait values detected, must be removed before trait analysis.")
    }
    # if trees are fixed, add trait value to tree tips
    if(fixtrees){
      for(i in 1:length(data)){
        da <- data[[i]]
        tr <- trees[[i]]
        tips <- tr$tip.label[tr$tip.label != "Germline"]
        m <- match(tips, da@data$sequence_id)
        if(sum(is.na(m)) > 0){
          stop(paste("Tips",tips[!tips %in% da@data$sequence_id],
                     "not found in clone object",da@clone))
        }
        tr$tip.label[tr$tip.label != "Germline"] <- 
          paste0(tips, "_", da@data[[trait]][m])
        trees[[i]] <- tr
      }
    }
    #if igphyml is specified, append trait value to sequence ids
    data <- lapply(data,function(x){
      x@data$sequence_id <- paste0(x@data$sequence_id,"_",x@data[[trait]])
      x})
  }
  if(build=="dnapars"){
    if(is.null(dir) || is.null(id)){
      stop("dir, and id parameters must be specified when running dnapars")
    }
  }
    if(is.null(rep)){
        reps <- as.list(1:permutations)
        l <- parallel::mclapply(reps,function(x)
            tryCatch(findSwitches(clones ,rep=x,
            trait=trait, modelfile=modelfile, build=build,
            exec=exec, igphyml=igphyml,
            id=id, dir=dir, permutations=permutations,
            nproc=1, rm_temp=rm_temp, quiet=quiet,
            fixtrees=fixtrees, resolve=resolve, keeptrees=keeptrees,
            lfile=lfile, seq=seq, downsample=downsample, tip_switch=tip_switch,
            boot_part=boot_part, force_resolve=force_resolve, ...),
            error=function(e)e),mc.cores=nproc)
        errors <- unlist(lapply(l, function(x) inherits(x, "error")))
        messages <- l[errors]
        if(sum(errors) > 0){
            me <- lapply(messages, function(x)warning(x$message))
            stop(paste("findSwitches failed for",sum(errors),
                "repetitions (see warnings, more info if nproc=1)"))
  }
    results <- list()
    results$switches <- NULL
    results$trees <- NULL
    if(!is.null(igphyml)){
      results$switches <- dplyr::bind_rows(lapply(l,function(x)x$switches))
    }
    if(keeptrees){
      results$trees <- lapply(l,function(x)x$trees)
    }
    if(rm_temp){
      if(file.exists(file.path(dir,paste0(id,"_modelfile.txt")))){
        unlink(file.path(dir,paste0(id,"_modelfile.txt")))
      }
    }
    return(results)
  }else{
    rm_dir=file.path(dir,paste0(id,"_recon_",rep))
    
    if(downsample){
      if(quiet > 3){print("downsampling clones")}
      rarefied <- lapply(1:length(data), function(x)
        downsampleClone(clone=data[[x]], 
                        tree=trees[[x]], trait=trait,
                        tip_switch=tip_switch))
      data <- lapply(rarefied, function(x)x$clone)
      seqs <- unlist(lapply(data, function(x)nrow(x@data)))
      index <- order(seqs, decreasing=TRUE)
      data <- data[index]
      if(fixtrees){
        trees <- lapply(rarefied, function(x)x$tree)
        trees <- trees[index]
      }
    }
    if(!fixtrees){
      temp_clones <- dplyr::tibble(data=data, clone_id = unlist(lapply(data, 
                            function(x)x@clone)), seqs = unlist(lapply(data,function(x)nrow(x@data))))
      #temp_clones <- getTrees(clones = temp_clones, build = build, nproc = nproc)
      clones_with_trees <- getBootstraps(clones = temp_clones, bootstraps = 1, nproc = nproc, 
                                         dir = dir, id = id, build = build, exec = exec,
                                         quiet = quiet, rm_temp = rm_temp, seq = seq,
                                         boot_part = boot_part, bootstrap_nodes = FALSE,
                                         switches = TRUE, ...)
      trees <- lapply(clones_with_trees$bootstrap_trees, function(x)x[[1]])
    }
    
    match <- unlist(lapply(1:length(data), function(x){
      data[[x]]@clone == trees[[x]]$name
    }))
    if(sum(!match) > 0){
      stop("Clone and tree names not in proper order!")
    }
    
    results <- list()
    if(!is.null(igphyml)){
      if(is.null(lfile)){    
        file <- writeLineageFile(data=data, trees=trees, dir=dir,
                                 id=id, trait=trait, rep=rep)
      }else{
        file <- lfile
      }
      rseed <- floor(stats::runif(1,0,10^7))+rep
      switches <- reconIgPhyML(file, modelfile, igphyml=igphyml, 
                               mode="switches", type="recon", id=rep, 
                               quiet=quiet, rm_files=FALSE, rm_dir=NULL, nproc=nproc,
                               resolve=resolve, rseed=rseed, force_resolve=force_resolve)
      permuted <- reconIgPhyML(file, modelfile, igphyml=igphyml, 
                               mode="switches", type="permute", id=rep, 
                               quiet=quiet, rm_files=FALSE, rm_dir=NULL, nproc=nproc,
                               resolve=resolve, rseed=rseed, force_resolve=force_resolve)
      if(!rm_temp){rm_dir=NULL}
      permuteAll <- reconIgPhyML(file, modelfile, igphyml=igphyml, 
                                 mode="switches", type="permuteAll", id=rep, 
                                 quiet=quiet, rm_files=rm_temp, rm_dir=rm_dir, nproc=nproc,
                                 resolve=resolve, rseed=rseed, force_resolve=force_resolve)
      switches <- rbind(switches,permuted)
      switches <- rbind(switches,permuteAll)
      switches$ID <- id
      #REP and CLONE are swapped up in the lower functions, this corrects them
      temp <- switches$REP
      switches$REP <- switches$CLONE
      switches$CLONE <- temp
      clone_index <- unlist(lapply(data,function(x)x@clone))
      switches$CLONE <- clone_index[switches$CLONE+1]
      results$switches <- switches
      
      # remove trait value from tips
      trees <- lapply(trees,function(x){
        ids <- strsplit(x$tip.label,split="_")
        x$tip.label <- unlist(lapply(ids,function(t)
          paste(t[1:(length(t)-1)],collapse="_")))
        x$tip.label[x$tip.label == x$name] <- "Germline"
        x
      })
    }
    if(keeptrees){
      results$trees <- trees
    }
    return(results)
  }
}

#' Deprecated! Please use findSwitches instead.
#' 
#' \code{bootstrapTrees} Phylogenetic bootstrap function.
#' @param clones         tibble \code{airrClone} objects, the output of 
#'                      \link{formatClones}
#' @param bootstraps    number of bootstrap replicates to perform
#' @param trait            trait to use for parsimony models (required if 
#'                      \code{igphyml} specified)
#' @param build           program to use for tree building (phangorn, dnapars)
#' @param exec           location of desired phylogenetic executable
#' @param igphyml        location of igphyml executible if trait models desired
#' @param id            unique identifer for this analysis (required if 
#'                      \code{igphyml} or \code{dnapars} specified)
#' @param dir           directory where temporary files will be placed (required
#'                      if \code{igphyml} or \code{dnapars} specified)
#' @param modelfile        file specifying parsimony model to use
#' @param fixtrees        keep tree topologies fixed?
#'                      (bootstrapping will not be perfomed)
#' @param nproc            number of cores to parallelize computations
#' @param quiet           amount of rubbish to print to console
#' @param rm_temp       remove temporary files (default=TRUE)
#' @param palette        deprecated
#' @param resolve        how should polytomies be resolved? 
#'                       0=none, 1=max parsminy, 2=max ambiguity + polytomy skipping,
#'                       3=max ambiguity
#' @param keeptrees     keep trees estimated from bootstrap replicates? (TRUE)
#' @param lfile         lineage file input to igphyml if desired (experimental)
#' @param rep             current bootstrap replicate (experimental)
#' @param seq           column name containing sequence information
#' @param downsample    downsample clones to have a maximum specified tip/switch ratio?
#' @param tip_switch    maximum allowed tip/switch ratio if downsample=TRUE
#' @param boot_part     is  "locus" bootstrap columns for each locus separately
#' @param force_resolve continue even if polytomy resolution fails?
#' @param ...        additional arguments to be passed to tree building program
#'
#' @return   A list of trees and/or switch counts for each bootstrap replicate.
#'  
#' @export
bootstrapTrees <- function(clones, bootstraps, nproc=1, trait=NULL, dir=NULL, 
                           id=NULL, modelfile=NULL, build="pratchet", exec=NULL, igphyml=NULL, 
                           fixtrees=FALSE,    quiet=0, rm_temp=TRUE, palette=NULL, resolve=2, rep=NULL,
                           keeptrees=TRUE, lfile=NULL, seq=NULL, downsample=FALSE, tip_switch=20,
                           boot_part="locus", force_resolve=FALSE,...){
  
  warning("boostrapTrees is depracated. Use findSwitches instead.")
  
  s = findSwitches(clones ,rep=rep, 
                   trait=trait, modelfile=modelfile, build=build, 
                   exec=exec, igphyml=igphyml, 
                   id=id, dir=dir, permutations=bootstraps,
                   nproc=1, rm_temp=rm_temp, quiet=quiet,
                   fixtrees=fixtrees, resolve=resolve, keeptrees=keeptrees,
                   lfile=lfile, seq=seq, downsample=downsample, tip_switch=tip_switch,
                   boot_part=boot_part, force_resolve=force_resolve, ...)
  return(s)
}


#' Get the tip labels as part of a clade defined by an internal node
#' 
#' \code{getSubTaxa} Gets the tip labels from a clade
#' @param  node    node number that defines the target clade
#' @param  tree    \code{phylo} object
#'
#' @return   A vector containing tip labels of the clade
#' @examples
#' # Get taxa from all subtrees
#' data(BiopsyTrees)
#' tree <- BiopsyTrees$trees[[8]]
#' all_subtrees <- lapply(1:length(tree$nodes), function(x)getSubTaxa(x, tree))
#' 
#' @export
getSubTaxa = function(node, tree){
  if(node > length(tree$tip.label) + tree$Nnode){
    stop(paste("Node", node, "too large for tree"))
  }
  children <- tree$edge[tree$edge[,1] == node, 2]
  if(length(children) == 0){
    if(node > length(tree$tip.label)){
      stop(paste("Malformed base case at node", node))
    }
    return(tree$tip.label[node])
  }
  tips <- unlist(lapply(children, function(x)getSubTaxa(x, tree)))
  return(tips)
}

# KEN: Try to keep lines to 80 characters or less, which is here -------------->|

# Turn your tree data into a nodes based dataframe
# 
# \code{lones} Filler
# @param    input_tree     \code{phylo} object
# @param    bootstrap_number      Which bootstrap replicate to use. With only 
# @param                          one tree, this has to be one. 
#
# @return   A dataframe that lists out the value of found or absent tips for each 
#           node within a tree. This is done for each tree inputted.
# bootstrapped sequences
splits_func <- function(input_tree, bootstrap_number){
  tree <- input_tree[[bootstrap_number]] 
  splits <- data.frame(found=I(lapply(
    (length(tree$tip.label) + 1):(length(tree$tip.label) +  
                                    dplyr::n_distinct(tree$edge[,1])),
    function(x)getSubTaxa(x, tree))))
  splits$node <- (length(tree$tip.label) + 1):(length(tree$tip.label) +
                                              dplyr::n_distinct(tree$edge[,1]))
  # find the difference between tip labels and the tips in 'found'
  full_tips <- tree$tip.label 
  absent <- (lapply(1:length(splits$found), 
                    function(x)dplyr::setdiff(full_tips, splits$found[[x]])))
  splits <- cbind(splits, data.frame(absent=I(absent)))
  splits$tree_num <- bootstrap_number
  # reorder it to make sense -- tree number, node number, found tips, and absent
  splits <- splits[, c(4, 2, 1, 3)]
  return(splits)
}

# Match the tips found in the various nodes of two different trees. 
# 
# \code{lones} Filler
# @param    tree_comp_df     The tree the bootstrap tree nodes should be 
#                            compared to. This needs to already be a dataframe 
#                            made from splits_func.
# @param    bootstrap_df     The dataframe made by the splits_func that has the 
#                            found and absent tips in the bootstrap trees.
# @param    nproc            The number of processors to be used
#
# @return   Returns a vector with the number of matches. 
matching_function_parallel <- function(tree_comp_df, bootstrap_df, nproc){
  match_vector <- c()
  match_vector = parallel::mclapply(unique(tree_comp_df$node), function(node){
    # KEN: There must be a more efficient way of doing this but I can't think of 
    # one right now
    sub_full_tree_df <- tree_comp_df[tree_comp_df$node==node,]
    matches = unlist(lapply(1:length(bootstrap_df$tree_num), function(x){
      match_test1 <- setequal(bootstrap_df$found[[x]], sub_full_tree_df$found[[1]])
      match_test2 <- setequal(bootstrap_df$found[[x]], sub_full_tree_df$absent[[1]])
      if(match_test1 || match_test2){
        return(1)
      }else{
        return(0)
      }}))
    dplyr::tibble(nodes = node, matches = sum(matches))
  },mc.cores=nproc)
  match_vector <- dplyr::bind_rows(match_vector)
  return(match_vector)
}


# KEN: Output very suspicious, this may be one we need to do ourselves..
# Build a bootstrap consensus tree using list of bootstrapped trees
# 
# \code{lones} Filler
# @param    trees     List of trees to use for tree building
#
# @return   Returns a consensus tree. 
consensus_tree <- function(trees){
  consensus <- ape::consensus(trees, check.labels=TRUE)
  consensus$edge.length <- rep(1, nrow(consensus$edge))
  consensus <- rerootTree(consensus, germline="Germline", verbose=0)
  return(consensus)
}


# Build some trees based on the clones object's $data and other parameters.
# This results in a list of trees. 
#
# \code{lones} Filler
# @param data          an AIRRE clone's data object (e.g. clones$data)
# @param seq           column name containing sequence information
# @param build         program to use for tree building (phangorn, dnapars, igphyml)
# @param boot_part     is  "locus" bootstrap columns for each locus separately
# @param exec          location of desired phylogenetic executable
# @param dir           directory where temporary files will be placed (required
#                      if \code{igphyml} or \code{dnapars} specified)
# @param rm_temp       remove temporary files (default=TRUE)
# @param rm_dir        remove temporary directory (default=TRUE)
# @param id            unique identifier for this analysis (required if 
#                      \code{igphyml} or \code{dnapars} specified)
# @param quiet           amount of rubbish to print to console
# @param rep             current bootstrap replicate (experimental)
# @param by_codon      Bootstrap by codon 
# @param starting_tree A starting tree 
# @return              Returns a list of trees.
makeTrees <- function(clones, seq, build, boot_part, exec, dir, rm_temp=TRUE, id, 
                      quiet=0, rep=1, by_codon = TRUE, starting_tree=NULL, ...){
  if(quiet > 0){
    print(paste0("Making trees for rep ", rep))
  }
  #KBH in future iterations we'll want to make the boostrapping optional, but it's okay for now
  data <- clones$data
  # randomize the data_tmp file -- shuffle the rows CGJ 4/13/23
  for(i in 1:length(data)){
    data[[i]]@data <- data[[i]]@data[sample(1:nrow(data[[i]]@data), replace = FALSE),]
  }
  data_tmp <- list()
  for(i in 1:length(data)){
    data_tmp[[i]] <- bootstrapClones(data[[i]], reps = 1, 
                                     partition = boot_part, by_codon = by_codon)[[1]]
  }
  reps <- as.list(1:length(data_tmp))
  if(is.null(seq)){
    seqs <- unlist(lapply(data_tmp,function(x)x@phylo_seq))
  }else{
    seqs <- rep(seq,length=length(data_tmp))
  }
  if(!is.null(starting_tree)){
    from_getTrees <- TRUE
  } else{
    from_getTrees <- FALSE
  }
  if(build=="pratchet"){
    trees <- lapply(reps,function(x)
      tryCatch(buildPratchet(data_tmp[[x]],seq=seqs[x],...),error=function(e)e))
    trees <- list(trees)
  } else if(build=="dnapars" || build=="dnaml"){
    trees <- lapply(reps,function(x)
      tryCatch(buildPhylo(data_tmp[[x]],
                          exec=exec,
                          temp_path = file.path(dir,paste0(id,"_", rep, "_trees_",x)),
                          rm_temp = rm_temp,
                          seq=seqs[x], ...), error=function(e)e))
    trees <- list(trees)
  }else if(build=="pml"){
    trees <- lapply(reps,function(x)tryCatch(buildPML(data_tmp[[x]],seq=seqs[x],
                                quiet=quiet, rep=rep,...), error=function(e)e))
    
    trees <- list(trees)
  } else if(build=="raxml"){ # CGJ 2/20/23
    trees <- lapply(reps,function(x)tryCatch(buildRAxML(data_tmp[[x]],
                          seq=seqs[x],
                          exec = exec,
                          trees = starting_tree[[x]],
                          rm_files = rm_temp,
                          rep = rep,
                          dir = dir,
                          rseed = x,
                          from_getTrees=from_getTrees,...),error=function(e)e))
  } else if(build=="igphyml"){
    if(rm_temp){
      rm_dir <- file.path(dir,paste0(id,rep))
    }else{
      rm_dir <- NULL
    }
    trees <- 
      tryCatch(buildIgphyml(data_tmp, 
                            igphyml = exec,
                            temp_path = file.path(dir,paste0(id,rep)),
                            rm_files=rm_temp,
                            rm_dir = rm_dir,
                            id=id, ...), error=function(e)e)
    trees <- list(trees)
    if(inherits(trees, "error")){
      stop(trees)
    }
    
  }else{
    stop("build specification ",build," not recognized")
  } 
  
  if(build != "igphyml"){
    errors <- unlist(lapply(trees, function(x) inherits(x, "error")))
    messages <- trees[errors]
    errorclones <- clones$clone_id[errors]
    trees <- trees[!errors]
    if(length(errorclones) > 0){
      warning(paste("Tree building failed for clones",
                    paste(errorclones,collapse=", ")))
      me <- lapply(messages, function(x)warning(x$message))
    }
  }
  return(trees)
}


#' Creates a bootstrap distribution for clone sequence alignments, and returns  
#' estimated trees for each bootstrap replicate as a nested list as a new input 
#' tibble column.
#' 
#' \code{getBootstraps} Phylogenetic bootstrap function.
#' @param clones         tibble \code{airrClone} objects, the output of 
#'                      \link{formatClones}
#' @param bootstraps    number of bootstrap replicates to perform
#' @param nproc            number of cores to parallelize computations
#' @param bootstrap_nodes a logical if the the nodes for each tree in the trees
#'                        column (required) should report their bootstrap value
#' @param dir           directory where temporary files will be placed (required
#'                      if \code{igphyml} or \code{dnapars} specified)
#' @param id            unique identifer for this analysis (required if 
#'                      \code{igphyml} or \code{dnapars} specified)
#' @param build           program to use for tree building (phangorn, dnapars, igphyml)
#' @param exec           location of desired phylogenetic executable
#' @param quiet           amount of rubbish to print to console
#' @param rm_temp       remove temporary files (default=TRUE)
#' @param rep             current bootstrap replicate (experimental)
#' @param seq           column name containing sequence information
#' @param boot_part     is  "locus" bootstrap columns for each locus separately
#' @param by_codon      a logical if the user wants to bootstrap by codon or by 
#'                      nucleotide. Default (codon based bootstrapping) is TRUE.
#' @param starting_tree An indicator to use the existing trees column as the starting trees for RAxML
#' @param switches   a logical indicator to allow findSwitches to do permutations. 
#' @param ...        additional arguments to be passed to tree building program
#'
#' @return   The input clones tibble with an additional column for the bootstrap replicate trees.
#'  
#' @export
getBootstraps <- function(clones, bootstraps,
                          nproc=1, bootstrap_nodes=TRUE, dir=NULL, id=NULL, build="pratchet", 
                          exec=NULL, quiet=0, rm_temp=TRUE, rep=NULL, seq=NULL,
                          boot_part="locus", by_codon = TRUE, starting_tree=FALSE,
                          switches=FALSE, ...){
  if(is.null(exec) && (!build %in% c("pratchet", "pml"))){
    stop("exec must be specified for this build option")
  }
  if(build=="igphyml" && file.access(exec, mode=1) == -1) {
    stop("Igphyml executable at ", exec, " cannot be executed.")
  }
  if(!is.null(dir)){
    dir <- path.expand(dir)
  }
  
  data <- clones$data
  if(is.null(id)){
    id <- "sample"
  }
  if(!inherits(data,"list")){
    data <- list(data)
  }
  if(!is.null(dir)){
    if(!dir.exists(dir)){
      dir.create(dir)
    }
  }
  if(!inherits(data[[1]], "airrClone")){
    stop("Input data must be a list of airrClone objects")
  }
  big <- FALSE
  if(sum(unlist(lapply(data, function(x)nrow(x@data)))) > 10000){
    big <- TRUE
  }
  if(!rm_temp && big){
    warning("Large dataset - best to set rm_temp=TRUE")
  }
  if(build == "igphyml"){
    # CGJ 9/5/23 checkUsagePackage("dowser") found it
    # igphyml <- path.expand(exec)
    if(!is.null(dir)){
      if(!dir.exists(dir)){
        dir.create(dir)
      }
    } else{
      dir <- alakazam::makeTempDir(id)
      if(big){
        warning("Large dataset - best to set dir and id params")
      }
    }
  }
  if(build=="dnapars" || build=="dnaml"){
    exec <- path.expand(exec)
    if(!is.null(dir)){
      if(!dir.exists(dir)){
        dir.create(dir)
      }
    } else{
      dir <- alakazam::makeTempDir(id)
      if(big){
        warning("Large dataset - best to set dir and id params")
      }
    }
  }
  if(!switches){
    if(bootstrap_nodes){
      if(!"trees" %in% colnames(clones)){
        stop("A trees column created by using getTrees() is required if 
           bootstrap_nodes=TRUE")
      }
    }
    build_used <- gsub("phangorn::", "", clones$trees[[1]]$tree_method)
    build_used <- gsub("phylip::", "", build_used)
    build_used <- gsub("\\:.*", "", build_used)
    build_used <- gsub("optim.", "", build_used)
    if(grepl("igphyml", build_used)){
      build_used <- "igphyml"
    }
    if(grepl("RAxML", build_used)){
      build_used <- "raxml"
    }
    if(build_used == "prachet"){
      build_used <- "pratchet"
    }
    if(build != build_used){
      stop(paste0("Trees and bootstrapped trees need to be made using the same method.",
                  " Use the same build option in getTrees as getBootstraps.",
                  " getBootstraps is trying to use a ", build, 
                  " build, but getTrees used ", build_used, " to build trees."))
    }
    # CGJ 2/20/23
    if(starting_tree){
      if("trees" %in% colnames(clones)){
        starting_tree <- clones$trees
      } else{
        stop("starting_trees cannot be set as TRUE without an already made trees",
             " column. Use getTrees() to get the trees column.")
      }
    } else{
      starting_tree <- NULL
    }
  }
  if(build != "igphyml" | build != "raxml"){
    bootstrap_trees <- unlist(parallel::mclapply(1:bootstraps, function(x)
      tryCatch(makeTrees(clones=clones, seq=seq, build=build, boot_part=boot_part,
                         exec=exec, dir=dir, rm_temp=rm_temp, id=id, quiet=quiet, 
                         rep=x, by_codon = by_codon), 
               error=function(e)e), mc.cores=nproc), recursive = FALSE)
  }else if(build == "igphyml" | build == "raxml"){
    bootstrap_trees <- unlist(parallel::mclapply(1:bootstraps, function(x)
      tryCatch(makeTrees(clones=clones, seq=seq[x], build=build, boot_part=boot_part,
                         exec=exec, dir=dir, rm_temp=rm_temp, id=id, quiet=quiet, 
                         rep=x, by_codon = by_codon, starting_tree = starting_tree), 
               error=function(e)e), mc.cores=nproc), recursive = FALSE)
  }
  clones$bootstrap_trees <- lapply(1:nrow(clones), function(x)list())
  # make raxml format friendly CGJ 2/20/23
  if(build == "raxml"){
    new_boots <- c()
    for(i in 1:bootstraps){
      values <- 1:nrow(clones)*i
      if(i == 1){
        tobind <- bootstrap_trees[min(values):max(values)]
      } else{
        tobind <- bootstrap_trees[((max(values)/min(values))+1):max(values)]
      }
      new_boots <- append(new_boots, list(tobind))
    }
    bootstrap_trees <- new_boots
  }
  for(i in 1:length(clones$clone_id)){
    clones$bootstrap_trees[[i]] <- lapply(bootstrap_trees, function(x)x[[i]])
  }
  errors <- c()
  messages <- c()
  for(clone in unique(clones$clone_id)){
    if(quiet > 0){
      print(clone)
    }
    sub_clone <- dplyr::filter(clones, !!rlang::sym("clone_id") == clone)
    tree_error <- c()
    tree_message <- c()
    for(i in 1:bootstraps){
      if(inherits(sub_clone$bootstrap_trees[[1]][[i]], "error")){
        tree_error <- append(tree_error, clone)
        tree_message <- append(tree_message, sub_clone$bootstrap_trees[[1]][[i]]$message)
      }
    }
    errors <- append(errors, tree_error)
    messages <- append(messages, tree_message)
  }
  
  for(clone in unique(errors)){
    idx <- which(errors == clone)
    wow <- messages[idx]
    warning(paste0("Clone ", clone, " has been REMOVED from the clones object ",
                   "because it failed to properly bootstrap due in at least one iteration of ",
                   "bootstrapping due to ", unique(wow), "."))
  }
  clones <- dplyr::filter(clones, !(!!rlang::sym("clone_id") %in% unique(errors)))
  if(!bootstrap_nodes){
    return(clones)
  } else{
    for(clone in 1:length(clones$clone_id)) {
      if(quiet >0){
        print(paste0("bootstrapping clone ", clones$clone_id[clone]))
      }
      b_trees <- clones$bootstrap_trees[[clone]] # KEN: safer if clones is empty
      tree_comp_df <- splits_func(list(clones$trees[[clone]]), 1)
      bootstraps_df <- lapply(1:bootstraps, function(x)splits_func(b_trees,x))
      bootstraps_df <- do.call(rbind, bootstraps_df)
      matches_df <- matching_function_parallel(tree_comp_df, bootstraps_df, nproc)
      for(node in 1:max(matches_df$nodes)){
        if(node %in% setdiff(clones$trees[[clone]]$edge[,2], clones$trees[[clone]]$edge[,1])){
          clones$trees[[clone]]$nodes[[node]]$bootstrap_value <- NA
        } else{
          clones$trees[[clone]]$nodes[[node]]$bootstrap_value <- 
            dplyr::filter(matches_df, !!rlang::sym("nodes") == node)$matches
        }
      }
      if(quiet >0){
        print(paste0("Clone ", clones$clone_id[clone], " bootstrapping completed"))
      }
    }
  }
  return(clones)
}

Try the dowser package in your browser

Any scripts or data that you put into this service are public.

dowser documentation built on May 29, 2024, 3:06 a.m.