R/snp.sim.R

Defines functions .get.locus snp.sim

Documented in snp.sim

#############
## snp.sim ##
#############

########################################################################

###################
## DOCUMENTATION ##
###################

#' Short one-phrase description.
#'
#' Longer proper discription of function...
#'
#' @param n.snps An integer specifying the number of genetic loci to be simulated.
#'
#' @author Caitlin Collins \email{caitiecollins@@gmail.com}
#'
#'
#' @examples
#' ## Example ##
#'
#' @import adegenet
#' @rawNamespace import(ape, except = zoom)
#' @importFrom Hmisc all.is.numeric
#' @importFrom phangorn midpoint
#'
#' @export

########################################################################
# @useDynLib phangorn, .registration = TRUE

## ARGUMENTS: ##

## n.subs <- either an integer or a vector containing a distribution of n.subs-per-site
## phen.loci <- a vector containing the indices of the edges on which phen subs occurred


snp.sim <- function(n.snps = 10000,
                    n.subs = 1,
                    snp.root = NULL,
                    n.snps.assoc = 0,
                    assoc.prob = 100,
                    tree = coalescent.tree.sim(100),
                    phen.loci = NULL,
                    heatmap = FALSE,
                    reconstruct = FALSE,
                    dist.dna.model = "JC69",
                    row.names = NULL,
                    set = NULL,
                    seed = 1){

  # require(adegenet)
  # require(ape)

  ## Allow assoc.prob to be in percent or
  ## as a proportion (-> eg 80 or 90, ie out of 100):
  if(!is.null(assoc.prob)){
    if(assoc.prob[1] >= 0 & assoc.prob <= 1){
      assoc.prob <- assoc.prob*100
    }
  }

  ##################
  ## HANDLE TREE: ##
  ##################
  ## Always work with trees in "pruningwise" order:
  tree <- reorder.phylo(tree, order="pruningwise")
  ## Trees must be rooted:
  # if(!is.rooted(tree)) tree <- midpoint(tree)

  ####################################################################
  ############################
  ## Get Anc-Des EDGE ORDER ##
  ############################
  ## Get sequence from lowest ("root", Nterm+1) to highest ancestral node:
  ix <- c(min(tree$edge[,1]):max(tree$edge[,1]))
  ## Get for loop index of rows in tree$edge[,1], in pairs (if binary tree), from lowest to highest:
  x <- as.vector(unlist(sapply(c(1:length(ix)), function(e) which(tree$edge[,1] == ix[e]))))
  ####################################################################


  ##################################
  ## GET MUTATIONS' branch & loci ##
  ##################################
  # if(!is.null(tree$tip.label)){
  #   n.ind <- length(tree$tip.label)
  # }else{
  n.ind <- min(tree$edge[,1])-1 # tree$Nnode+1
  #}
  gen.size <- n.snps
  edges <- tree$edge

  if(!is.null(seed)) set.seed(seed)

  ## Simulate genotype for root individual: ##

  ## For n.subs = n or = dist approaches:
  gen.root <- sample(c(TRUE, FALSE), gen.size, replace=TRUE)

  ## get the sum of all branch lengths in the tree:
  time.total <- sum(tree$edge.length)

  ## make dummy variables in which to store the resulting n.mts variables:
  L <- lambda <- n.mts <- new.nt <- NA

  snps.assoc <- NULL

  ## if n.snps.assoc is neither NULL nor 0:
  if(is.null(n.snps.assoc)) n.snps.assoc <- 0
  if(n.snps.assoc != 0){

    ## get non.assoc gen.size
    gen.size.ori <- gen.size
    gen.size <- gen.size-n.snps.assoc

    ## assign snps.assoc to be the last n.snps.assoc snps columns
    snps.assoc <- c((gen.size+1):(gen.size+n.snps.assoc))
  }

  ###################
  ## Handle n.subs ##
  ###################

  ## Either an integer
  ## --> draw n.subs from a Poisson distribution w parameter n.subs
  ## OR a vector (containing a distribution)
  ## --> use this distribution to define n.subs-per-site

  if(length(n.subs)==1 & is.null(names(n.subs))){

    #####################
    ## NO DISTRIBUTION ##
    #####################
    ## if no distribution is inputted,
    ## use normal simulation procedure
    ## (ie. Poisson parameter 1):

    warning("Using n.subs as Poisson parameter because input n.subs was of length 1 and had no names.")

    ## draw the number of mutations to occur at each site:
    if(!is.null(seed)) set.seed(seed)
    n.mts <- rpois(n=gen.size, lambda=(n.subs))
    ## for any n.mts==0, re-sample
    if(any(n.mts == 0)){
      ## need to change seed or we'll get trapped in the while loop
      if(!is.null(seed)) seed.i <- seed
      for(i in 1:length(n.mts)){
        while(n.mts[i]==0){
          if(!is.null(seed)){
            seed.i <- seed.i+1
            set.seed(seed.i)
          }
          n.mts[i] <- rpois(n=1, lambda=(n.subs))
        }
      }
    }

    }else{

      ###############################################
      ## DISTRIBUTION or RATES (fitPagel Q matrix) ##
      ###############################################

      ##################
      ## DISTRIBUTION ##
      ##################

      ## if a distribution is provided by the user,
      ## we use this to determine the number of substitutions
      ## to occur at what proportion of sites (note that
      ## we may not be simulating the same number of sites)

      dist <- n.subs

      ## check for names first!
      if(!is.null(names(dist))){
        ## only modify if names are numeric
        if(all.is.numeric(names(dist))){
          noms <- as.numeric(names(dist))
          aligned <- sapply(c(1:length(dist)), function(e) noms[e] == e)
          ## if any names do not correspond to their index, add zeros where missing:
          if(any(aligned == FALSE)){
            dist.new <- rep(0, max(noms))
            dist.new[noms] <- dist
            names(dist.new) <- c(1:length(dist.new))
            dist <- dist.new
          }
        }
      } # end check for missing places

      ## get dist.prop, a distribution containing the counts
      ## of the number of SNPs to be simulated that will have
      ## i many substitutions
      dist.sum <- sum(dist)
      dist.prop <- round((dist/dist.sum)*gen.size)
      ## check that these counts sum to gen.size,
      ## else add the remainder to the largest n.subs count
      ## (OR should we just add these to the n.subs=1 set ??? ###
      ## likely to be the same thing, but could not be...)
      if(sum(dist.prop) != gen.size){
        m <- which.max(dist.prop)
        #m <- 1
        if(sum(dist.prop) < gen.size){
          dist.prop[m] <- dist.prop[m] + (gen.size - sum(dist.prop))
        }
        if(sum(dist.prop) > gen.size){
          dist.prop[m] <- dist.prop[m] - (sum(dist.prop) - gen.size)
        }
      }

      ## get rid of useless trailing 0s
      while(dist.prop[length(dist.prop)] == 0){
        dist.prop <- dist.prop[c(1:(length(dist.prop)-1))]
      }

      ## make n.mts, a vector of length ncol(snps)
      n.mts <- rep(1111, gen.size)
      loci.available <- c(1:gen.size)
      ## assign dist.prop[i] elements of n.mts
      ## to be the same as the n.subs
      ## indicated by i, the given element of dist.prop
      for(j in 1:length(dist.prop)){
        ## provided there are not 0 sites to have this number of substitutions...
        if(dist.prop[j] > 0){
          if(length(loci.available) > 1){
            if(!is.null(seed)) set.seed(seed)
            ## assign dist.prop[i] elements of n.mts to be i
            loci.selected <- sample(loci.available, dist.prop[j], replace = FALSE)
            loci.available <- loci.available[-which(loci.available %in% loci.selected)]
          }else{
            ## if there is only 1 (the last) loci available,
            ## we select this one:
            loci.selected <- loci.available
          }
          n.mts[loci.selected] <- j
        }
      }
      ## Remove unnecessary objects...
      rm(dist)
      rm(dist.prop)
      rm(dist.sum)
      # } # end dist
  } # end fitPagel or dist

  ############################
  ## Assign mts to branches ##
  ############################

  if(n.snps.assoc != 0){
    if(length(phen.loci) > 0){
      ## for snps.assoc (the last n.snps.assoc snps, for now),
      ## add n.mts == n.phen.loci s.t these sites mutate at each
      ## and every phen.loci (for now, to be diluted later
      ## according to assoc.prob if !=100)
      n.mts <- c(n.mts, rep(length(phen.loci), n.snps.assoc))
    }else{
      stop("No phen.loci specified, so no associated loci can be generated.")
    }
  }

  ####   ###   ###   ###   ###   ###   ###   ###   ###   ###   ###   ###   ####
  #############################################################################
  ## GENERATE ALL SNPs FIRST, THEN REPLACE ANY NON-POLYMORPHIC IN WHILE LOOP ##
  #############################################################################

  #############################
  ## GET NON-ASSOCIATED SNPS ##
  #############################

  ## for each site, draw the branches to which
  ## you will assign the mts for this site
  ## (~ branch length):

  l.edge <- length(tree$edge.length)
  ## Get vector of FALSEs of length tree$edge.length:
  null.vect <- rep(FALSE, l.edge)


  ## TO DO: Memory inefficient step... Improve if possible?
  ## NB: this condition should never be met within treeWAS | parsimonious homoplasy dist:
  if(max(n.mts) > l.edge){
    if(!is.null(seed)) set.seed(seed)
    snps.loci <- NULL
      for(e in 1:length(n.mts)){
        snps.loci <- list()
        repTF <- FALSE
        if(n.mts[e] > l.edge) repTF <- TRUE
        snps.loci[[e]] <- replace(null.vect, sample(c(1:l.edge),
                                                    n.mts[e],
                                                    replace=repTF,
                                                    prob=tree$edge.length), TRUE)

        snps.loci <- t(do.call(rbind, snps.loci))
      }

  }else{
    if(!is.null(seed)) set.seed(seed)
    snps.loci <- sapply(c(1:length(n.mts)),
                        function(e)
                          replace(null.vect,
                          sample(c(1:l.edge),
                                 n.mts[e],
                                 replace=FALSE,
                                 prob=tree$edge.length), TRUE))
  }

  ## rearrange snps.loci s.t it becomes a
  ## list of length tree$edge.length,
  ## each element of which contains the
  ## locations of the mutations that will
  ## occur on that branch
  snps.loci <- sapply(c(1:nrow(snps.loci)),
                       function(e) which(snps.loci[e,] == TRUE))


  ## get the node names for all individuals (terminal and internal)
  all.inds <- sort(unique(as.vector(unlist(tree$edge))))
  # we will store the output in a list called snps:
  snps <- list()
  ## we start w all inds having same genotype as root:
  snps[all.inds] <- rep(list(gen.root), length(all.inds))

  ## store replacement nts in list new.nts:
  new.nts <- list()
  ## distinguish btw list of loci and unique list
  snps.loci.ori <- snps.loci
  ## will need to treat repeat loci differently...
  snps.loci.unique <- lapply(snps.loci, unique)


  #############################
  ## For Loop to get new nts ##
  #############################
  for(i in x){
    ## for all snps other than root, we mutate the
    ## genome of the node preceding it, according to snps.loci.
    ## Draw new nts for each locus selected for mutation:
    if(!.is.integer0(snps.loci.unique[[i]])){
      new.nts[[i]] <- !snps[[tree$edge[i,1]]][snps.loci.unique[[i]]]


      ## if any loci are selected for multiple mutations
      ## within their given branch length:
      if(length(snps.loci.ori[[i]]) != length(snps.loci.unique[[i]])){
        ## identify which loci are repeaters
        repeats <- table(snps.loci.ori[[i]])[which(table(snps.loci.ori[[i]])!=1)]
        ## how many times they repeat
        n.reps <- repeats - 1
        ## the positions of these loci in the vector of snps loci
        toRepeat <- which(snps.loci.unique[[i]] %in% names(repeats))
        ## run chain of re-sampling to end in our new nt for repeater loci:
        foo <- list()
        for(j in 1:length(toRepeat)){
          foo[[j]] <- new.nts[[i]][toRepeat[j]]
          for(k in 1:n.reps[j]){
            if(k==1){
              foo[[j]][k] <- !foo[[j]][1]

            }else{
              foo[[j]][k] <- !foo[[j]][k-1]
            }
          }
          ## retain only the last nt selected
          out <- sapply(c(1:length(foo)),
                        function(e) foo[[e]][length(foo[[e]])])
        }
        ## for the loci with repeated mts, replace these positions
        ## in new.nts with the corresponding elements of out, above.
        new.nts[[i]][toRepeat] <- out
      } # end of if statement for repeaters

      ## update ancestral genotype with new.nts:
      temp <- snps[[tree$edge[i,1]]]
      temp[snps.loci.unique[[i]]] <- new.nts[[i]]
      snps[[tree$edge[i,2]]] <- temp

    }else{
      ## if no mts occur on branch, set genotype of
      ## downstream individual to be equal to ancestor's
      snps[[tree$edge[i,2]]] <- snps[[tree$edge[i,1]]]
    }
  } # end of for loop selecting new nts at mutator loci

  ####################################################
  ## CHECK IF ALL LOCI ARE POLYMORPHIC (|polyThres) ##
  ####################################################

  ## temporarily assemble non-associated loci into matrix:
  # temp.ori <- do.call("rbind", snps)
  temp <- do.call("rbind", snps)

  ## keep only rows containing terminal individuals:
  # temp.ori <- temp.ori[1:n.ind, ]
  temp <- temp[1:n.ind, ]

  ######################################################################################################################################################################

  ## identify n.minor.allele required to meet polyThres:
  polyThres <- 0.01
  n.min <- n.ind*polyThres

  ## make a list of any NON-polymorphic loci:
  csum <- colSums(temp)
  toRepeat <- which(csum < n.min | csum > (nrow(temp) - n.min))



  ####   ###   ###   ###   ###   ###   ###   ###   ###   ###   ####
  #################################################################
  ## REPLACE ANY NON-POLYMORPHIC LOCI & GENERATE ASSOCIATED SNPS ##
  #################################################################
  ####   ###   ###   ###   ###   ###   ###   ###   ###   ###   ####

  #################################################
  ## REPLACE NON-POLYMORPHIC NON-ASSOCIATED SNPS ##
  #################################################

  ## just working w rows containing inds (not internal nodes)

  ######################################################################################################################################################################
  ######################################################################################################################################################################

  ######################################################################################################################################################################
  ######################################################################################################################################################################

  ####################
  ## NEW while loop ##
  ####################

  counter <- 0
  if(!is.null(seed)) seed.i <- seed

  while(length(toRepeat) > 0){
    # print("LENGTH toREPEAT"); print(length(toRepeat))
    ## for each site, draw the branches to which
    ## you will assign the mts for this site
    ## (~ branch length):

    ## Get vector of FALSEs of length tree$edge.length:
    null.vect <- rep(FALSE, length(tree$edge.length))


    if(max(n.mts[toRepeat]) > length(tree$edge.length)){
      if(!is.null(seed)){
        seed.i <- seed.i+1
        set.seed(seed.i)
      }
      snps.loci <- sapply(c(1:length(n.mts[toRepeat])),
                          function(e)
                            replace(null.vect,
                                    sample(c(1:length(tree$edge.length)),
                                           n.mts[toRepeat][e],
                                           replace=TRUE,
                                           prob=tree$edge.length), TRUE))
    }else{
      if(!is.null(seed)){
        seed.i <- seed.i+1
        set.seed(seed.i)
      }
      snps.loci <- sapply(c(1:length(n.mts[toRepeat])),
                          function(e)
                            replace(null.vect,
                                    sample(c(1:length(tree$edge.length)),
                                           n.mts[toRepeat][e],
                                           replace=FALSE,
                                           prob=tree$edge.length), TRUE))
    }

    ## rearrange snps.loci s.t it becomes a
    ## list of length tree$edge.length,
    ## each element of which contains the
    ## locations of the mutations that will
    ## occur on that branch
    snps.loci <- sapply(c(1:nrow(snps.loci)),
                        function(e) which(snps.loci[e,] == TRUE))


    ## get the node names for all individuals (terminal and internal)
    all.inds <- sort(unique(as.vector(unlist(tree$edge))))
    # we will store the output in a list called snps:
    # snps <- list()
    ## we start w all inds having same genotype as root:
    # snps[all.inds][toRepeat] <- rep(list(gen.root[toRepeat]), length(all.inds))
    for(i in 1:length(all.inds)){
      snps[[all.inds[i]]][toRepeat] <- gen.root[toRepeat]
    }

    ## store replacement nts in list new.nts:
    new.nts <- list()
    ## distinguish btw list of loci and unique list
    snps.loci.ori <- snps.loci
    ## will need to treat repeat loci differently...
    snps.loci.unique <- lapply(snps.loci, unique) # (identical to snps.loci?)


    #############################
    ## For Loop to get new nts ##
    #############################
    for(i in x){
      ## for all snps other than root, we mutate the
      ## genome of the node preceding it, according to snps.loci.
      ## Draw new nts for each locus selected for mutation:
      if(!.is.integer0(snps.loci.unique[[i]])){
        new.nts[[i]] <- !snps[[tree$edge[i,1]]][toRepeat][snps.loci.unique[[i]]]

        ## if any loci are selected for multiple mutations
        ## within their given branch length:
        if(length(snps.loci.ori[[i]]) != length(snps.loci.unique[[i]])){
          ## identify which loci are repeaters
          repeats <- table(snps.loci.ori[[i]])[which(table(snps.loci.ori[[i]])!=1)]
          ## how many times they repeat
          n.reps <- repeats - 1
          ## the positions of these loci in the vector of snps loci
          toRep <- which(snps.loci.unique[[i]] %in% names(repeats))
          ## run chain of re-sampling to end in our new nt for repeater loci:
          foo <- list()
          for(j in 1:length(toRep)){
            foo[[j]] <- new.nts[[i]][toRep[j]]
            for(k in 1:n.reps[j]){
              if(k==1){
                foo[[j]][k] <- !foo[[j]][1]
              }else{
                foo[[j]][k] <- !foo[[j]][k-1]
              }
            }
            ## retain only the last nt selected
            out <- sapply(c(1:length(foo)),
                          function(e) foo[[e]][length(foo[[e]])])
          }
          ## for the loci with repeated mts, replace these positions
          ## in new.nts with the corresponding elements of out, above.
          new.nts[[i]][toRep] <- out
        } # end of if statement for repeaters

        ## update ancestral genotype with new.nts:
        toto <- snps[[tree$edge[i,1]]][toRepeat]
        toto[snps.loci.unique[[i]]] <- new.nts[[i]]
        snps[[tree$edge[i,2]]][toRepeat] <- toto

      }else{
        ## if no mts occur on branch, set genotype of
        ## downstream individual to be equal to ancestor's
        snps[[tree$edge[i,2]]][toRepeat] <- snps[[tree$edge[i,1]]][toRepeat]
      }
    } # end of for loop selecting new nts at mutator loci


    ## temporarily assemble non-associated loci into matrix:
    # temp.new <- do.call("rbind", snps)
    temp <- do.call("rbind", snps)

    ## keep only rows containing terminal individuals:
    # temp <- temp.new[1:n.ind, ]
    temp <- temp[1:n.ind, ]

    ##############################################################
    # temp <- temp.new

    ######################################
    ##### while loop CHECK here: #########
    ######################################
    ## CHECK IF ALL LOCI ARE POLYMORPHIC (|polyThres)

    ## identify n.minor.allele required to meet polyThres:
    polyThres <- 0.01
    n.min <- n.ind*polyThres

    ## make a list of any NON-polymorphic loci:
    toRepeat.ori <- toRepeat
    temp.toRepeat <- temp[, toRepeat.ori]

    ## make a vector of any NON-polymorphic loci:
    ## If ncol = 1:
    if(!is.matrix(temp.toRepeat)){
      csum <- sum(temp.toRepeat)
      if(csum < n.min | csum > (length(temp.toRepeat) - n.min)){
        toRepeat <- toRepeat.ori
      }else{
        toRepeat <- NULL
      }
    }else{
      ## if temp.toRepeat is a true matrix:
      if(ncol(temp.toRepeat) > 0){
        csum <- colSums(temp.toRepeat)
        toRepeat <- toRepeat.ori[which(csum < n.min | csum > (nrow(temp.toRepeat) - n.min))]
      }else{
        csum <- sum(temp.toRepeat)
        if(csum < n.min | csum > (length(temp.toRepeat) - n.min)){
          toRepeat <- toRepeat.ori
        }else{
          toRepeat <- NULL
        }
      }
    }

    counter <- counter+1
    # print("COUNTER"); print(counter)
    # print("toRepeat"); print(length(toRepeat))


  } # end NEW while loop...
  #######################################
  ### while loop ENDS here: #############
  #######################################

  colnames(temp) <- c(1:ncol(temp))

  ######################################################################################################################################################################
  ######################################################################################################################################################################


  #########################
  ## GET ASSOCIATED SNPS ##
  #########################

  ## Need to treat ASSOCIATED SNPs differently:
  ## (non.assoc.snps do NOT need to pass the "while" check;
  ## they just need to match phen.loci at this point.)
  if(n.snps.assoc != 0){
    ## get snps.loci for the ASSOCIATED snps (ie. set to phen.loci) ##
    for(i in 1:n.snps.assoc){
      ## recall: phen.loci contains the tree EDGES on which phen subs occur
      subs.edges <- phen.loci

      ## get nt for root at this locus:
      root.nt <- gen.root[snps.assoc[i]]

      ## get nt for each individual at this locus
      ## assign to (and replace) the snps.assoc elements of loci
      temp[, snps.assoc[i]] <- .get.locus(subs.edges = subs.edges,
                                          root.nt = root.nt,
                                          tree = tree)[1:n.ind]
    }
  } # end of snps.assoc generation


  ###########################################
  ## GET COMPLETE SNPS MATRIX ("snps"): ##
  ###########################################

  ## Remove unnecessary objects...
  rm(snps)
  ## Create snps matrix:
  snps <- temp
  ## Remove unnecessary objects...
  rm(temp)


  ## keep only rows containing terminal individuals:
  snps <- snps[1:n.ind, ]

  ## clear memory
  # gc()


  ####   ###   ###   ###   ###   ###   ###   ###   ###   ###   ###   ###   ####


  ###############################################
  ## MODIFY SNPS.ASSOC ACCORDING TO ASSOC.PROB ##
  ###############################################

  if(n.snps.assoc != 0){
    ## if we have any imperfect associations... ##
    if(any(assoc.prob != 100)){
      ## check length
      if(length(assoc.prob) != n.snps.assoc){
        ## if only 1 prob value given...
        if(length(assoc.prob) == 1){
          ## ... assume uniform assoc.prob;
          # assoc.prob <- rep(assoc.prob, n.snps.assoc)
          ## ... OR, randomly draw assoc.probs from normal dist around assoc.prob;
          assoc.prob <- round(rnorm(n.snps.assoc, assoc.prob, 2), 3)
          ## no warning needed
        }else{
          ## BUT if assoc.prob of random length:
          ## repeat until of length n.snps.assoc
          assoc.prob <- rep(assoc.prob, length.out=n.snps.assoc)
          ## and print warning (only if not of length n.snps.assoc OR 1)
          warning("assoc.prob not of length n.snps.assoc;
                  sequence will be repeated until correct length is reached.")
        }
        } # end checks


      ## re-pseudo-randomise seed:
      if(!is.null(seed)){
        seed.i <- seed*c(1:n.snps.assoc)*10
      }

      ## for each associated SNP,
      ## we undo some associations | assoc.prob for that snp.assoc
      for(i in 1:n.snps.assoc){

        ## re-pseudo-randomise seed:
        if(!is.null(seed)){
          set.seed(seed.i[i])
        }

        prob <- assoc.prob[i]
        ## only if the association is imperfect
        if(prob <= 100){

          # print("prob"); print(prob)
          # print("inv. prob"); print((1 - (prob/100)))

          ## draw snps to change at snps.assoc[i]
          n.toChange <- round(nrow(snps)*(1 - (prob/100)))

          # print("N to change"); print(n.toChange)
          # print("Nrow SNPS"); print(str(snps))

          toChange <- sample(c(1:nrow(snps)), n.toChange)  # , replace=TRUE

          ## change those snps at rows toChange, loci snps.assoc[i]
          for(j in 1:length(toChange)){
            snps[toChange[j], snps.assoc[i]] <-
              selectBiallelicSNP(snps[toChange[j], snps.assoc[i]])
          } # end for loop
        }
      } # end for loop
    } # end any assoc.prob != 100
  } # end modification | assoc.prob


  ##############################
  ## PLOTS & TREECONSTRUCTION ##
  ##############################
  tree.reconstructed <- NULL
  if(heatmap == TRUE || reconstruct!=FALSE){

    ## CONVERT TO CHARACTER: ##
    dna <- snps
    dna <- replace(dna, which(dna == TRUE), "a")
    dna <- replace(dna, which(dna == "FALSE"), "t")

    ## Get DNAbin object:
    dna <- as.DNAbin(dna)
    # rownames(dna) <- c(1:nrow(snps))


    #############
    ## HEATMAP ##
    #############
    if(heatmap==TRUE){
      heatmap.DNAbin(dna=dna,
                     dist.dna.model=dist.dna.model)
    }

    ##########################################
    ## PLOT 2: RECONSTRUCTING THE PHYLOGENY ##
    ##########################################
    if(reconstruct!=FALSE){
      tree.reconstructed <- tree.reconstruct(dna, # [1:n.ind,]
                                           method=reconstruct,
                                           dist.dna.model=dist.dna.model,
                                           plot=TRUE)
    }

    ## Remove unnecessary object:
    rm(dna)

  } # end heatmap, treeconstruction

  ##################
  ## CONVERT SNPS ##
  ##################

  ## (NO LONGER) CONVERT TO NUMERIC: ##
  ## Convert from logical to binary SNPs (for terminal nodes only):
  # snps <- replace(snps, which(snps == TRUE), 1)

  ## Reassort snps.assoc to new columns:
  if(!is.null(snps.assoc)){

    ## update snps.assoc to reflect true loci
    gen.size.final <- ncol(snps)
    snps.assoc.loci.ori <- c((gen.size.final-(n.snps.assoc-1)):gen.size.final)

    #########################################
    ## RANDOMIZE SNPS.ASSOC LOCI POSITIONS ##
    #########################################

    ## Re-enabled snps.assoc loci "randomization" by
    ## just drawing indices and shuffling the columns accordingly...
    ## draw which SNPs will be associated to the phenotype
    if(!is.null(seed)) set.seed(seed)

    snps.assoc.loci <- sort(sample(c(1:gen.size.final),
                                   n.snps.assoc,
                                   replace=FALSE))


    snps.indices <- c(1:gen.size.final)
    snps.ori <- snps

    snps.non.assoc <- snps[,c(1:(gen.size.final-n.snps.assoc))]
    snps.assoc <- snps[,snps.assoc.loci.ori]
    snps.new <- matrix(99, nrow=nrow(snps), ncol=gen.size.final)
    snps.new[,snps.indices[-snps.assoc.loci]] <- snps.non.assoc
    snps.new[,snps.assoc.loci] <- snps.assoc
    snps <- snps.new
    snps.assoc <- snps.assoc.loci

  } # end snps.assoc randomization

  ###############################
  ## Assign row & column names ##
  ###############################

  ## assign/generate row.names
  if(!is.null(row.names)){
    ## If row.names have been provided in args list, assign them:
    if(length(row.names) == nrow(snps)){
      ## match tree$tip.label?
      if(!is.null(tree$tip.label)){
        if(all(row.names %in% tree$tip.label) & all(tree$tip.label %in% row.names)){
          ## REORDER to match tree$tip.labs if possible:
          if(!identical(row.names, tree$tip.label)){
            ord <- match(tree$tip.label, row.names)
            row.names <- row.names[ord]
          }
        }
      }
      rownames(snps) <- row.names
    }else{
      if(is.null(rownames(snps))) rownames(snps) <- c(1:nrow(snps))
    }
  }else{
    ## Otherwise, try to assign rownames(snps) to match tree$tip.label:
    if(!is.null(tree$tip.label)){
      if(length(tree$tip.label) == nrow(snps)){
        rownames(snps) <- tree$tip.label
      }else{
        rownames(snps) <- 1:nrow(snps)
        warning("The length of tree$tip.label was not equal to nrow(snps) being simulated;
              rownames(snps) have been set to 1:N and will not match tree$tip.label.")
      }
    }
  }

  ## generate column names:
  colnames(snps) <- 1:ncol(snps)


  #################################################
  ## SIM SET 2 (complementary clade-wise assoc): ##
  #################################################
  sets <- NULL
  if(!is.null(snps.assoc)){
    if(!is.null(set)){
      if(set == 2){

        ## Want to divide tree into 2 sets of clades btw 1/3:2/3 and 1/2:1/2
        clades <- tab <- grp.options <- sets.complete <- list()

        min.size <- ceiling((n.ind)*(1/3))
        max.size <- floor((n.ind)*(2/3))
        grp1 <- n.ind

        ## get 2 sets of clades:

        ##########################################
        ## Pick SETS for ANY TREE (pretty sure) ##
        ##########################################
        dec <- grp <- sets.temp <- sets.complete <- list()

        inds <- c(1:(n.ind))
        # new.root <- tree$edge[1,1] # initial root
        new.root <- n.ind+1 # initial root

        counter <- 0
        #######################################
        ## WHILE LOOP to get size of clades: ##
        #######################################
        while(grp1 < min.size | grp1 > max.size){

          ## get all descendants of root node:
          all.dec <- .getDescendants(tree, node=new.root)

          ## get all descendants in first 2 major clades:
          dec[[1]] <- .getDescendants(tree, node=all.dec[1])
          dec[[2]] <- .getDescendants(tree, node=all.dec[2])

          ## get terminal inds only:
          sets.temp[[1]] <- dec[[1]][which(dec[[1]] %in% inds)]
          sets.temp[[2]] <- dec[[2]][which(dec[[2]] %in% inds)]

          grp[[1]] <- length(sets.temp[[1]])
          grp[[2]] <- length(sets.temp[[2]])

          max.grp <- which.max(c(grp[[1]], grp[[2]]))
          new.root <- all.dec[max.grp]

          set1 <- sets.temp[[max.grp]]

          sets <- rep(2, length(inds))
          sets <- replace(sets, set1, 1)
          names(sets) <- tree$tip.label

          counter <- counter+1

          grp1 <- grp[[max.grp]]

        } # end while loop


        set1 <- names(sets)[which(sets == 1)]
        set2 <- names(sets)[which(sets == 2)]
        ###########

        ########################
        ## MODIFY SNPS.ASSOC: ##
        ########################
        snps.assoc.set1 <- 1:round(length(snps.assoc)/2)
        snps.assoc.set2 <- (round(length(snps.assoc)/2)+1):length(snps.assoc)

        ## replace set1 snps with 0 at all inds in clade.set1:
        for(e in 1:length(snps.assoc.set1)){
          # snps[which(rownames(snps) %in% set1), snps.assoc[snps.assoc.set1[e]]] <- 0
          snps[which(rownames(snps) %in% set1), snps.assoc[snps.assoc.set1[e]]] <- FALSE
        }
        ## replace set2 snps with 0 at all inds in clade.set2:
        for(e in 1:length(snps.assoc.set2)){
          # snps[which(rownames(snps) %in% set2), snps.assoc[snps.assoc.set2[e]]] <- 0
          snps[which(rownames(snps) %in% set2), snps.assoc[snps.assoc.set2[e]]] <- FALSE
        }
      }
    }
  } # end sim set 2


  ##################
  ## get RESULTS: ##
  ##################
  out <- list(snps, snps.assoc, tree.reconstructed, sets)
  names(out) <- c("snps", "snps.assoc", "tree.reconstructed", "sets")

  return(out)

} # end snp.sim








################
## .get.locus ##
################
.get.locus <- function(subs.edges, root.nt, tree){

  ##################
  ## HANDLE TREE: ##
  ##################
  ## Always work with trees in "pruningwise" order:
  tree <- reorder.phylo(tree, order="pruningwise")
  ## Trees must be rooted:
  if(!is.rooted(tree)) tree <- midpoint(tree)

  ####################################################################
  ############################
  ## Get Anc-Des EDGE ORDER ##
  ############################
  ## Get sequence from lowest ("root", Nterm+1) to highest ancestral node:
  ix <- c(min(tree$edge[,1]):max(tree$edge[,1]))
  ## Get for loop index of rows in tree$edge[,1], in pairs, from lowest to highest:
  x <- as.vector(unlist(sapply(c(1:length(ix)), function(e) which(tree$edge[,1] == ix[e]))))
  ####################################################################


  ## convert subs.edges into appropriate format:
  snps.loci <- list()
  snps.loci[[1]] <- subs.edges

  ## rearrange snps.loci s.t it becomes a
  ## list of length tree$edge.length,
  ## each element of which contains the
  ## locations of the mutations that will
  ## occur on that branch
  snps.loci <- sapply(c(1:length(tree$edge.length)),
                      function(f)
                        seq_along(snps.loci)[sapply(snps.loci,
                                                    function(e) f %in% e)])


  # we will store the output in a list called locus:
  locus <- list()
  ## get the node names for all individuals (terminal and internal)
  all.inds <- sort(unique(as.vector(unlist(tree$edge))))
  ## we start w all inds having same genotype as root:
  for(j in all.inds){
    locus[[j]] <- root.nt
  }

  ## store replacement nts in list new.nts:
  new.nts <- list()
  ## distinguish btw list of loci and unique list
  snps.loci.ori <- snps.loci
  ## will need to treat repeat loci differently...
  snps.loci.unique <- lapply(snps.loci, unique)


  #############################
  ## For Loop to get new nts ##
  #############################
  for(i in x){
    ## for all locus other than root, we mutate the
    ## genome of the node preceding it, according to snps.loci.
    ## Draw new nts for each locus selected for mutation:
    if(!.is.integer0(snps.loci.unique[[i]])){
      new.nts[[i]] <- !locus[[tree$edge[i,1]]][snps.loci.unique[[i]]]

      ## if any loci are selected for multiple mutations
      ## within their given branch length:
      if(length(snps.loci.ori[[i]]) != length(snps.loci.unique[[i]])){
        ## identify which loci are repeaters
        repeats <- table(snps.loci.ori[[i]])[which(table(snps.loci.ori[[i]])!=1)]
        ## how many times they repeat
        n.reps <- repeats - 1
        ## the positions of these loci in the vector of snps loci
        toRepeat <- which(snps.loci.unique[[i]] %in% names(repeats))
        ## run chain of re-sampling to end in our new nt for repeater loci:
        foo <- list()
        for(j in 1:length(toRepeat)){
          foo[[j]] <- new.nts[[i]][toRepeat[j]]
          for(k in 1:n.reps[j]){
            if(k==1){
              foo[[j]][k] <- !foo[[j]][1]

            }else{
              foo[[j]][k] <- !foo[[j]][k-1]
            }
          }
          ## retain only the last nt selected
          out <- sapply(c(1:length(foo)),
                        function(e) foo[[e]][length(foo[[e]])])
        }
        ## for the loci with repeated mts, replace these positions
        ## in new.nts with the corresponding elements of out, above.
        new.nts[[i]][toRepeat] <- out
      } # end of if statement for repeaters

      ## update ancestral genotype with new.nts:
      temp <- locus[[tree$edge[i,1]]]
      temp[snps.loci.unique[[i]]] <- new.nts[[i]]
      locus[[tree$edge[i,2]]] <- temp

    }else{
      ## if no mts occur on branch, set genotype of
      ## downstream individual to be equal to ancestor's
      locus[[tree$edge[i,2]]] <- locus[[tree$edge[i,1]]]
    }
  } # end of for loop selecting new nts at mutator loci

  ## turn locus into a vector for easier post-handling
  locus <- as.vector(unlist(locus))

  ##############################################################
  ## temporarily assemble non-associated loci into matrix:
  # temp <- do.call("rbind", locus)

  ## keep only rows containing terminal individuals:
  # temp <- temp[1:n.ind, ]
  ##############################################################

  return(locus)

} # end .get.locus











# #####################
# ## .get.locus.char ##
# #####################
# .get.locus.char <- function(subs.edges, root.nt, tree){
#
#   ##################
#   ## HANDLE TREE: ##
#   ##################
#   ## Always work with trees in "pruningwise" order:
#   tree <- reorder.phylo(tree, order="pruningwise")
#   ## Trees must be rooted:
#   if(!is.rooted(tree)) tree <- midpoint(tree)
#
#   ####################################################################
#   ############################
#   ## Get Anc-Des EDGE ORDER ##
#   ############################
#   ## Get sequence from lowest ("root", Nterm+1) to highest ancestral node:
#   ix <- c(min(tree$edge[,1]):max(tree$edge[,1]))
#   ## Get for loop index of rows in tree$edge[,1], in pairs, from lowest to highest:
#   x <- as.vector(unlist(sapply(c(1:length(ix)), function(e) which(tree$edge[,1] == ix[e]))))
#   ####################################################################
#
#
#   ## convert subs.edges into appropriate format:
#   snps.loci <- list()
#   snps.loci[[1]] <- subs.edges
#
#   ## rearrange snps.loci s.t it becomes a
#   ## list of length tree$edge.length,
#   ## each element of which contains the
#   ## locations of the mutations that will
#   ## occur on that branch
#   snps.loci <- sapply(c(1:length(tree$edge.length)),
#                       function(f)
#                         seq_along(snps.loci)[sapply(snps.loci,
#                                                     function(e) f %in% e)])
#
#
#   # we will store the output in a list called locus:
#   locus <- list()
#   ## get the node names for all individuals (terminal and internal)
#   all.inds <- sort(unique(as.vector(unlist(tree$edge))))
#   ## we start w all inds having same genotype as root:
#   for(j in all.inds){
#     locus[[j]] <- root.nt
#   }
#
#   ## store replacement nts in list new.nts:
#   new.nts <- list()
#   ## distinguish btw list of loci and unique list
#   snps.loci.ori <- snps.loci
#   ## will need to treat repeat loci differently...
#   snps.loci.unique <- lapply(snps.loci, unique)
#
#
#   #############################
#   ## For Loop to get new nts ##
#   #############################
#   for(i in x){
#     ## for all locus other than root, we mutate the
#     ## genome of the node preceding it, according to snps.loci.
#     ## Draw new nts for each locus selected for mutation:
#     if(!.is.integer0(snps.loci.unique[[i]])){
#       new.nts[[i]] <- sapply(c(1:length(snps.loci.unique[[i]])), function(e)
#         selectBiallelicSNP(c("a", "c", "g", "t")[which(c("a", "c", "g", "t")
#                                                        %in% locus[[tree$edge[i,1]]]
#                                                        [snps.loci.unique[[i]][e]])]))
#       ## if any loci are selected for multiple mutations
#       ## within their given branch length:
#       if(length(snps.loci.ori[[i]]) != length(snps.loci.unique[[i]])){
#         ## identify which loci are repeaters
#         repeats <- table(snps.loci.ori[[i]])[which(table(snps.loci.ori[[i]])!=1)]
#         ## how many times they repeat
#         n.reps <- repeats - 1
#         ## the positions of these loci in the vector of snps loci
#         toRepeat <- which(snps.loci.unique[[i]] %in% names(repeats))
#         ## run chain of re-sampling to end in our new nt for repeater loci:
#         foo <- list()
#         for(j in 1:length(toRepeat)){
#           foo[[j]] <- new.nts[[i]][toRepeat[j]]
#           for(k in 1:n.reps[j]){
#             if(k==1){
#               foo[[j]][k] <- selectBiallelicSNP(c("a", "c", "g", "t")[which(c("a", "c", "g", "t")
#                                                                             %in% foo[[j]][1])])
#
#             }else{
#               foo[[j]][k] <- selectBiallelicSNP(c("a", "c", "g", "t")[which(c("a", "c", "g", "t")
#                                                                             %in% foo[[j]][k-1])])
#             }
#           }
#           ## retain only the last nt selected
#           out <- sapply(c(1:length(foo)),
#                         function(e) foo[[e]][length(foo[[e]])])
#         }
#         ## for the loci with repeated mts, replace these positions
#         ## in new.nts with the corresponding elements of out, above.
#         new.nts[[i]][toRepeat] <- out
#       } # end of if statement for repeaters
#
#       ## update ancestral genotype with new.nts:
#       temp <- locus[[tree$edge[i,1]]]
#       temp[snps.loci.unique[[i]]] <- new.nts[[i]]
#       locus[[tree$edge[i,2]]] <- temp
#
#     }else{
#       ## if no mts occur on branch, set genotype of
#       ## downstream individual to be equal to ancestor's
#       locus[[tree$edge[i,2]]] <- locus[[tree$edge[i,1]]]
#     }
#   } # end of for loop selecting new nts at mutator loci
#
#   ## turn locus into a vector for easier post-handling
#   locus <- as.vector(unlist(locus))
#
#   return(locus)
#
# } # end .get.locus.char
caitiecollins/treeWAS documentation built on March 9, 2024, 3:15 p.m.