R/make_Contsimmap.R

Defines functions make.contsimmap sim.conthistory

Documented in make.contsimmap sim.conthistory

#2/13/23 Parameter formatting:
# - Generally expects list arrays with each column corresponding to a state (mu, Xsig2), tip (Ysig2, nobs), or trait (X0)
#    - Also accepts matrices in the case of X0/nobs
#    - Also accepts lists of lists, where each sublist represents a column
# - Unnamed elements readily recycled to fill missing portions of inputs, but if no unnamed elements are found, will be
#   filled in with defaults (identity for Xsig2, 0s for mu/Ysig2/nobs/X0)
#    - Entries of nobs corresponding to internal nodes are never derived from recycling and always default to 0 if unspecified


#' @rdname make.contsimmap
#' 
#' @order 2
#' 
#' @export
sim.conthistory<-function(tree,ntraits=1,traits=paste0('trait_',seq_len(ntraits)),nobs=NULL,
                          nsims=100,res=100,
                          X0=NULL,Xsig2=NULL,Ysig2=NULL,mu=NULL,
                          verbose=FALSE){
  #takes care of all the necessary formatting work...
  list2env(.prep.contsimmap(tree,nsims,res,X0,Xsig2,Ysig2,mu,conditional=FALSE,ntraits,traits,nobs),envir=environment())
  
  #at some point want to generalize to have multiple nobs per call to simulate differently-structured trait.data...
  #a little confusing at this point since the trait.data lookup element refers to X0 here, but trait.data for make.contsimmap
  #but that can be resolved manually for now
  #^2/14/23: need to look up if the above still holds
  #Can now handle multiple nobs, but things are still a bit confusing to me with how lookups are handled
  #Should eventually become more concrete/organized as code matures
  tmp<-.uncond.traversals(prune.seq,anc,tree[[1]][['edge']],Ntip(tree[[1]]),
                          maps,
                          X0,nobs,Xsig2,Ysig2[,Yperm,drop=FALSE],mu,lookup,
                          nts,seed,
                          verbose=verbose)
  
  #format output
  x<-tmp[[1]]
  trait.data<-tmp[[2]]
  attr(x,'ts')<-ts
  attr(x,'tree')<-tree
  attr(x,'maps')<-maps
  attr(x,'treeID')<-treeID
  traits<-colnames(Xsig2[[1]])
  attr(x,'traits')<-setNames(rep(1,length(traits)),traits)
  #need to first reconfigure lookup to refer to include newly simulated trait.data...
  counter<-1
  for(i in seq_along(lookup)){
    tmp.n<-length(lookup[[i]][['matches']])
    lookup[[i]][['table']]<-lookup[[i]][['table']][lookup[[i]][['matches']],,drop=FALSE]
    lookup[[i]][['matches']]<-seq_len(tmp.n)
    lookup[[i]][['table']][,1]<-counter:(counter+tmp.n-1)
    counter<-counter+tmp.n
  }
  attr(x,'params')<-matrix(list(trait.data,Xsig2,Ysig2,mu,lookup,list(fxn="sim.conthistory")),
                           6,1,
                           dimnames=list(c('trait.data','Xsig2','Ysig2','mu','lookup','call_info'),NULL))
  .uncompress(x)
}

#may want to make some function for auto-estimation of Xsig2/Ysig2

#' Make continuous stochastic character maps
#' 
#' These functions create continuous stochastic character maps (class 
#' "\code{contsimmap}") by simulating and mapping evolving continuous 
#' characters on a given phylogeny. These maps may be conditioned on
#' observed trait data (\code{make.contsimmap()}) or used to simulate
#' trait data (\code{sim.conthistory()}).
#' 
#' 
#' 
#' @param tree A phylogeny or list of phylogenies with or without mapped 
#' discrete characters (classes "\code{phylo}", "\code{multiPhylo}", 
#' "\code{simmap}", or "\code{multiSimmap}"). Lists of phylogenies with 
#' differing discrete character histories are allowed, but lists with 
#' \emph{differing topologies are currently not supported}! I plan to 
#' implement "\code{multiContsimmap}"-type objects for this purpose in the 
#' future.
#' 
#' 
#' @param trait.data A numeric matrix/vector or data frame specifying
#' observed trait measurements on which to condition simulations. Generally,
#' rows should correspond to different observations while columns correspond 
#' to different traits (vectors can only supply data for a single trait). Any 
#' tip/node in the phylogeny may be associated with 0 or more observations; use 
#' \code{NA} to specify missing trait measurements in the case of partial 
#' observations (i.e., only a subset of traits were measured in a given 
#' observation). 
#' 
#' To assign observations to tips/nodes, the data \emph{must} be
#' labelled in some way according to 
#' \code{tree$tip.label}/\code{tree$node.label} (node labels default to their 
#' numeric index if not provided in \code{tree$node.label}). These labels must
#' be provided as names for vectors, rownames for matrices, and 
#' a column of factor/character data for data frames. In the case of data 
#' frames, if multiple columns with factor/character data are found, the column 
#' with the most matches to \code{tree$tip.label}/\code{tree$node.label} is 
#' automatically chosen (any rownames are converted to a column before this 
#' step). Trait names are simply taken from the column names of matrices/data 
#' frames, and default to "\code{trait_<i>}" for the \code{<i>}th column of 
#' trait data if not provided. 
#' 
#' To specify different trait data for each 
#' phylogeny/simulation, format \code{trait.data} instead as a list of 
#' vectors/matrices/data frames (see Recycling Behavior section for further 
#' explanation of parameter recycling across phylogenies/simulations). 
#' 
#' Note 
#' that, in the case of \code{sim.conthistory()}, trait data is \emph{simulated}
#' rather than provided as an \emph{input}. In this case, the number and names
#' of traits are controlled by the arguments \code{ntraits} and \code{traits},
#' respectively. The number of observations per tip/node may be specified via
#' the argument \code{nobs}, while the \code{X0} argument determines the
#' "starting" trait values at the root of the phylogeny.
#' 
#' 
#' @param ntraits The number traits to simulate.
#' 
#' 
#' @param traits A character vector specifying the names of each trait. The 
#' \code{<i>}th unnamed trait defaults to "\code{trait_<i>}".
#' 
#' 
#' @param nobs A numeric vector specifying the number of observations to 
#' simulate per tip/node. 
#' 
#' Entries are assigned to tips/nodes if named according 
#' to \code{tree$tip.label}/\code{tree$node.label} (node labels default to 
#' their numeric index if not provided in \code{tree$node.label}). 
#' Unnamed/unassigned entries are recycled to "fill in" values for any tips 
#' lacking input (\emph{but not internal nodes}) in order of increasing node 
#' index. If no unnamed/unassigned entries are available, tips and internal 
#' nodes default to 1 and 0 observations, respectively. 
#' 
#' To specify differing 
#' numbers of observations for each phylogeny/simulation, format \code{nobs} 
#' instead as a list of vectors or matrix with each column/column name 
#' corresponding to different tips/nodes (see Recycling Behavior section for further 
#' explanation of parameter recycling across phylogenies/simulations).
#' 
#' 
#' @param X0 A numeric vector specifying the starting trait values at the root 
#' of the phylogeny. 
#' 
#' Entries are assigned to traits if named according to 
#' \code{traits}. Unnamed/unassigned entries are recycled to "fill in" values 
#' for any traits lacking input in the same order as given in \code{traits}. 
#' If no unnamed/unassigned entries are available, defaults to 0. 
#' 
#' To specify 
#' differing starting trait values for each phylogeny/simulation, format 
#' \code{X0} instead as a list of vectors or matrix with each column/column 
#' name corresponding to different traits (see Recycling Behavior section for 
#' more further explanation of parameter recycling across
#' phylogenies/simulations).
#' 
#' 
#' @param nsims The number of simulations to perform. Each simulation will 
#' correspond to a separate phylogeny in \code{tree}, which is recycled as 
#' needed. For example, providing a "\code{multiSimmap}" object of length 3 
#' for \code{tree} and specifying \code{nsims=7} will result in 7 simulations 
#' on phylogenies 1, 2, 3, 1, 2, 3, and 1, in that order.
#' 
#' 
#' @param res Controls the approximate number of timepoints at which to sample 
#' trait values across the entire height of the phylogeny (i.e., from root to 
#' last-surviving tip). Higher values result in more densely-sampled character 
#' histories but take longer to simulate and use more computer memory.
#' 
#' 
#' @param Xsig2 A list of numeric matrices/vectors specifying the evolutionary 
#' rate matrices for each discrete character state mapped onto the phylogeny. 
#' Evolutionary rate matrices are symmetric matrices with diagonal and 
#' off-diagonal entries corresponding to evolutionary rates and covariances, 
#' respectively. Vectors are assumed to be diagonal matrices (i.e., no 
#' evolutionary covariance among traits). 
#' 
#' Matrix/vector entries are assigned 
#' to pairs of traits based on associated row/column names (plain names in the 
#' case of vectors), which are matched to \code{trait}. When possible, 
#' unspecified matrix entries (\code{NA} entries) and row/column names are 
#' automatically set to make inputted matrices symmetric. Any remaining 
#' unnamed/unassigned rows/columns are recycled as a block-diagonal matrix to 
#' "fill in" values for traits completely lacking input in the same order as 
#' given in \code{traits}. All remaining unspecified rates and covariances 
#' default to 1 and 0, respectively. Specifying an invalid variance-covariance 
#' matrices (either due to asymmetry or not being positive semidefinite) 
#' results in an error. 
#' 
#' List elements are assigned to discrete character states 
#' if named according to the state names given in \code{tree}. Any 
#' unnamed/unassigned list elements are recycled to "fill in" for states 
#' lacking input in alphanumeric order. If no unnamed/unassigned list elements 
#' are available, defaults to identity matrix (i.e., all rates of 1 with no 
#' covariance). 
#' 
#' To specify differing evolutionary rate matrices for each 
#' phylogeny/simulation, format \code{Xsig2} instead as a list of lists or 
#' matrix-shaped list with each column/column name corresponding to different 
#' discrete character states (see Recycling Behavior section for further 
#' explanation of parameter recycling across phylogenies/simulations).
#' 
#' 
#' @param Ysig2 A list of numeric matrices/vectors specifying the intraspecifc 
#' and/or measurement error for each tip/node in the phylogeny. These are 
#' symmetric matrices with diagonal and off-diagonal entries corresponding to 
#' the variances and covariances, respectively, of trait measurements for a 
#' particular tip/node. Vectors are assumed to be diagonal matrices (i.e., no 
#' covariance among trait measurements within a given node). 
#' 
#' Matrix/vector 
#' entries are assigned to pairs of traits based on associated row/column names 
#' (plain names in the case of vectors), which are matched to \code{trait}. 
#' When possible, unspecified matrix entries (\code{NA} entries) and row/column 
#' names are automatically set to make inputted matrices symmetric. Any 
#' remaining unnamed/unassigned rows/columns are recycled as a block-diagonal
#' matrix to "fill in" values for traits completely lacking input in the same 
#' order as given in \code{traits}. All remaining unspecified matrix entries 
#' default to 0. Specifying an invalid variance-covariance matrices (either 
#' due to asymmetry or not being positive semidefinite) results in an error. 
#' 
#' List elements are assigned to tips/nodes if named according to 
#' \code{tree$tip.label}/\code{tree$node.label} (node labels default to their 
#' numeric index if not provided in \code{tree$node.label}). Any 
#' unnamed/unassigned list elements are recycled to "fill in" for 
#' nodes/tips lacking input in order of increasing node index. If no 
#' unnamed/unassigned list elements are available, defaults to matrix of 0s. 
#' 
#' To 
#' specify differing intraspecific/measurement errors for each 
#' phylogeny/simulation, format \code{Ysig2} instead as a list of lists or 
#' matrix-shaped list with each column/column name corresponding to different 
#' tips/nodes (see Recycling Behavior section for further 
#' explanation of parameter recycling across phylogenies/simulations).
#' 
#' 
#' @param mu A list of numeric vectors specifying evolutionary trends for each 
#' discrete character state mapped onto the phylogeny. 
#' 
#' Vector entries are 
#' assigned to traits if named according to  \code{traits}. Unnamed/unassigned 
#' entries are recycled to "fill in" values for any traits lacking input in 
#' the same order as given in \code{traits}. If no unnamed/unassigned entries 
#' are available, defaults to 0. 
#' 
#' List elements are assigned to discrete 
#' character states if named according to the state names given in \code{tree}. 
#' Any  unnamed/unassigned list elements are recycled to "fill in" for states 
#' lacking input in alphanumeric order. If no unnamed/unassigned list elements 
#' are available, defaults to vector of 0s. 
#' 
#' To specify differing trends for 
#' each phylogeny/simulation, format \code{mu} instead as a list of vectors or 
#' matrix with each column/column name corresponding to different discrete 
#' character states (see Recycling Behavior section for  more further 
#' explanation of parameter recycling across phylogenies/simulations).
#' 
#' 
#' @param verbose \code{TRUE} or \code{FALSE}: do you want to print progress 
#' bars to monitor the progress of the contsimmapping algorithm? Defaults to
#' \code{FALSE}.
#' 
#' 
#' 
#' @return An object of class "\code{contsimmap}", which is essentially a
#' fancy 3D array-shaped list with rows corresponding to edges, columns
#' to traits, and slices to simulations. Each element in the list is a vector
#' of simulated continuous trait values. Thus, to limit investigation to 
#' particular edge(s) \code{i}, trait(s) \code{j}, and/or simulation(s) 
#' \code{k}, one may simply subset \code{contsimmap} objects using standard
#' indexing notation, e.g., \code{contsimmap[i, j, k]} (as with base R, one 
#' may omit indices to avoid subsetting along a particular dimension, e.g.,
#' \code{contsimmap[i, , k]}). Lists of "edgewise information" for a 
#' \emph{single} edge/trait/simulation combination may be directly extracted 
#' using double brackets, e.g., \code{contsimmap[[i, j, k]]}. This yields a list of
#' 3 components:
#' \itemize{
#'  \item{\code{values}: the simulated continuous trait values, including the 
#'  value at the node immediately ancestral to edge \code{i} for convenience.}
#'  \item{\code{ts}: the time points at which the simulated continuous trait 
#'  values occur.}
#'  \item{\code{states}: the discrete character states at each time point (and 
#'  their preceding time interval).}
#' }
#' Lists of multiple "edgewise information lists" may be extracted using a 
#' single vector/matrix of indices with single brackets as in base R, e.g., \code{contsimmap[1:3]}, 
#' \code{contsimmap[cbind(i, j, k)]}, etc.
#' 
#' Notably, \code{contsimmap} objects also store a lot of additional data as
#' object "attributes". These consist of data necessary for downstream 
#' analyses, as well as cached calculations that are simply annoying to 
#' recompute on the fly:
#' \itemize{
#'  \item{\code{ts}: a list of numeric vectors specifying the main time points
#'  at which continuous trait values are sampled along each edge. List elements 
#'  are named and ordered according to edge index. Notably, these vectors 
#'  exclude any time points corresponding to discrete character state shifts, 
#'  and thus apply across all phylogenies/simulations. }
#'  \item{\code{tree}: the list of phylogenies along on which continuous trait 
#'  values are mapped. The inputted \code{tree} argument is always coerced to
#'  an object of class "\code{multiSimmap}" (these include a single discrete
#'  character state named "\code{0}" if no mapped discrete character states
#'  were originally provided).}
#'  \item{\code{maps}: this attribute is mainly for caching calculations 
#'  that would otherwise have to be tediously recomputed from the \code{ts} and
#'  \code{tree} attributes. Thus, I may look into allowing users to discard
#'  this attribute in cases where computer memory is a concern, though the
#'  package isn't set up for this yet. In any case, users need not bother with
#'  accessing/messing with this attribute, but for completeness: this is a 3D
#'  array-shaped list with rows corresponding to edges, columns to the different
#'  phylogenies in the \code{tree} attribute, and slices to various stored 
#'  quantities. These stored quantities are:
#'  \itemize{
#'    \item{\code{coarse}: numeric vectors providing the length of each time interval
#'    spent in a discrete character state along each edge, similar to the
#'    "\code{maps}" element within normal \code{simmap} objects as implemented 
#'    in the \bold{phytools} package.}
#'    \item{\code{ts}: numeric vectors of \emph{all} time points at which continuous
#'    trait values are sampled along each edge, now including those pesky extra 
#'    time points corresponding to discrete character state shifts.}
#'    \item{\code{dts}: numeric vectors of the time intervals between each time point
#'    along each edge and the previous time point. In the case of the first
#'    time point, the previous time point is simply the height of the node
#'    immediately ancestral to the edge.}
#'    \item{\code{bb.dts}: numeric vectors of the time intervals between each time
#'    point and the previous time point divided by the intervals between each
#'    time point and the next "critical time point" (i.e., time points 
#'    corresponding to either the nodes in the phylogeny or discrete character 
#'    state shifts) along each edge. Very helpful for Brownian Motion bridge
#'    calculations.}
#'    \item{\code{bb.sds}: numeric vectors of the standard deviations of  
#'    continuous trait values at each time point along each edge under a
#'    Brownian Motion model with rate 1 and conditioned on fixed continuous
#'    trait values at both the immediately previous time point and next critical time point. Again 
#'    very helpful for Brownian Motion bridge calculations.}
#'    \item{\code{state}: character vectors of discrete character states at
#'    each time point (and its preceding time interval) along each edge.}
#'    \item{\code{incl}: logical vectors returning \code{FALSE} for each time
#'    point corresponding to a discrete character state shift along each edge. 
#'    Helps quickly determine which time points are "main" time points
#'    repeated across all phylogenies/simulations and which aren't.}
#'    \item{\code{inds}: numeric vectors of the indices of critical time
#'    points along each edge.}
#'    }
#'  }
#'  \item{\code{traits}: again, users should generally not view/mess with this 
#'  attribute, which is simply a named vector encoding where the information
#'  about a given trait is stored in the \code{params} attribute.}
#'  \item{\code{params}: this is a pretty critical attribute storing 
#'  information regarding how the different traits in the \code{contsimmap}
#'  object were constructed, but it is hard to interpret directly. Users
#'  wanting to check the parameters used to simulate different traits should
#'  instead use the function \code{get.param.info()}. In any case, this is a
#'  matrix-shaped list with each row corresponding to different stored
#'  information and columns to different groups of traits constructed at the
#'  same time (as encoded in the \code{traits} attribute). The rows are 
#'  specifically:
#'  \itemize{
#'    \item{\code{trait.data}: either the trait data upon which simulated 
#'    continuous trait values were conditioned (i.e., in the case of 
#'    \code{make.contsimmap()}) or the trait data simulated by 
#'    \code{sim.conthistory()}. This is provided in a list of matrices format, 
#'    with list elements corresponding to different phylogenies/simulations, 
#'    matrix rows to observations, and matrix columns to traits.}
#'    \item{\code{Xsig2}: evolutionary rate matrices used to simulate
#'    trait values in matrix-shaped list format, with rows corresponding to
#'    different phylogenies/simulations and columns to discrete character
#'    states.}
#'    \item{\code{Ysig2}: intraspecific variance/measurement error 
#'    matrices used to simulate trait values in matrix-shaped list format, with 
#'    rows corresponding to different phylogenies/simulations and columns to 
#'    tips/nodes.}
#'    \item{\code{mu}: evolutionary trends used to simulate
#'    trait values in matrix-shaped list format, with rows corresponding to
#'    different phylogenies/simulations and columns to discrete character
#'    states.}
#'    \item{\code{lookup}: matrix of indices with each row corresponding to a 
#'    different simulation and columns to \code{trait.data}, \code{Xsig2}, 
#'    \code{Ysig2}, and \code{mu}. This conveys which parameters were used to 
#'    produce each simulation.}
#'    \item{\code{call_info}: a list of information about how groups of traits
#'    were constructed--mainly including the component \code{fxn}, which 
#'    provides the name of the function used to construct traits. More 
#'    complicated trait constructions will generally yield
#'    additional components here, such as the formula used to define 
#'    traits in calls to \code{make.traits()}, the names of traits used to 
#'    modify evolutionary parameters in calls to \code{diffusion()}, the 
#'    breakpoints/state names used to convert continuous traits into 
#'    discrete ones in calls to \code{threshold()}, etc.}
#'    }
#'  }
#' }
#' 
#' @section Recycling Behavior:
#' Each parameter input system (\code{trait.data}, \code{nobs}, \code{X0},
#' \code{Xsig2}, \code{Ysig2}, and \code{mu}) has certain idiosyncrasies, but 
#' they all follow a similar philosophy. There are three steps:
#' \enumerate{
#'  \item{Match labeled inputs to appropriate nodes/traits/states.}
#'  \item{"Fill in" for nodes/traits/states lacking inputs with unlabeled 
#'  inputs if they exist (recycling the unlabeled inputs as needed) and default 
#'  values otherwise.}
#'  \item{If multiple parameter values are provided, different parameter values 
#'  will be applied to each phylogeny in \code{tree}, recycling as necessary. 
#'  This creates some interesting quirks when the length of different parameter 
#'  values exceeds the number of phylogenies. For example, if \code{tree} 
#'  contains 3 phylogenies and 4 lists of matrices are specified for 
#'  \code{Xsig2}, then simulations on the 2nd and 3rd phylogenies will use the 
#'  2nd and 3rd \code{Xsig2} lists, respectively, while simulations on the 1st 
#'  phylogeny will alternate between the 1st and 4th \code{Xsig2} lists.}
#' }
#' I tried to make warning messages informative enough to make potentially 
#' unexpected recycling behaviors apparent. Users specifying complicated
#' simulations involving varying parameter values can always double-check their 
#' inputs were interpreted correctly using \code{get.param.info()}.
#' 
#' @order 1
#' 
#' @export
make.contsimmap<-function(tree,trait.data,
                          nsims=100,res=100,
                          Xsig2=NULL,Ysig2=NULL,mu=NULL,
                          verbose=FALSE){
  #takes care of all the necessary formatting work...
  list2env(.prep.contsimmap(tree,nsims,res,trait.data,Xsig2,Ysig2,mu,TRUE),envir=environment())
  
  #does the post/preorder traversals, returns final simulated trait data
  x<-.cond.traversals(prune.seq,anc,des,ndes,
                      maps,
                      parsed.obs,parsed.mis,nobs,Xsig2,Ysig2[,Yperm,drop=FALSE],mu,lookup,
                      nts,NTS,t1s,seed,x,v,dx,dv,
                      verbose=verbose)
  
  #format output
  attr(x,'ts')<-ts
  attr(x,'tree')<-tree
  attr(x,'maps')<-maps
  attr(x,'treeID')<-treeID
  traits<-colnames(Xsig2[[1]])
  attr(x,'traits')<-setNames(rep(1,length(traits)),traits)
  attr(x,'params')<-matrix(list(trait.data,Xsig2,Ysig2,mu,lookup,list(fxn="make.contsimmap")),
                           6,1,
                           dimnames=list(c('trait.data','Xsig2','Ysig2','mu','lookup','call_info'),NULL))
  .uncompress(x)
}
bstaggmartin/contsimmap documentation built on Aug. 12, 2024, 5:16 a.m.