# R/simulate.R In zhenkewu/lotR: Latent class analysis with Observed Trees in R (lotR)

#### Documented in compute_misrateopt_colpermBpermutationsrmse_bias_AB

#######################################################################
# Simulations
#  1) specify/read in tree structure
#  2) simulate Gaussian diffusion along the trees
#  3) simulate the data (multivariate binary at the tips)
#######################################################################

#' Simulate data and subject-specific indicators from latent class models
#'
#'
#' @param n sample size
#' @param itemprob item probabilities
#' @param classprob class probabilities
#' @param fit fitted object
#'
#'
#' @return a list; x: data; z a vector of integer indicators
#' of class membership.
#'
#' @examples
#'
#' lotR_blca(10,matrix(0.5,nrow=3,ncol=20),c(0.98,0.01,0.01))
#'
#' @seealso [BayesLCA::blca()]
#' @importFrom stats rmultinom runif
#' @export
lotR_blca <- function (n, itemprob = 0.5, classprob = 1, fit = NULL)
{
if (is.null(fit)) {
itemprob <- as.matrix(itemprob)
G <- nrow(itemprob)
M <- ncol(itemprob)
}
else {
itemprob <- fit$itemprob classprob <- fit$classprob
G <- nrow(itemprob)
M <- ncol(itemprob)
}
x <- matrix(runif(n * M), nrow = n)
classvec <- as.vector(rmultinom(1, n, prob = classprob))
ind <- c(0, cumsum(classvec))
z <- rep(NA,n)
for (g in 1:G) {
if (ind[g]<ind[g+1]){
x[(ind[g] + 1):ind[g + 1], ] <- t(t(x[(ind[g] +
1):ind[g + 1], ]) < itemprob[g, ]) * 1
z[(ind[g] + 1):ind[g + 1]] <- g
}
}
make_list(x,z)
}

#' Simulate data and subject-specific indicators from tree-structured latent class models
#'
#' The observations belong to leaves that may further belong to a few groups, each
#' with its own K-class probabilities. We assume all the leaves share the same
#' set of K class-specific response probability profiles.
#'
#' @param n sample size
#' @param itemprob item probabilities; this is shared across leaf nodes; K by J
#' @param mytree see [design_tree()]
#' @param pi_mat class probabilities for  pL leaf nodes; it is pL by K.
#' @param h_pau a p-dim vector of positive values that indicates the branch lengths
#' between a node u and its parent pa(u)
#' @param balanced by default is TRUE to uniformly assign observations to the leaf nodes;
#' otherwise set this to FALSE.
#' @param ratio for a pair of leaves; the sample ratios (larger vs smaller ones);
#' in the event of an odd number of leaves; the smaller leaf in the pair is kept.
#'
#' @return a list
#' \describe{
#' \item{Y}{observations leaf by leaf}
#' \item{curr_leaves}{leaf names, need to be for each row of Y}
#' \item{truth}{a list that contains the simulation truth:
#' \itemize{
#' \item Z true class indicators for all observations
#' \item itemprob a K by J matrix of response probability profiles
#' \item pi_mat the eta_v transformed to pi_v; pL by K
#' \item h_pau a vector of p values, each representing the
#' branch length between the node u and its parent node pa(u)
#' }
#' }
#' }
#'
#' @examples
#'
#' library(igraph)
#' n = 1000
#' tau   <- c(0.6,0.3,0.1)
#' itemprob <- rbind(rep(rep(c(0.9, 0.9), each = 1),9),
#'                   rep(rep(c(0.5, 0.5), each = 1),9),
#'                   rep(rep(c(0.1, 0.1), each = 1),9))
#'
#' data("lotR_example_edges")
#' mytree <- igraph::graph_from_edgelist(lotR_example_edges, directed = TRUE)
#' # Plot tree
#'
#' nodes  <- names(igraph::V(mytree))
#' leaves <- names(igraph::V(mytree)[degree(mytree, mode = "out") == 0])
#' pL = length(leaves)
#' p  = length(igraph::V(mytree))
#'
#' ######
#' K = nrow(itemprob)
#' # specify the nodes that have non-trivial alpha_u, this was called
#' # xi_u, because xi_u = s_u*alpha_u and s_u = 1 if we set it in the simulation.
#' alpha_mat = rbind(logit(prob2stick(tau)[-K]),
#'                   c(-1,-0.5),
#'                   c(1,0.5),
#'                   matrix(0,nrow=p-3,ncol=K-1)
#' )
#'
#' # get lists of ancestors for each leaf_ids:
#' d <- igraph::diameter(mytree,weights=NA)
#' # need to set weight=NA to prevent the use of edge lengths in determining the diameter.
#' ancestors <- igraph::ego(mytree, order = d + 1, nodes = leaves, mode = "in")
#' ancestors <- sapply(ancestors, names, simplify = FALSE)
#' ancestors <- sapply(ancestors, function(a, nodes) which(nodes %in% a),
#'                     nodes = nodes, simplify = FALSE)
#' names(ancestors) <- leaves
#'
#' # calculate the class probabilities for all leaf nodes; each leaf node
#' # should have a K-dim vector that sums to one; Some nodes may share
#' # the same set of K-dim probability vector, others may differ. There are
#' # one or more groups of leaf nodes with distinct K-dim probability vectors.
#' # Note the branch lengths may also be used here.
#' pi_mat <- matrix(NA,nrow=pL,ncol=K)
#' for (v in seq_along(leaves)){
#'   pi_mat[v,-K] <- colSums(alpha_mat[ancestors[[v]],,drop=FALSE])
#'   pi_mat[v,] <- tsb(c(expit(pi_mat[v,-K]),1))
#' }
#'
#' # s = c(1, 1,1,0,0, rep(0,pL)) # effective nodes
#' h_pau = rep(1,p)
#'
#' lotR_example_data_tree <- simulate_lcm_tree(n,itemprob,mytree,pi_mat,h_pau)
#' #save the simulated data to the R package for illustration:
#' # save(lotR_example_data_tree, file = "data/lotR_example_data_tree2.rda", compress = "xz")
#' @seealso [BayesLCA::blca()]
#' @importFrom stats rmultinom runif
#' @importFrom igraph V
#' @export
simulate_lcm_tree <- function (n, itemprob, mytree, pi_mat, h_pau,balanced=TRUE,ratio=4)
{
# a few quick calculations:
K <- nrow(itemprob)
nodes  <- names(V(mytree))
leaves <- names(V(mytree)[degree(mytree, mode = "out") == 0])
pL <- length(leaves)
p  <- length(V(mytree))

# simulate number of observations for each leaf node:
N_sim <- rep(1:pL,each=2) #at least two observations per leaf node.
prob_vec <- rep(1/pL,pL)
if (!balanced){
prob_vec <- rep(c(1,ratio), c(floor(pL/2),pL-floor(pL/2))) # currently 1:4 ratio.
prob_vec <- sample(prob_vec/sum(prob_vec))
}
N_sim <- c(N_sim,sample(1:pL,size=n-2*pL,prob=prob_vec,replace=TRUE)) # even leafs.
N_sim <- as.integer(table(sort(N_sim)))

# simulate the observations leaf by leaf: Y, curr_leaves, truth (
# - true set of nodes with non-trivial increment xi_u, i.e.,
# alpha_u, the actual values of these increments, the eta_v which
# sums over all non-trivial ancestral nodes' value of alpha_u;
# - the eta_v transformed to pi_v;
# - the class-specific response probabilities)

# simulate the multivariate responses
Y_sim <- list()
# simulate the class memberships
Z_sim <- list()
curr_leaves_sim <- list()
for (v in 1:pL){
simu <- lotR_blca(N_sim[v], itemprob = itemprob, classprob = pi_mat[v,])
Y_sim[[v]] <- simu$x curr_leaves_sim[[v]] <- rep(v,N_sim[v]) Z_sim[[v]] <- simu$z
}

Y <- do.call("rbind",Y_sim)
curr_leaves <- leaves[do.call("c",curr_leaves_sim)]
Z <- do.call("c",Z_sim)

truth <- make_list(Z,itemprob, mytree, pi_mat,h_pau)
make_list(Y,curr_leaves,truth)
}

#' generate all permutations of 1 to n
#'
#' Copied from Museful's solution from here
#' <https://stackoverflow.com/questions/11095992/generating-all-distinct-permutations-of-a-list-in-r>
#' This also mimics RcppAlgos::permuteGeneral(n,n)
#'
#' @param n an integer
#'
#' @examples
#' permutations(3)
#'
#' @returns a matrix of n columns and factorial(n) rows
#' @export
permutations <- function(n){
if(n==1){
return(matrix(1))
} else {
sp <- permutations(n-1)
p <- nrow(sp)
A <- matrix(nrow=n*p,ncol=n)
for(i in 1:n){
A[(i-1)*p+1:p,] <- cbind(i,sp+(sp>=i))
}
return(A)
}
}

#' Get optimal column permutation of a matrix B to match the matrix A
#'
#' Useful in simulations where the posterior sampling relabels the classes,
#' which is equivalent and cannot be told apart by the likelihood only.
#'
#' @param A a matrix (e.g., truth)
#' @param B another matrix (e.g., estimated); A and B must have the same dimensions;
#' B's columns may be permutated to best match those of A.
#'
#' @return a permutation of columns of B (represented by a permuted 1:ncol(A))
#'
#' @examples
#'
#' A <- matrix(c(1,2,3,4,5,6),nrow=2,ncol=3)
#' B <- A[,c(2,1,3)]
#' opt_colpermB(A,B) # should expect c(2,1,3)
#' @export
opt_colpermB <- function(A,B){
if (dim(A)[1]!=dim(B)[1] && dim(A)[2]!=dim(B)[2]){
stop("[lotR] matrix of different dimensions")}
n <- ncol(A)
col_perms  <- permutations(n)
dist_mat   <- apply(col_perms, 1, function(col) sum(abs(A- B[,col,drop=FALSE])^2))
optim_cols <- which.min(dist_mat)
col_perms[optim_cols, ]
}

#' calculate root mean squared errors and biases
#'
#' used for comparing truth and estimates
#'
#' @param A a matrix (e.g., truth)
#' @param B another matrix (e.g., estimated); A and B must have the same dimensions;
#' B's columns must have already been permuted to best match those of A.
#'
#' @return a list
#' \describe{
#' \item{rmse_total}{root mean squared error for all entries}
#' \item{rmse_marg}{root mean squared error for all entries by column of A}
#' \item{frac_bias}{percent bias averaged over all entries}
#' \item{frac_bias_marg}{ percent bias by column of A}
#' }
#' @examples
#'
#' A <- matrix(c(1,2,3,4,5,6),nrow=2,ncol=3)
#' B <- A
#' rmse_bias_AB(A,B)
#' @export
rmse_bias_AB <- function(A,B){
if (dim(A)[1]!=dim(B)[1] && dim(A)[2]!=dim(B)[2]){
stop("[lotR] matrix of different dimensions")}
rmse_total <- sqrt(mean((A-B)^2))
rmse_marg <- sqrt(colMeans((A-B)^2))
frac_bias  <- mean(abs(A-B)/abs(A))
frac_bias_marg <- colMeans(abs(A-B)/abs(A))
make_list(rmse_total,rmse_marg,frac_bias,frac_bias_marg)
}

#' calculate the misclassification rates
#'
#' the first is the truth, the second is the estimated ones
#'
#' @param a truth class labels
#' @param b estimated class labels; a and b are of the same length
#'
#' @return the overall and the class-specific misclassification errors
#' @examples
#'
#' compute_misrate(c(1,1,1,2,2,2,3,3,3),c(1,1,1,1,2,2,2,2,2))
#' @export
compute_misrate <- function(a,b){
if (length(a)!=length(b)){stop("[lotR] the two vectors are of distinct lengths")}
K <- length(unique(a))
res <- matrix(NA,nrow=K,ncol=K)
for (k1 in 1:K){
for (k2 in 1:K){
res[k1,k2] <- sum(a==k1 & b==k2)
}
}
n <- length(a)
c(1-sum(res[cbind(1:K,1:K)])/n,1-(res/rowSums(res))[cbind(1:K,1:K)])
}

zhenkewu/lotR documentation built on April 24, 2022, 2:36 a.m.