#'
#' @title
#' Simulation of latent variables \eqn{Z} in the \emph{mixdpclust} model
#'
#' @description
#' Simulates values for latent variables \eqn{Z=(Z_1,...,Z_q)} according to the specification in the \emph{mixdpclust} model.
#'
#' @param Y Matrix or data frame containing the observed data.
#'
#' @param var_type Character vector that indicates the type of variable in each column of Y. Three possible types:
#' \itemize{
#' \item "\strong{c}" for continuous variables.
#' \item "\strong{o}" for ordinal variables (ordered categorical).
#' \item "\strong{m}" for nominal variables (non-ordered categorical).
#' }
#'
#' @param mu_Z an optional vector with the expected values \eqn{\mu_Z} of the latent variables.
#' @param sigma_Z an optional matrix with the covariance matrix \eqn{\Sigma_Z} of the latent variables.
#' @param Z_old an optional matrix with initial values for the latent variables that will be simulated.
#' @param USING_CPP indicates usage of C++ in some modules.
#'
#' @importFrom stats diffinv model.matrix pnorm qnorm relevel runif
#' @importFrom MASS ginv
#'
#' @details
#' For each variable in the \code{Y} data frame, an associated continuous latent variable is generated.
#' if \eqn{Y_j} is continuous, the corresponding \eqn{Z_j} will keep the original values of \eqn{Y_j}.
#' If \eqn{Y_j} is categorical, the function will scan the unique values of \eqn{Y_j} and generate continuous latent variables accordingly.
#'
#' @keywords internal
get_latents <- function( Y,
var_type,
mu_Z=NULL,
sigma_Z=NULL,
Z_old=NULL,
USING_CPP=TRUE ) {
### This function simulates the latent variables for a specified Y ###
Y <- data.frame(as.matrix(Y))
n <- nrow(Y)
p <- ncol(Y)
# possible variable classes that are allowed
var_type_all <- c("c","o","m")
# checking input consistency
if( length(var_type) != ncol(Y) ) {
cat('\nError: The number of columns in "Y" have to be equal to the lenght of vector "var_type"\n')
stop('The number of columns in "Y" have to be equal to the lenght of vector "var_type"')
}
if( any(!is.element(var_type,var_type_all)) ) {
cat('\nError: Elements in "var_type" have to be one of ',paste(var_type_all,collapse = ","),'\n')
stop('Elements in "var_type" have to be one of ',paste(var_type_all,collapse = ","))
}
# number of variables by type
n_c <- sum( is.element( var_type, var_type_all[1] ) )
n_o <- sum( is.element( var_type, var_type_all[2] ) )
n_m <- sum( is.element( var_type, var_type_all[3] ) )
p==n_c+n_o+n_m # TRUE
# Sorting Y columns
Y_new_order <- c( which(is.element(var_type,var_type_all[1])),
which(is.element(var_type,var_type_all[2])),
which(is.element(var_type,var_type_all[3])) )
var_type <- var_type[Y_new_order]
Y <- Y[,Y_new_order,drop=F]
rm(Y_new_order)
# changes the colnames of Y for simplicity and standarization
colnames(Y) <- paste("var_",
c(rep(var_type_all[1],n_c),rep(var_type_all[2],n_o),rep(var_type_all[3],n_m)),"_",
formatC( unlist(mapply(seq,1,c(n_c,n_o,n_m),length.out=c(n_c,n_o,n_m))) , width = 2, flag = '0'),
sep="")
# Ordinal variables as factor
if(n_o>0) {
for (i in which( is.element( var_type, var_type_all[2] ) ) ) {
Y[,i] <- factor(Y[,i])
}
}
# Categorical variables as factor #
if(n_m>0) {
# vector with the number of latent variables that will be used for each categorical variable
n_m_l <- rep(NA,n_m)
# vector with the number of classes for each categorical variable
K_m <- rep(NA,n_m)
for (i in 1:n_m ) {
aux_i <- which( is.element( var_type, var_type_all[3] ) )[i]
Y[,aux_i] <- factor(Y[,aux_i])
K_m[i] <- length(levels(Y[ ,aux_i])) # number of categories in that categorical variable
n_m_l[i] <- K_m[i] - 1 # how many latents are needed for each categorical?
}
} else {
n_m_l<-as.numeric(NULL)
}
# Compute the total number of latents that will be needed #
n_q <- n_c+n_o+sum(n_m_l)
### Latent Variables ###
Z <- matrix(data=NA,nrow=nrow(Y),ncol=n_q)
# colnames of Z #
colnames_Z <- NULL
# colnames of Z: continuous #
if( n_c > 0 ) {
colnames_Z <- c( colnames_Z, paste("var_",
rep(var_type_all[1],n_c),"_",
formatC( 1:n_c, width = 2, flag = '0'), sep="") )
}
# colnames of Z: ordinal #
if( n_o > 0 ) {
colnames_Z <- c( colnames_Z, paste("var_",
rep(var_type_all[2],n_o),"_",
formatC( 1:n_o, width = 2, flag = '0'), sep="") )
}
# colnames of Z: categorical #
if( n_m > 0 ) {
for(i in 1:n_m) {
colnames_Z <- c(colnames_Z, paste("var_",
c( rep(var_type_all[3],n_m_l[i] ) ),"_",
formatC( i, width = 2, flag = '0'),"_",
formatC( 1:n_m_l[i] , width = 2, flag = '0'),
sep=""))
}
}
colnames(Z) <- colnames_Z
##### Simulating Latent values #####
if( is.null(mu_Z) ) {
mu_Z <- matrix(0,nrow=n,ncol=n_q)
colnames(mu_Z) <- colnames_Z
}
if( is.null(sigma_Z) ) {
sigma_Z <- diag(1,nrow=n_q,ncol=n_q)
colnames(sigma_Z) <- rownames(sigma_Z) <- colnames_Z
}
### Latent: Continuous ###
if(n_c>0) {
# cat('Simulating latents for ',n_c,' Continuous variables ')
if( is.null(Z_old) ) {
Z[,1:n_c] <- as.matrix( Y[,1:n_c] )
# standardize the continuous variables
#Z[,1:n_c] <- as.matrix( scale(Y[,1:n_c]) , nrow=nrow(Y), ncol=n_c )
} else {
Z[,1:n_c] <- Z_old[,1:n_c]
}
}
### Latent: Ordinal ###
if(n_o>0) {
#cat('Simulating latents for ',n_o,' Ordinal variables...')
thres_o <- list() # list with the vector of thresholds for each ordinal variable
K_o <- as.numeric(rep(NA,n_o)) # number of categories in each Ordinal variable
# getting number of categories #
for ( j in 1:n_o ) {
# j<-1
# cat(j,',')
Y_ord_j <- which( is.element(var_type,var_type_all[2])) # what columns in Y are ordinal?
Y_ord_j <- Y_ord_j[j] # choose the ith ordinal
# number of categories in that ordinal variable
K_o[j] <- length(levels(Y[,Y_ord_j]))
# obtain the thresholds dividing (-Inf,Inf) into K_o[i] intervals
# recommended thresholds of length 4 with a fixed variance of 1
thres_o[[j]] <- c(-Inf,seq(from=-4*floor((K_o[j]-2)/2),to=4*ceiling((K_o[j]-2)/2),by=4),Inf)
}
# simulating ordinal latents #
for(i in 1:n) {
# i<-1
for(j in 1:n_o) {
# j<-1
cat_ij <- match( Y[i,n_c+j],levels(Y[,n_c+j]) )
if( is.null(Z_old) ) {
if(F) {
# simplified
mu_Z_aux <- mu_Z[i,n_c+j]
sigma_Z_aux <- sigma_Z[n_c+j,n_c+j]
sigma_Z_aux <- sigma_Z_aux
} else {
# the point will come from a distribution such that
# 95% of probability is within the interval
# mu_Z_ij centered in its interval
mu_Z_aux <- thres_o[[j]][cat_ij+c(0,1)]
if(mu_Z_aux[1]==-Inf){mu_Z_aux[1]<-mu_Z_aux[2]-2*qnorm(0.95)}
if(mu_Z_aux[2]==Inf){mu_Z_aux[2]<-mu_Z_aux[1]+2*qnorm(0.95)}
mu_Z_aux <- mean(mu_Z_aux)
mu_Z[i,n_c+j] <- mu_Z_aux
# the variance is set such that it has 95% of being at that interval
sigma_Z_aux <- thres_o[[j]][cat_ij+c(0,1)]
if(any(abs(sigma_Z_aux)==Inf)) {
sigma_Z_aux <- 1
} else {
sigma_Z_aux <- diff(sigma_Z_aux) / (2*qnorm(0.975))
}
# 95% of probability within the interval
# pnorm(q=thres_o[[j]][cat_ij+c(0,1)],mean=mu_Z_aux,sd=sigma_Z_aux)
}
} else {
# if an initial value for Z was given
mu_Z_aux <- mu_Z[i,n_c+j,drop=F]
sigma_Z_aux <- sigma_Z[n_c+j,n_c+j,drop=F]
if(all(dim(sigma_Z)>1)) {
mu_Z_aux <- mu_Z_aux + sigma_Z[(n_c+j),-(n_c+j),drop=F] %*% MASS::ginv(sigma_Z[-(n_c+j),-(n_c+j),drop=F]) %*% t( Z_old[i,-(n_c+j),drop=F] - mu_Z[i,-(n_c+j),drop=F] )
sigma_Z_aux <- sigma_Z_aux + sigma_Z[(n_c+j),-(n_c+j),drop=F] %*% MASS::ginv(sigma_Z[-(n_c+j),-(n_c+j),drop=F]) %*% sigma_Z[-(n_c+j),(n_c+j),drop=F]
}
}
if(USING_CPP) {
Z[i,n_c+j] <- rtn1( mean=mu_Z_aux,
sd=sqrt(sigma_Z_aux),
low=thres_o[[j]][cat_ij],
high=thres_o[[j]][cat_ij+1] )
} else {
Z[i,n_c+j] <- truncnorm::rtruncnorm( n=1,
a=thres_o[[j]][cat_ij], b=thres_o[[j]][cat_ij+1],
mean=mu_Z_aux,
sd=sqrt(sigma_Z_aux) )
}
if( !(Z[i,n_c+j] > thres_o[[j]][cat_ij] & Z[i,n_c+j] < thres_o[[j]][cat_ij+1]) ){
cat('\nError: There was a problem simulating an ordinal latent Z_ij, i=',i,', j=',n_c+j,'\n')
stop('There was a problem simulating an ordinal latent Z_ij, i=',i,', j=',n_c+j,sep="")
}
}
}
#cat(' done!\n')
} else {
thres_o<-as.numeric(NULL)
K_o<-as.numeric(NULL)
}
### Latent: Categorical ###
if(n_m>0) {
#cat('Simulating latents for ',n_m,' Categorical variables...')
for(j in 1:n_m) {
#cat(j,', ')
# attention: j goes one for each categorical, not for each latent!
# j<-2
# defines dummies of Y_cat
Y_j <- as.factor(Y[,n_c+n_o+j])
Y_j <- relevel(Y_j,ref=length(levels(Y_j))) # assign the reference level as the last level
Y_j <- model.matrix( ~.,data = data.frame( Y_j )) # tranform the categorical to dummy variables
Y_j <- as.matrix( Y_j[,-1] )
#head(Y_j)
#head(Y[,n_c+n_o+j])
for ( i in 1:n ) {
# i<-2
# simulating one by one
if( any( Y_j[i,]>0 ) ) { # observation i is different from the last class, at least one latent >0 and the max corresponds to the value of the categorical
# first, simulates the maximum #
j_z_max <- n_c + n_o + diffinv(n_m_l)[j] + which(Y_j[i,]==1) # column of z corresponding with the maximum
if( is.null(Z_old) ) {
mu_Z_aux <- mu_Z[i,j_z_max]
sigma_Z_aux <- sigma_Z[j_z_max,j_z_max]
sigma_Z_aux <- sigma_Z_aux
} else {
mu_Z_aux <- mu_Z[i,j_z_max] + sigma_Z[j_z_max,-j_z_max] %*% ginv(sigma_Z[-j_z_max,-j_z_max]) %*% ( Z_old[i,-j_z_max] - mu_Z[i,-j_z_max] )
sigma_Z_aux <- sigma_Z[j_z_max,j_z_max] + sigma_Z[j_z_max,-j_z_max] %*% ginv(sigma_Z[-j_z_max,-j_z_max]) %*% sigma_Z[-j_z_max,j_z_max]
sigma_Z_aux <- sigma_Z_aux
}
sim_int <- c(pnorm(0,mean=mu_Z_aux,sd=sqrt(sigma_Z_aux)),1) # simulates a positive gaussian number for the maximum
Z[i, j_z_max ] <- qnorm( p=runif(n=1, min=sim_int[1], max=sim_int[2]) , mean=mu_Z_aux, sd=sqrt(sigma_Z_aux))
# then, simulates the rest #
for( s in (1:ncol(Y_j))[-which(Y_j[i,]==1)] ) {
j_z_s <- n_c + n_o + diffinv(n_m_l)[j] + s # column of z corresponding with this value
if( is.null(Z_old) ) {
mu_Z_aux <- mu_Z[i,j_z_s]
sigma_Z_aux <- sigma_Z[j_z_s,j_z_s]
sigma_Z_aux <- sigma_Z_aux
} else {
mu_Z_aux <- mu_Z[i,j_z_s] + sigma_Z[j_z_s,-j_z_s] %*% ginv(sigma_Z[-j_z_s,-j_z_s]) %*% ( Z_old[i,-j_z_s] - mu_Z[i,-j_z_s] )
sigma_Z_aux <- sigma_Z[j_z_s,j_z_s] + sigma_Z[j_z_s,-j_z_s] %*% ginv(sigma_Z[-j_z_s,-j_z_s]) %*% sigma_Z[-j_z_s,j_z_s]
sigma_Z_aux <- sigma_Z_aux
}
sim_int <- c(0,pnorm( Z[i, j_z_max ] ,mean=mu_Z_aux,sd=sqrt(sigma_Z_aux) )) # simulates a gaussian number lower than the maximum
Z[i, j_z_s ] <- qnorm( p=runif(n=1, min=sim_int[1], max=sim_int[2]) , mean=mu_Z_aux, sd=sqrt(sigma_Z_aux))
}
# checking consistency #
if( which(Y_j[i,]==1) != which(Z[i,n_c + n_o + diffinv(n_m_l)[j]+1:ncol(Y_j)]==max(Z[i,n_c + n_o + diffinv(n_m_l)[j]+1:ncol(Y_j)])) ) {
cat('\nError: There is a problem generating a categorical variable i=',i,' j=',j,'\n')
stop('There is a problem generating a categorical variable i=',i,' j=',j,sep='')
}
} else { # observation i is equal to the last class, all values of latents have to be negative
for( s in 1:ncol(Y_j) ) {
j_z_s <- n_c + n_o + diffinv(n_m_l)[j] + s # column of z corresponding with this value
if( is.null(Z_old) ) {
mu_Z_aux <- mu_Z[i,j_z_s]
sigma_Z_aux <- sigma_Z[j_z_s,j_z_s]
sigma_Z_aux <- sigma_Z_aux
} else {
mu_Z_aux <- mu_Z[i,j_z_s] + sigma_Z[j_z_s,-j_z_s] %*% ginv(sigma_Z[-j_z_s,-j_z_s]) %*% ( Z_old[i,-j_z_s] - mu_Z[i,-j_z_s] )
sigma_Z_aux <- sigma_Z[j_z_s,j_z_s] + sigma_Z[j_z_s,-j_z_s] %*% ginv(sigma_Z[-j_z_s,-j_z_s]) %*% sigma_Z[-j_z_s,j_z_s]
sigma_Z_aux <- sigma_Z_aux
}
sim_int <- c(0,pnorm(0,mean=mu_Z_aux,sd=sigma_Z_aux)) # simulates negative gaussian numbers
Z[i, j_z_s ] <- qnorm( p=runif(n=1, min=sim_int[1], max=sim_int[2]) , mean=mu_Z_aux, sd=sigma_Z_aux)
}
# checking consistency #
all(Z[i,n_c + n_o + diffinv(n_m_l)[j]+1:ncol(Y_j)]<0)
}
}
}
#cat('...done!\n')
}
dim(Z)[1]==n # TRUE
dim(Z)[2]==n_q # TRUE
return( list( Z=Z,
n_c=n_c,
n_o=n_o,
n_m_l=n_m_l,
n_q=n_q,
thres_o=thres_o,
K_o=K_o) )
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.