R/src_hdpGLM_simulate.R

Defines functions dpGLM_simulateData_main

## =====================================================
## dpGLM
## =====================================================
dpGLM_simulateData_main <- function(n, K, nCov, nCovj=NULL, parameters=NULL,
                                    pi=NULL, family, seed=sample(1:777,1))
{
    ## error handling
    if (n %% 2) stop("Sample size n must be an even number")
    if(! family %in% c('gaussian', 'binomial', 'multivariate')) stop(paste0('Error: family must be one element of the set : {', paste0(c('gaussian', 'binomial', 'multinomial'), collapse=','),'}'))
    if (!is.null(parameters)){
        ## check the format of parameters
        if(! all(names(parameters) %in% c('pi', 'beta') | names(parameters) %in% c('pi', 'beta', 'K')) | is.null(names(parameters)) ) stop('\n\n\"parameters\" must be a named list. The names must be \'beta\' and \'pi\'.\n ')
        if(! length(parameters$beta) == K) stop(paste0('\n\nThe element \'beta\' of the list \'parameters\' must be a list itself with size ', K, '. Each element of that list must be a vector of ', nCov+1, ' linear parameter(s).' ,  sep=''))
        if(! length(parameters$pi) == K) stop(paste0('\n\nThe element \'pi\' of the list \'parameters\' must be a vector with size ', K,'.' ,  sep=''))
        if(! sum(parameters$pi) == 1) stop ('\n\nThe element \'pi\' of the list \'parameters\' must add up to 1.')
        if(! all(parameters$beta %>% purrr::map(., ~length(.)) %>% unlist == nCov +1)) stop(paste0('\n\nAll elements of the list \'beta\' of the list \'parameters\' must have length ', nCov+1, sep=''))
    }

    if (is.null(parameters)) {
        parameters <- dpGLM_simulateParameters(K=K, nCov=nCov, pi, seed=seed) 
    }
    if(family=='gaussian'){sim_data = .dpGLM_simulateData_gaussian(n=n, K=K, nCov=nCov, parameters, seed=seed)}
    if(family=='binomial'){sim_data = .dpGLM_simulateData_binomial(n=n, K=K, nCov=nCov, parameters, seed=seed)}

    class(sim_data) = "dpGLM_data"
    return(sim_data)
}
dpGLM_simulateParameters <- function(nCov, nCovj=NULL, K=NULL, pi=NULL,
                                     seed=NULL)
{
## ' Simulate the parameters that can be used to simulate data sets from the dpGLM
## '
## ' This function generates parameters that can be used to
## ' simulate data sets from the Hierarchical
## ' Dirichlet Process of Generalized Linear Model (dpGLM)
## '
## ' @param nCov integer, the number of covariates of the GLM components
## ' @param nCovj an integer indicating the number of covariates determining the average parameter of the base measure of the Dirichlet process prior
## ' @param K integer, the number of clusters. If there are multiple contexts, K is the average number of clusters across contexts, and each context gets a number of clusters sampled from a Poisson distribution, except if \code{same.K} is \code{TRUE}.
## ' @param pi either NULL or a vector with length K that add up to 1. If not NULL, it determines the mixture probabilities
## ' @param seed a seed for \code{set.seed}
## '
## ' @return The function returns a list with the parameters
## '         used to generate data sets from the dpGLM model.
## '         This list can be used in the function \code{hdpGLM_simulateData}
## ' 
## ' @examples
## ' parameters = dpGLM_simulateParameters(nCov=2, K=2) 
## '
## ' @export
    if(is.null(seed)) seed <- sample(1:777,1)
    parameters <- list(pi=NA ,beta = list(), K=K)

    ## pi
    ## --
    if(is.null(pi)){
        parameters$pi <- rep(1/K,K)
    }else{
        if(length(pi) != K) stop('\n\n Size of pi must be equal to K\n')
        parameters$pi <- pi
    }

    ## beta
    ## ----
    parameters$beta <- lapply(rep(nCov, K), function(nCov) stats::runif(n=nCov+1, -10,10))
    parameters$beta <- parameters$beta %>% purrr::map(., ~stats::setNames(., paste0('beta', 1:(nCov+1), sep='')))

    return(parameters)
}
.dpGLM_simulateData_gaussian <- function(n, K, nCov, parameters, seed)
{
    set.seed(seed)
    n = floor(n*parameters$pi)

    beta  = parameters$beta
    X     = list()
    y     = list()
    for (k in 1:K){
        epsilon   <- stats::rnorm(n=n[k], 0, 1)

        if(nCov!=0){
            X[[k]] <- MASS::mvrnorm(n=n[k], mu = rep(0,nCov), Sigma = diag(nCov))
            y[[k]] <- base::cbind(1,X[[k]]) %*% beta[[k]] + epsilon
        }else{
            X[[k]] <- NULL
            y[[k]] <- beta[[k]] + epsilon
        }
    }
    if(nCov!=0){
        data   = stats::setNames(data.frame(y = unlist(y), X = data.frame(do.call(base::rbind, X))), nm=c('y', paste0('X',1:nCov)))
    }else{
        data   = data.frame(y = unlist(y))
    }
    return ( list(data=data, Z = rep(1:K,n), parameters=parameters) )
}
.dpGLM_simulateData_binomial <- function(n, K, nCov, parameters,  seed)
{
    set.seed(seed)
    n = floor(n*parameters$pi)

    beta  = parameters$beta
    X     = list()
    y     = list()
    p     = list()
    nu    = list()
    for (k in 1:K){
        epsilon  <- stats::rnorm(n=n[k], 0, 1)

        if(nCov!=0){
            X[[k]]   <- MASS::mvrnorm(n=n[k], mu = rep(0,nCov), Sigma = diag(nCov))
            nu[[k]]  <- base::cbind(1,X[[k]]) %*% beta[[k]]
            p[[k]]   <- 1/(1+exp(-nu[[k]]))
        }else{
            X[[k]]   <- NULL
            nu[[k]]  <- rep(beta[[k]], n[k]) 
            p[[k]]   <- 1/(1+exp(-nu[[k]]))
        }
        y[[k]] = rep(NA, n[k])
        for (i in 1:n[k]) y[[k]][i] = stats::rbinom(n=1, size=1, prob=p[[k]][i])
    }
    if(nCov!=0){
        data   = stats::setNames(data.frame(y = unlist(y), X = data.frame(do.call(base::rbind, X))), nm=c('y', paste0('X',1:nCov)))
    }else{
        data   = data.frame(y = unlist(y))
    }
    return ( list(data=data, Z = rep(1:K,n), parameters=parameters) )
}
## to be completed
.dpGLM_simulateData_multinomial <- function(n, K=2, nCov=0, nCat, pi=pi,
                                            family, seed)
{
    
    set.seed(seed)
    ## rand('state', state);
    ## randn('state', state);

    # n is the total sample size (must be an even number)
    ## J = J = nClass = 4; # dep var  : number of categories of y: y in {1,2,3,4}
    ## D = nCov = l = nVar = 5;   # covars   : dimension of X (does not include the intercept !)
    ## K = 2               # clusters : pre fixed number of components (clusters)
    ## pi = rep(1/K,K)
    ## n = 2*5000
    n = n*pi
    J = nCat

    alpha = list()
    beta  = list()
    tau   = list() 
    nu    = list() 
    mu    = list()
    sd    = list()
    X     = list()
    y     = list()
    eta   = list()
    p     = list()
    for (k in 1:K){
        ## hyperpriors on G_o (for tau and nu)
        tau[[k]]  <-  sqrt( exp( stats::rnorm(n=1, mean=0, sd=.1)) );    ## ln(tau^2) ~ N(0,.1^2)
        nu[[k]]   <-  sqrt( exp( stats::rnorm(n=1, mean=0, sd= 2)) );     ## ln(nu^2)  ~ N(0,2^2)

        ## Priors for theta defined by G_o
        mu[[k]]     <- stats::rnorm(n=nCov, mean=0, sd=1);                    ## mu          ~ N_d(0,1)   which is expect of X ~ N_d(mu, sigma^2 * I)
        sd[[k]]     <- sqrt( exp( stats::rnorm(n=nCov, mean=0, sd=2) ) );  ## ln(sigma^2) ~ N_d(0,2^2) which is var    or X ~ N_d(mu, sigma^2 * I)
        alpha[[k]]  <- stats::rnorm(n=J,mean=0, sd=tau[[k]] )
        beta[[k]]   <- MASS::mvrnorm(n=nCov,mu=rep(0,J), Sigma=nu[[k]]^2  * diag(J))

        ## generating X's
        ## --------------
        X[[k]]  <-  MASS::mvrnorm(n=n[k], mu = mu[[k]], Sigma = sd[[k]] * diag(nCov))

        ## computing y, eta = alpha + X^T beta, and p = exp(eta)/sum(exp(eta))
        ## -------------------------------------------------------------------
        eta[[k]]   <-  base::cbind(1, X[[k]]) %*% base::rbind(alpha[[k]], beta[[k]])
        eta[[k]]   <-  eta[[k]] + MASS::mvrnorm(n[k], mu=rep(0,J), Sigma = .05 * diag(J))
        p[[k]]     <-  t(apply(eta[[k]], 1, function(eta.kj) exp(eta.kj) / sum(exp(eta.kj)) ))
        y[[k]] <- rep(NA, n[k])
        for (i in 1:n[k]) y[[k]][i]  = which(stats::rmultinom(n=1, size=1, prob=p[[k]][i,]) == 1)

    }
    return ( list(y = unlist(y),
                  X = stats::setNames(data.frame(do.call(base::rbind, X)), nm=paste0('X',1:nCov) ),
                  Z = rep(1:K,n)) )
}


## =====================================================
## hdpGLM
## =====================================================
hdpGLM_simulateData_main <- function(n, K, nCov, nCovj=NULL, J, parameters=NULL,
                                     pi=NULL, family, same.K,
                                     seed=sample(1:777,1), context.effect,
                                     same.clusters.across.contexts,
                                     context.dependent.cluster)
{
    ## error handling ----------------------------------------
    if (n %% 2) stop("Sample size n must be an even number")
    if(! family %in% c('gaussian', 'binomial', 'multivariate')) stop(paste0('Error: family must be one element of the set : {', paste0(c('gaussian', 'binomial', 'multinomial'), collapse=','),'}'))
    if (!is.null(parameters)){
        ## check the format of parameters
        ## if(! all(names(parameters) %in% c('pi', 'beta') | names(parameters) %in% c('pi', 'beta', 'K', 'W')) | is.null(names(parameters)) ) stop('\n\n\"parameters\" must be a named list. The names must be \'beta\' and \'pi\'.\n ')
        if(! length(parameters$beta) == K) stop(paste0('\n\nThe element \'beta\' of the list \'parameters\' must be a list itself with size ', K, '. Each element of that list must be a vector of ', nCov+1, ' linear parameter(s).' ,  sep=''))
        if(! length(parameters$pi) == K) stop(paste0('\n\nThe element \'pi\' of the list \'parameters\' must be a vector with size ', K,'.' ,  sep=''))
        if(! sum(parameters$pi) == 1) stop ('\n\nThe element \'pi\' of the list \'parameters\' must add up to 1.')
        if(! all(parameters$beta %>% purrr::map(., ~length(.)) %>% unlist == nCov +1)) stop(paste0('\n\nAll elements of the list \'beta\' of the list \'parameters\' must have length ', nCov+1, sep=''))
    }
    ## -----------------------------------------------------


    if (is.null(parameters)) {parameters <- hdpGLM_simulateParameters(K=K, nCov=nCov, nCovj=nCovj, J=J, pi, same.K, seed=seed, 
                                                                      context.effect=context.effect,same.clusters.across.contexts=same.clusters.across.contexts, context.dependent.cluster=context.dependent.cluster)}
    if(family=='gaussian')   {sim_data = .hdpGLM_simulateData_gaussian(n=n, K=K, nCov=nCov, nCovj=nCovj, parameters, seed=seed)}
    if(family=='binomial')   {sim_data = .hdpGLM_simulateData_binomial(n=n, K=K, nCov=nCov, nCovj=nCovj, parameters, seed=seed)}

    class(sim_data) = "hdpGLM_data"
    return(sim_data)
}

hdpGLM_simulateParameters <- function(nCov, K=NULL, nCovj=NULL, J=NULL, pi=NULL,
                                      same.K, seed=NULL, context.effect=NULL,
                                      same.clusters.across.contexts,
                                      context.dependent.cluster)
{
    Dw = nCovj
    Dx = nCov
    if(is.null(seed)) seed <- base::sample(1:777,1)

    ## K
    ## -
    ## if(!same.K){
    ##     Ks = stats::rpois(J, lambda=K)
    ##     Ks[Ks==0] = 1
    ## }else{
    ##     Ks = rep(K, J)
    ## }

    parameters <- list(pi=NA ,beta = list(), K=K)

    ## pi
    ## --
    if(is.null(pi)){
        parameters$pi <- rep(1/K,K)
    }else{
        if(length(pi) != K) stop('\n\n Size of pi must be equal to K\n')
        parameters$pi <- pi
    }

    ## tau
    ## ---
    if(Dw ==0) {
        parameters$tau = as.matrix(0)  
        W              = as.matrix(1)  
    }
    if(Dw  >0) {
        ## user can specify specific betas to be affected by some specific context-level covars only, as well as which context level covar will affect that beta (instead of all context-level covars)
        if(!is.null(context.effect)){
            n.of.non.zero.taus = length(context.effect$on.betas) + length(context.effect$by.Ws)
            ## all taus are zero...
            parameters$tau = matrix(0, nrow=Dw+1, ncol=Dx+1)
            ## ... but the intercept and the coefficients tau of those W's that the user wants to affect beta
            parameters$tau[c(1, context.effect$by.Ws+1), context.effect$on.betas+1] = as.matrix(MASS::mvrnorm(n=1 , mu=rep(0,n.of.non.zero.taus), Sigma=5*diag(n.of.non.zero.taus)))  
        }else{
            ## if they don't all taus are different from zero and all W's affect all betas
            parameters$tau = as.matrix(MASS::mvrnorm(n=Dw+1, mu=rep(0, Dx+1), Sigma=5*diag(Dx+1)))  
        }
        Wprime         = matrix(MASS::mvrnorm(n=J, mu=rep(0, Dw) , Sigma=5*diag(Dw)), nrow=J, ncol=Dw)
        W              = cbind(1,Wprime)
        parameters$W   = W
    }
    colnames(parameters$W)   = paste0("W", 0:Dw, sep='')
    colnames(parameters$tau) = paste0("beta", 0:Dx, sep='')
    rownames(parameters$tau) = paste0("w", 0:Dw, sep='')
    
    ## beta
    ## ----
    parameters$beta <- lapply(1:J, function(J) rep(list(rep(NA, Dx+1)),K))
    tau = parameters$tau
    for (j in 1:J) {
        for (k in 1:K) {
            parameters$beta[[j]][[k]] = MASS::mvrnorm(n=1, mu=W[j, ] %*% tau, diag(nCov+1))
            names(parameters$beta[[j]][[k]] ) = paste0('beta', 1:(nCov+1))
        }
        names(parameters$beta[[j]]) = paste0('k=', 1:K, sep='')
        
    }
    if(!is.null(same.clusters.across.contexts)){
        beta = parameters$beta[[1]]
        for (j in 1:J) {
            for (k in 1:K) {
                if(!is.null(context.effect)){
                    if (context.dependent.cluster == 0 | k != context.dependent.cluster) {
                        parameters$beta[[j]][[k]][-(context.effect[1]+1)] = beta[[k]][-(context.effect[1]+1)]
                    }else{
                        parameters$beta[[j]][[k]] = beta[[k]]
                    }
                }else{
                    parameters$beta[[j]][[k]] = beta[[k]]
                }
            }
        }
    }
    names(parameters$beta) = paste0('j=', 1:J, sep='')

    return(parameters)
}

.hdpGLM_simulateData_gaussian <- function(n, K, nCov, nCovj=NULL, parameters,
                                          seed)
{
    set.seed(seed)
    n = floor(n*parameters$pi)

    W     = parameters$W
    J     = nrow(W)
    betas = parameters$beta
    X     = matrix(NA, ncol=nCov, nrow=0) # +1 to C_i
    Z     = c()
    C     = c()
    y     = c()
    for (j in 1:J)
    {
        for (k in 1:K){
            epsilon   <- stats::rnorm(n=n[k], 0, 1)

            if(nCov!=0){
                X_tmp <- MASS::mvrnorm(n=n[k], mu = rep(0,nCov),
                                       Sigma = diag(nCov))
                y_tmp <- base::cbind(1,X_tmp) %*% betas[[j]][[k]] + epsilon
            }else{
                X_tmp <- NULL
                y_tmp <- rep(1, n[k]) * betas[[j]][[k]] + epsilon
            }
            if(!is.null(X_tmp)) X = base::rbind(X, X_tmp)
            Z = c(Z, rep(k, n[k]))
            C = c(C, rep(j, n[k]))
            y = c(y, y_tmp)
        }
    }
    if(nCov!=0){
        data = data.frame(y=y, X=X, C=C) %>%
            stats::setNames(., nm = c('y', paste0('X', 1:nCov, sep=''), 'C')) %>%
            dplyr::full_join(., parameters$W %>%
                                tibble::as_tibble()  %>% 
                                dplyr::mutate(C=1:J), by='C' )
    }else{
        data  = data.frame(y = y, C=C)
    }
    return ( list(data=data, Z = Z, parameters=parameters) )
}
.hdpGLM_simulateData_binomial <- function(n, K, nCov, nCovj=NULL, parameters,
                                          seed)
{
    set.seed(seed)
    n = floor(n*parameters$pi)
    
    W     = parameters$W
    J     = nrow(W)
    betas = parameters$beta
    X     = matrix(NA, ncol=nCov, nrow=0) # +1 to C_i
    Z     = c()
    C     = c()
    y     = c()
    for (j in 1:J)
    {
        for (k in 1:K){
            epsilon  <- stats::rnorm(n=n[k], 0, 1)
            
            if(nCov!=0){
                X_tmp  <- MASS::mvrnorm(n=n[k], mu = rep(0,nCov),
                                        Sigma = diag(nCov))
                nu     <- base::cbind(1,X_tmp) %*% betas[[j]][[k]]
                p_tmp  <- 1/(1+exp(-nu))
            }else{
                X_tmp  <- NULL
                nu     <- rep(betas[[j]][[k]], n[k]) 
                p_tmp  <- 1/(1+exp(-nu))
            }
            y_tmp = rep(NA, n[k])
            for (i in 1:n[k]) y_tmp[i] = stats::rbinom(n=1, size=1,
                                                       prob=p_tmp[i])

            if(!is.null(X_tmp)) X = base::rbind(X, X_tmp)
            Z = c(Z, rep(k, n[k]))
            C = c(C, rep(j, n[k]))
            y = c(y, y_tmp)
        }
    }
    if(nCov!=0){
        data = data.frame(y=y, X=X, C=C) %>%
            stats::setNames(., nm = c('y', paste0('X', 1:nCov, sep=''), 'C')) %>%
            dplyr::full_join(., parameters$W %>%
                                tibble::as_tibble()  %>% 
                                dplyr::mutate(C=1:J), by='C' )
    }else{
        data  = data.frame(y = y, C=C)
    }
    return ( list(data=data, Z = Z, parameters=parameters) )
}

## to be completed
.hdpGLM_simulateData_multinomial <- function(n, K=2, nCov=0, nCat, pi=pi,
                                             family, seed)
{
    
    set.seed(seed)
    ## rand('state', state);
    ## randn('state', state);

    # n is the total sample size (must be an even number)
    ## J = J = nClass = 4; # dep var  : number of categories of y: y in {1,2,3,4}
    ## D = nCov = l = nVar = 5;   # covars   : dimension of X (does not include the intercept !)
    ## K = 2               # clusters : pre fixed number of components (clusters)
    ## pi = rep(1/K,K)
    ## n = 2*5000
    n = n*pi
    J = nCat

    alpha = list()
    beta  = list()
    tau   = list() 
    nu    = list() 
    mu    = list()
    sd    = list()
    X     = list()
    y     = list()
    eta   = list()
    p     = list()
    for (k in 1:K){
        ## hyperpriors on G_o (for tau and nu)
        tau[[k]]  <-  sqrt( exp( stats::rnorm(n=1, mean=0, sd=.1)) );    ## ln(tau^2) ~ N(0,.1^2)
        nu[[k]]   <-  sqrt( exp( stats::rnorm(n=1, mean=0, sd= 2)) );     ## ln(nu^2)  ~ N(0,2^2)

        ## Priors for theta defined by G_o
        mu[[k]]     <- stats::rnorm(n=nCov, mean=0, sd=1);                    ## mu          ~ N_d(0,1)   which is expect of X ~ N_d(mu, sigma^2 * I)
        sd[[k]]     <- sqrt( exp( stats::rnorm(n=nCov, mean=0, sd=2) ) );  ## ln(sigma^2) ~ N_d(0,2^2) which is var    or X ~ N_d(mu, sigma^2 * I)
        alpha[[k]]  <- stats::rnorm(n=J,mean=0, sd=tau[[k]] )
        beta[[k]]   <- MASS::mvrnorm(n=nCov,mu=rep(0,J), Sigma=nu[[k]]^2  * diag(J))

        ## generating X's
        ## --------------
        X[[k]]  <-  MASS::mvrnorm(n=n[k], mu = mu[[k]], Sigma = sd[[k]] * diag(nCov))

        ## computing y, eta = alpha + X^T beta, and p = exp(eta)/sum(exp(eta))
        ## -------------------------------------------------------------------
        eta[[k]]   <-  base::cbind(1, X[[k]]) %*% base::rbind(alpha[[k]], beta[[k]])
        eta[[k]]   <-  eta[[k]] + MASS::mvrnorm(n[k], mu=rep(0,J), Sigma = .05 * diag(J))
        p[[k]]     <-  t(apply(eta[[k]], 1, function(eta.kj) exp(eta.kj) / sum(exp(eta.kj)) ))
        y[[k]] <- rep(NA, n[k])
        for (i in 1:n[k]) y[[k]][i]  = which(stats::rmultinom(n=1, size=1, prob=p[[k]][i,]) == 1)

    }
    return ( list(y = unlist(y),
                  X = stats::setNames(data.frame(do.call(base::rbind, X)), nm=paste0('X',1:nCov) ),
                  Z = rep(1:K,n)) )
}



## {{{ docs }}}

#'
#' Simulate a Data Set from hdpGLM
#'
#' @param n integer, the sample size of the data. If there are multiple contexts, each context will have n cases.
#' @param K integer, the number of clusters. If there are multiple contexts, K is the average number of clusters across contexts, and each context gets a number of clusters sampled from a Poisson distribution, except if \code{same.K} is \code{TRUE}.
#' @param nCov integer, the number of covariates of the GLM components.
#' @param nCovj an integer indicating the number of covariates determining the average parameter of the base measure of the Dirichlet process prior
#' @param J an integer representing the number of contexts  @param parameters either NULL or a list with the parameters  to generate the model. If not NULL, it must  contain a sublist name beta, a vector named tau,  and a vector named pi. The sublist beta must be  a list of vectors, each one with size nCov+1  to be the coefficients of the GLM mixtures  components that will generate the data.  For the vector tau, if nCovj=0 (single-context case)  then it must be a 1x1 matrix containing 1.  If nCovj>0, it must be a (nCov+1)x(nCovj+1) matrix.  The vector pi must add up to 1 and have length K.
#' @param family a character with either 'gaussian', 'binomial', or 'multinomial'. It indicates the family of the GLM components of the mixture model.
#' @param parameters a list with the parameter values of the model. Format should be the same of the output of the function hdpGLM_simulateParameters()
#' @param pi either NULL or a vector with length K that add up to 1. If not NULL, it determines the mixture probabilities
#' @param same.K boolean, used when data is sampled from more than one context. If \code{TRUE} all contexts get the same number of clusters. If \code{FALSE}, each context gets a number of clusters sampled from a Poisson distribution with expectation equals to \code{K} (current not implemented)
#' @param seed a seed for \code{set.seed}
#' @param context.effect either \code{NULL} or a two dimensional integer vector. If it is \code{NULL}, all the coefficients (\code{beta}) of the individual level covariates are functions of context-level features (\code{tau}). If it is not \code{NULL}, the first component of the vector indicates the index of the lower level covariate (\code{X}) whose linear effect \code{beta} depends on context (\code{tau}) (0 is the intercept). The second component indicates the index context-level covariate (\code{W}) whose linear coefficient (\code{tau}) is non-zero.
#' @param same.clusters.across.contexts boolean, if \code{TRUE} all the contexts will have the same number of clusters AND each cluster will have the same coefficient \code{beta}.
#' @param context.dependent.cluster integer, indicates which cluster will be context-dependent. If \code{zero}, all clusters will be context-dependent 
#'
#' @export

## }}}
hdpGLM_simulateData <- function(n, K, nCov=2, nCovj=0, J=1, family='gaussian',
                                parameters=NULL, pi=NULL, same.K=FALSE ,
                                seed=NULL , context.effect=NULL ,
                                same.clusters.across.contexts=NULL,
                                context.dependent.cluster=NULL)
{
    

    if (is.null(seed)) {
        seed=sample(1:777,1)
    }
    if (is.null(context.dependent.cluster)) {
        context.dependent.cluster=0
    }
    if(nCovj==0 | J < 2) 
        dat = dpGLM_simulateData_main(n, K, nCov, nCovj=NULL,
                                      parameters=parameters, pi=pi,
                                      family=family, seed=seed) 
    if(nCovj> 0){ 
        dat = hdpGLM_simulateData_main(n, K, nCov, nCovj=nCovj, J=J,
                                       parameters=parameters, pi=pi,
                                       family=family, same.K, seed=seed, 
                                       context.effect,
                                       same.clusters.across.contexts=same.clusters.across.contexts,
                                       context.dependent.cluster) 
    }
    return(dat)
}


## {{{ docs }}}

#' Simulate the parameters of the model
#'
#' This function generates parameters that can be used to simulate data sets from the Hierarchical Dirichlet Process of Generalized Linear Model (hdpGLM) or dpGLM
#'
#' @param K integer, the number of clusters. If there are multiple contexts, K is the average number of clusters across contexts, and each context gets a number of clusters sampled from a Poisson distribution, except if \code{same.K} is \code{TRUE}.
#' @param nCov integer, the number of covariates of the
#'             GLM components
#' @param nCovj an integer indicating the number of covariates determining the average parameter of the base measure of the Dirichlet process prior
#' @param J an integer representing the number of contexts  @param parameters either NULL or a list with the parameters  to generate the model. If not NULL, it must  contain a sublist name beta, a vector named tau,  and a vector named pi. The sublist beta must be  a list of vectors, each one with size nCov+1  to be the coefficients of the GLM mixtures  components that will generate the data.  For the vector tau, if nCovj=0 (single-context case)  then it must be a 1x1 matrix containing 1.  If ncovj>0, it must be a (nCov+1)x(nCovj+1) matrix.  The vector pi must add up to 1 and have length K.
#' @param pi either NULL or a vector with length K that add up to 1. If not NULL, it determines the mixture probabilities
#' @param same.K boolean, used when data is sampled from more than one context. If \code{TRUE} all contexts get the same number of clusters. If \code{FALSE}, each context gets a number of clusters sampled from a Poisson distribution with expectation equals to \code{K} (current not implemented)
#' @param seed a seed for \code{set.seed}
#' @param context.effect either \code{NULL} or a two dimensional integer vector. If it is \code{NULL}, all the coefficients (\code{beta}) of the individual level covariates are functions of context-level features (\code{tau}). If it is not \code{NULL}, the first component of the vector indicates the index of the lower level covariate (\code{X}) whose linear effect \code{beta} depends on context (\code{tau}) (0 is the intercept). The second component indicates the index context-level covariate (\code{W}) whose linear coefficient (\code{tau}) is non-zero.
#' @param same.clusters.across.contexts boolean, if \code{TRUE} all the contexts will have the same number of clusters AND each cluster will have the same coefficient \code{beta}.
#' @param context.dependent.cluster integer, indicates which cluster will be context-dependent. If \code{zero}, all clusters will be context-dependent 
#'
#' @return The function returns a list with the parameters used to generate data sets from the hdpGLM model. This list can be used in the function \code{hdpGLM_simulateData}
#' 
#' @examples
#' pars = hdpGLM_simParameters(nCov=2, K=2, nCovj=3, J=20,
#'           same.clusters.across.contexts=FALSE, context.dependent.cluster=0) 
#'
#' @export
## }}}
hdpGLM_simParameters <- function(K, nCov=2, nCovj=0, J=1, pi=NULL,
                                 same.K=FALSE , seed=NULL , context.effect=NULL ,
                                same.clusters.across.contexts=NULL,
                                context.dependent.cluster=NULL)
{
    if (is.null(seed)) {
        seed=sample(1:777,1)
    }
    if (is.null(context.dependent.cluster)) {
        context.dependent.cluster=0
    }
    if(nCovj==0 | J < 2) 
        param = dpGLM_simulateParameters(nCov, nCovj=nCovj, K=K, pi=pi,
                                       seed=seed)
    if(nCovj> 0){ 
        param = hdpGLM_simulateParameters(nCov, K=K, nCovj=nCovj,
                                          J=J, pi=pi,
                                          same.K, seed=seed,
                                          context.effect=context.effect,
                                          same.clusters.across.contexts,
                                          context.dependent.cluster)
        }
    return(param)
}

Try the hdpGLM package in your browser

Any scripts or data that you put into this service are public.

hdpGLM documentation built on Oct. 13, 2023, 1:17 a.m.