##Auto
CorrAR <- function(dim, rho){
Corr <- abs(matrix(rep(1:dim, length = dim) - rep(1:dim, each = dim), nrow = dim))
Corr <- rho^Corr
return(Corr)
}
##Compound
CorrCS <- function(dim, rho){
Corr <- matrix(rho, nrow = dim, ncol = dim)
diag(Corr) <- 1
return(Corr)
}
#'
#' Simulation model1 for longitudinal composition data
#'
#' Simulate sparse observation from longitudinal compositional data \eqn{X}.
# @usage
# Model <- function(n, p, m = 0, intercept = TRUE,
# interval = c(0, 1), ns = 100, obs_spar = 0.6,
# discrete = FALSE, SNR = 1, sigma = 2,
# rho_X, Corr_X = c("CorrAR", "CorrCS"),
# rho_W, Corr_W = c("CorrAR", "CorrCS"),
# Nzero_group = 4,
# range_beta = c(0.5, 1), range_beta_c = 1,
# beta_C , theta.add = c(1, 2, 5, 6), gamma = 0.5,
# basis_W = c("bs", "OBasis", "fourier"),
# df_W = 5, degree_W = 3,
# basis_beta = c("bs", "OBasis", "fourier"),
# df_beta = 5, degree_beta = 3,
# insert = c("FALSE", "basis"), method = c("trapezoidal", "step"))
#
#'
#'
#'
#'
#' @param n sample size
#'
#' @param p size of compositional predictors falling in \eqn{S^p}
#'
#' @param intercept including intercept or not to generate response variable, default is TRUE
#'
#' @param m size of time-invariant predictors. First \code{ceiling(m/2)} columns are generated by \code{bin(1,0.5)} independently;
#' latter \code{(m - ceiling(m/2))} columns are generated \code{norm(0, 1)} independently.
#'
#' @param ns \code{ns} is a scaler specifying length of equally spaced time sequence on domian \code{interval}.
#'
#' @param discrete is logical, specifying whether \eqn{X} is generated at different time points.
#' If \code{distrete = TRUE}, generate \eqn{X} on dense sequence created by \code{max(ns_dense = 1000, 5*ns)} and Then for
#' each subject, random sample \code{ns} points. recommend \code{ns < 200} when \code{distrete = TRUE}.
#'
#' @param interval a length 2 vector indicating time domain.
#'
#' @param obs_spar a percentage used to get sparse ovbservation. Each time point probability \code{obs_spar} to be observed. It allows different subject with
#' different observed time points and size.
#' \code{obs_spar * ns > 5} is required.
#'
#' @param SNR signal to noise ratio.
#'
#' @param sigma,rho_X,Corr_X,rho_W,Corr_W linear combination scaler \code{W}, \code{df_W*p}, is generated from from Multivariate Normal Distribution with mean 0's and
#' ovariance matrix = \code{simga^2 * kronecker(Sigma_X, Sigma_W)}.
#' \code{Corr_X} is correlation structure of \code{Sigma_X} with \eqn{\rho} = \code{rho_X},
#' which controls canonical-correlation between groups for W;
#' \code{Corr_W} is correlation structure of \code{Sigma_W} with \eqn{\rho} = \code{rho_W},
#' which controls correlation within groups of W.
#'
#'
#' @param Nzero_group a even scaler. First \code{Nzero_group} compositional predictors are considered having none zero effect, while others are with 0 coefficients.
#'
#' @param range_beta a sorted vector of length 2 used to generate coefficient matrix \code{B} for compositional predict, which is with demension \code{p*k}.
#' For each column of \code{B}, generate \code{Nzero_group/2} from unifom distribution with range \code{range_beta}, and together with their negatives
#' are ramdom assigned to the first \code{Nzero_group} rows.
#'
#' @param range_beta_c value of coefficients for beta0 and beta_c (coefficients for time-invariant predictors)
#' @param beta_C vectorized coefficients of coefficient matrix for compositional predictors. Could be missing.
#'
#' @param theta.add logical or numerical. If numerical, indicating which ones of compositional predictors of high level mean.
#' If logical, \code{c(1:ceiling(Nzero_group/2), Nzero_group + (1:ceiling(Nzero_group/2)))} are set to with high level mean
#'
#' @param gamma high level mean groups adding log(p * gamma) before convertint into compositional data, otherwise 0.
#'
#' @param basis_W,df_W,degree_W longitudinal compositional data is generated from linear combination of basis \eqn{\Psi(t)}, take exponetial and change into compositional data.
#' \itemize{
#' \item \code{basis_W} is the basis function for \eqn{\Psi(t)} - default is \code{"bs"}.
#' Other choise are \code{"OBasis"} and \code{"fourier"};
#' \item \code{df_W} is the degree of freedom for basis \eqn{\Psi(t)} - default is 10 ;
#' \item \code{degree_W} the is degree for \eqn{\Psi(t)} - default is 3.
#' }
#'
#'@param basis_beta,df_beta,degree_beta coefficinet curve is generate by linear combination of basis \eqn{\Phi(t)}.
#' \itemize{
#' \item \code{basis_beta} is the basis function for \eqn{\Phi(t)} - default is \code{"bs"}.
#' Other choise are \code{"OBasis"} and \code{"fourier"};
#' \item \code{df_beta} is the degree of freedom for basis \eqn{\Phi(t)} - default is 5;
#' \item \code{degree_beta} is the degree for \eqn{\Phi(t)} - default is 3.
#' }
#'
#' @param insert way to interpolation.
#' \itemize{
#' \item \code{"FALSE"} no interpolation.
#' \item \code{"basis"} compositional data is considered as step function, imposing basis on un-observed time points for each subject.
#' }
#' Default is \code{"FALSE"}.
#'
#' @inheritParams FuncompCGL
#'
#' @return a list
#' \item{data}{a list, a vector \code{y} of response variable, a data frame \code{Comp} of sparse observation of longitudinal compositional data,
#' a matrix \code{Zc} of time-invariable predictors, a logtical \code{intercept}.}
#' \item{beta}{a length \code{p*df_beta + m + 1} vector for coefficients}
#' \item{basis.info}{ matrix for basis for beta, combining the first column as time sequence.}
#' \item{data.raw}{ a list, \code{Z_t.full} full observation of longitudinal compositional data,
#' \code{Z_ITG} integral for full observation of longitudinal compositional data,
#' \code{Y.tru} true response without noise,
#' \code{X} longitudinal before converting into compositional data,
#' \code{W} matrix of linear combination scalers, \code{n * (df_W * p)}.}
#'
#' \item{parameter}{ a list of parameters.}
#'
#'
#' @examples
#'
#' df_beta = 5
#' p = 20
#' Data <- Model(n = 50, p = p, m = 2, intercept = TRUE, ns = 50, SNR = 1,
#' rho_X = 0.1, rho_W = 0.2, df_W = 10, df_beta = df_beta, obs_spar = 0.5)
#' names(Data$data)
#' Data.test <- Model(n = 50, p = p, m = 2, intercept = TRUE, ns = 50, SNR = 1,
#' rho_X = 0.1, rho_W = 0.2, df_W = 10, df_beta = df_beta, obs_spar = 0.5,
#' beta_C = Data$beta[1:(p*df_beta)])
#'
#'
#' @export
#'
#' @importFrom MASS mvrnorm
#' @importFrom plyr alply ldply
#' @importFrom utils tail
#'
Model <- function(n, p, m = 0, intercept = TRUE,
interval = c(0, 1), ns = 100, obs_spar = 0.6, discrete = FALSE,
SNR = 1, sigma = 2,
rho_X, Corr_X = c("CorrAR", "CorrCS"),
rho_W, Corr_W = c("CorrAR", "CorrCS"),
Nzero_group = 4,
range_beta = c(0.5, 1), range_beta_c = 1,
beta_C ,
theta.add = c(1, 2, 5, 6), gamma = 0.5,
basis_W = c("bs", "OBasis", "fourier"), df_W = 5, degree_W = 3,
basis_beta = c("bs", "OBasis", "fourier"), df_beta = 5, degree_beta = 3,
insert = c("FALSE", "basis"), method = c("trapezoidal", "step")) {
#### Warnings ####
Corr_X <- match.arg(Corr_X)
Corr_W <- match.arg(Corr_W)
basis_W <- match.arg(basis_W)
insert <- match.arg(insert)
method <- match.arg(method)
basis_beta <- match.arg(basis_beta)
if(obs_spar > 1 || obs_spar < 0) stop("obs_spar is a percentage")
if(basis_W %in% c("bs", "OBasis") && df_W < 4) {
warning("With B-spline basis including intercept DF at least 4")
df_W <- 4 }
if(basis_beta %in% c("bs", "OBasis") && df_beta < 4) {
warning("With B-spline basis including intercept DF at least 4")
df_beta <- 4 }
if(basis_W == "fourier" && df_W %% 2 == 0) {
warning("With Fourier basis, DF needs to be odd")
df_W <- df_W - 1 }
if(basis_beta == "fourier" && df_beta %% 2 == 0) {
warning("With Fourier basis, DF needs to be odd")
df_beta <- df_beta - 1 }
#### Warnings ####
#### Coefficients: beta_C generation.####
## beta_C: df = df_beta, degree = degree_beta
if(missing(beta_C)) {
C <- matrix(0, nrow = p, ncol = df_beta)
for(j in 1:df_beta){
col_input <- round(runif((Nzero_group/2), min = range_beta[1], max = range_beta[2]),
digits = 2)
col_input <- c(col_input, -col_input)
col_permuation <- sample(Nzero_group)
C[1:Nzero_group, j] <- col_input[col_permuation]
}
beta_C <- as.vector(t(C))
}
#### Coefficients: beta and beta_c generation ####
#### Mean pattern #####
add.on <- 0
if (theta.add && !is.numeric(theta.add)) {
## If true, first ceiling(Nzero_group/2) of the None-zero groups are set mean with log(p*gamma)
## first ceiling(Nzero_group/2) of the zero groups are set mean with log(p*gamma)
add.on <- c(1:ceiling(Nzero_group/2), Nzero_group + (1:ceiling(Nzero_group/2)))
} else if (is.numeric(theta.add)) {
if(length(theta.add) > p) stop("None zero theta must be smaller than p")
add.on <- theta.add
}
#### Mean pattern #####
#### time points and basis functions ####
ns_dense <- ns
if(discrete) ns_dense <- max(1000, 5*ns)
sseq <- round(seq(0, 1, length.out = ns_dense), 10)
if(ns * obs_spar < 5) stop("Too Sparse")
W_nknots <- df_W - (degree_W + 1)
if(W_nknots > 0) {
W_knots <- ((1:W_nknots) / (W_nknots + 1)) * diff(interval) + interval[1]
} else {
W_knots <- NULL
}
beta_nknots <- df_beta - (degree_beta + 1)
if(beta_nknots > 0) {
beta_knots <- ((1:beta_nknots) / (beta_nknots + 1)) * diff(interval) + interval[1]
} else {
beta_knots <- NULL
}
beta.basis <- switch(basis_beta,
"bs" = bs(x = sseq, df = df_beta, degree = degree_beta,
Boundary.knots = interval, intercept = TRUE),
"fourier" = eval.basis(sseq, basisobj = create.fourier.basis(rangeval = interval, nbasis = df_beta )),
"OBasis" = evaluate(OBasis(expand.knots(c(interval[1], beta_knots, interval[2])),
order = degree_beta + 1),
sseq)
)
W.basis <- switch(basis_W,
"bs" = bs(x = sseq, df = df_W, degree = degree_W,
Boundary.knots = interval, intercept = TRUE),
"fourier" = eval.basis(sseq, basisobj = create.fourier.basis(rangeval = interval, nbasis = df_W )),
"OBasis" = evaluate(OBasis(expand.knots(c(interval[1], W_knots, interval[2])),
order = degree_W + 1),
sseq)
)
#### time points and basis functions ####
#### Correlation matrix: CorrMIX, of linear combination coefficients vector W ####
##X.sigma is between group (Compositional variables) correlation for W vector
##W.sigma is within group (linear combination coeffcients for each compsitional varaible) correlation for W vector
##CovMIX = sigma^2 * kronecker(X.Sigma, W.Sigma,)
X.Sigma <- do.call(Corr_X, list(dim = p, rho = rho_X))
W.Sigma <- do.call(Corr_W, list(dim = df_W, rho = rho_W))
CovMIX <- sigma^2 * kronecker(X.Sigma, W.Sigma) #sigma*kronecker(W.Sigma, X.Sigma)
#### Correlation matrix: CorrMIX ####
#### linear combination coefficients vector: W generation. Mean variation included ####
#=================================================================================#
#== Mean is log(gamma*p)/0 for X_j(t) if basis generated from bs with Intercept ==#
#== since mean(X_j(t))= W.linear[t, ] %*% theat_j= log(gamma*p)/0 * sum(W.linear[t, ]) ==#
#== sum(W.linear[t, ])= 1, other basis might not be true ==#
#=================================================================================#
mean_curve <- rep(0
#log(p*gamma / 2)
, times = p * df_W)
group2 <- matrix(1:(df_W * p), nrow = df_W)
b <- tail(1:p, floor(p/2))
mean_curve[as.vector(group2[, add.on])] <- mean_curve[as.vector(group2[, add.on])] + log(p*gamma)
#W.linear <- mvrnorm(n, rep(0, times = p * df_W), CovMIX)
W.linear <- mvrnorm(n, mean_curve, CovMIX)
#### linear combination coefficients vector: W ####
#### longitinal composition: X.comp_full generation ####
X <- array(dim = c(n, p, ns_dense), NA)
dimnames(X)[[2]] <- paste0("X", 1:p)
X.comp <- X
I <- diag(p)
for(s in 1:ns_dense) {
W.Phi <- kronecker(I, t(W.basis[s, ]))
## X curve follows normal distribution
X[, , s] <- t( apply(W.linear, 1, function(x, W.Phi) W.Phi %*% x, W.Phi = W.Phi) ) #?
#X[, add.on , s] <- X[, add.on , s] + log(gamma * p)
## X.comp follows a logistic normal distribution for each time point after exp(x(s))/sum(exp(x(s)))
X.comp[, , s] <- exp(X[, , s])
X.comp[, , s] <- X.comp[, , s] / rowSums(X.comp[, , s])
#plot(X.comp[1, , s])
## Under threshold, measurement could not be detected and recorded as 0.
## If threshold=0, all measurement detected.
## 0's are replaced by replace value
##X.comp[, , s][X.comp[, , s] <= threshold] <- replace
}
#### longitinal composition: X.comp_full ####
#### Integration:Z_ITG ####
#X.comp_log <- log(X.comp)
#DD <- alply(X.comp_log, .margins = 1,
# function(x, sseq) data.frame(t(rbind(TIME = sseq, x))),
# sseq = sseq )
D <- alply(X.comp, .margins = 1,
function(x, sseq) data.frame(t(rbind(TIME = sseq, x))),
sseq = sseq )
if(discrete) {
D <- lapply(D, function(x, ns, ns_dense) {
T.obs <- sample(ns_dense, ns)
T.obs <- sort(T.obs)
x <- x[T.obs, ]
return(x)
}, ns = ns, ns_dense = ns_dense)
}
Z_t.full <- ldply(D[1:n], data.frame, .id = "Subject_ID")
Z_ITG <- sapply(D, function(x, sseq, beta.basis, insert, method){
x[, -1] <- log(x[, -1])
ITG(X = x, basis = beta.basis, sseq = sseq, insert = insert, method = method)$integral
} ,sseq = sseq, beta.basis = beta.basis, insert = insert, method = method)
Z_ITG <- t(Z_ITG)
#### Integration:Z_ITG ####
#### Control varaible combining Intercept: Z_c, generation ####
## beta_c and beta0
beta0 <- range_beta_c
if(m > 0) {
Z_control <- matrix(NA, nrow = n, ncol = m)
## ceiling(m/2) is generated by norm;
## (m - ceiling(m/2)) is generated by binary with prob = 0.5
Z_control[, (floor(m/2) + 1):m] <- matrix(rnorm(n * length((floor(m/2) + 1):m)),
nrow = n)
Z_control[, -((floor(m/2) + 1):m)] <- matrix(sample(c(0,1), n * floor(m/2), replace = TRUE, prob = c(0.5, 0.5)),
nrow = n)
## Combine Intercept with control variables
beta_c <- rep(range_beta_c, times = m)
} else {
Z_control <- matrix(0, nrow = n, ncol = 1)
beta_c <- 0
}
#### Control varaible combining Intercept ####
#### Response: y, generation ####
Y.tru <- Z_control %*% beta_c + Z_ITG %*% beta_C + ifelse(intercept, beta0, 0)
sd_ratio <- sd( Z_control * beta_c) / sd( Z_ITG %*% beta_C)
error <- rnorm(n, 0, 1)
sigma_e <- sd(Y.tru) / (sd(error) * SNR)
error <- sigma_e*error
Y.obs <- Y.tru + error
y_sd <- sd(Y.obs)
#### Response: y ####
#### Compositioanal Data output: Z_t.full and Z_t.obs ####
## Each time point is with prob obs_spar to be observed, #obs follows bin(n, obs_spar)
## Different Subject with varying #observations and observed time points
if(obs_spar == 1) {
Z_t.obs <- Z_t.full
} else {
Z_t.obs <- lapply(D, function(x, obs_spar) {
n <- dim(x)[1]
#lambda <- obs_spar * n
#n.obs <- rpois(1, lambda)
#T.obs <- sample(n, size = n.obs)
T.obs <- replicate(n, sample(c(0,1), 1, prob = c(1 - obs_spar, obs_spar)))
T.obs <- which(T.obs == 1)
x.obs <- x[T.obs, ]
return(x.obs)
}, obs_spar = obs_spar)
Z_t.obs <- plyr::ldply(Z_t.obs, data.frame, .id = "Subject_ID")
}
#### Compositioanal Data output: Z_t.full and Z_t.obs ####
#### Output ####
if(m == 0) {
Z_control <- NULL
beta_c <- NULL}
data <- list(y = Y.obs, Comp = Z_t.obs, Zc = Z_control, intercept = intercept)
beta <- c(beta_C, beta_c, ifelse(intercept, beta0, 0))
data.raw <- list(Z_t.full = Z_t.full, Z_ITG = Z_ITG,
Y.tru = Y.tru, X = X, W = W.linear)
basis.info <- cbind(sseq, beta.basis)
parameter <- list(p = p, n = n, m = m,
k = df_beta, Jx = df_W, ns = ns, basis_W = basis_W, basis_beta = basis_beta,
rho_X = rho_X, rho_W = rho_W, Corr_X = Corr_X, Corr_W = Corr_W, sigma = sigma,
degree_W = degree_W, degree_beta = degree_beta,
SNR = SNR, sigma_e = sigma_e, sd_ratio = sd_ratio, y_sd = y_sd,
theta.add = add.on, obs_spar = obs_spar)
output <- list(data = data, beta = beta, basis.info = basis.info, data.raw = data.raw,
parameter = parameter)
#### Output ####
return(output)
}
#'
#' Simulation model2 for longitudinal composition data
#'
#' Simulate sparse observation from longitudinal compositional data \eqn{X}.
#'
# @usage
# Model2 <- function(n, p, m = 0, intercept = TRUE,
# interval = c(0, 1), ns = 100, obs_spar = 0.6, discrete = FALSE,
# SNR = 1, sigma = 2,
# rho_X, Corr_X = c("CorrAR", "CorrCS"),
# rho_W, Corr_W = c("CorrAR", "CorrCS"),
# Nzero_group = 4,
# range_beta = c(0.5, 1), range_beta_c = 1, beta_C ,
# theta.add = c(1, 2, 5, 6), gamma = 0.5,
# basis_W = c("bs", "OBasis", "fourier"), df_W = 5, degree_W = 3,
# basis_beta = c("bs", "OBasis", "fourier"),
# df_beta = 5, degree_beta = 3,
# insert = c("FALSE", "basis"), method = c("trapezoidal", "step"))
#
#'
#' @param n sample size
#'
#' @param p size of compositional predictors falling in \eqn{S^p}
#'
#' @param intercept including intercept or not to generate response variable, default is TRUE
#'
#' @param m size of time-invariant predictors. First \code{ceiling(m/2)} columns are generated by \code{bin(1,0.5)} independently;
#' latter \code{(m - ceiling(m/2))} columns are generated \code{norm(0, 1)} independently.
#'
#' @param ns \code{ns} is a scaler specifying length of equally spaced time sequence on domian \code{interval}.
#'
#' @param discrete is logical, specifying whether \eqn{X} is generated at different time points.
#' If \code{distrete = TRUE}, generate \eqn{X} on dense sequence created by \code{max(ns_dense = 1000, 5*ns)} and Then for
#' each subject, random sample \code{ns} points. recommend \code{ns < 200} when \code{distrete = TRUE}.
#'
#' @param interval a length 2 vector indicating time domain.
#'
#' @param obs_spar a percentage used to get sparse ovbservation. Each time point probability \code{obs_spar} to be observed. It allows different subject with
#' different observed time points and size.
#' \code{obs_spar * ns > 5} is required.
#'
#' @param SNR signal to noise ratio.
#'
#' @param sigma,rho_X,Corr_X,rho_W,Corr_W linear combination scaler \code{W}, \code{df_W*p}, is generated from from Multivariate Normal Distribution with mean 0's and
#' ovariance matrix = \code{simga^2 * kronecker(Sigma_X, Sigma_W)}.
#' \code{Corr_X} is correlation structure of \code{Sigma_X} with \eqn{\rho} = \code{rho_X},
#' which controls canonical-correlation between groups for W;
#' \code{Corr_W} is correlation structure of \code{Sigma_W} with \eqn{\rho} = \code{rho_W},
#' which controls correlation within groups of W.
#'
#'
#' @param Nzero_group a even scaler. First \code{Nzero_group} compositional predictors are considered having none zero effect, while others are with 0 coefficients.
#'
#' @param range_beta a sorted vector of length 2 used to generate coefficient matrix \code{B} for compositional predict, which is with demension \code{p*k}.
#' For each column of \code{B}, generate \code{Nzero_group/2} from unifom distribution with range \code{range_beta}, and together with their negatives
#' are ramdom assigned to the first \code{Nzero_group} rows.
#'
#' @param range_beta_c value of coefficients for beta0 and beta_c (coefficients for time-invariant predictors)
#' @param beta_C vectorized coefficients of coefficient matrix for compositional predictors. Could be missing.
#'
#' @param theta.add logical or numerical. If numerical, indicating which ones of compositional predictors of high level mean.
#' If logical, \code{c(1:ceiling(Nzero_group/2), Nzero_group + (1:ceiling(Nzero_group/2)))} are set to with high level mean
#'
#' @param gamma high level mean groups adding log(p * gamma) before convertint into compositional data, otherwise 0.
#'
#' @param basis_W,df_W,degree_W longitudinal compositional data is generated from linear combination of basis \eqn{\Psi(t)}, take exponetial and change into compositional data.
#' \itemize{
#' \item \code{basis_W} is the basis function for \eqn{\Psi(t)} - default is \code{"bs"}.
#' Other choise are \code{"OBasis"} and \code{"fourier"};
#' \item \code{df_W} is the degree of freedom for basis \eqn{\Psi(t)} - default is 10 ;
#' \item \code{degree_W} the is degree for \eqn{\Psi(t)} - default is 3.
#' }
#'
#'@param basis_beta,df_beta,degree_beta coefficinet curve is generate by linear combination of basis \eqn{\Phi(t)}.
#' \itemize{
#' \item \code{basis_beta} is the basis function for \eqn{\Phi(t)} - default is \code{"bs"}.
#' Other choise are \code{"OBasis"} and \code{"fourier"};
#' \item \code{df_beta} is the degree of freedom for basis \eqn{\Phi(t)} - default is 5;
#' \item \code{degree_beta} is the degree for \eqn{\Phi(t)} - default is 3.
#' }
#'
#' @param insert way to interpolation.
#' \itemize{
#' \item \code{"FALSE"} no interpolation.
#' \item \code{"basis"} compositional data is considered as step function, imposing basis on un-observed time points for each subject.
#' }
#' Default is \code{"FALSE"}.
#'
#' @inheritParams FuncompCGL
#'
#' @return a list
#' \item{data}{a list, a vector \code{y} of response variable, a data frame \code{Comp} of sparse observation of longitudinal compositional data,
#' a matrix \code{Zc} of time-invariable predictors, a logtical \code{intercept}.}
#' \item{beta}{a length \code{p*df_beta + m + 1} vector for coefficients}
#' \item{basis.info}{ matrix for basis for beta, combining the first column as time sequence.}
#' \item{data.raw}{ a list, \code{Z_t.full} full observation of longitudinal compositional data,
#' \code{Z_ITG} integral for full observation of longitudinal compositional data,
#' \code{Y.tru} true response without noise,
#' \code{X} longitudinal before converting into compositional data,
#' \code{W} matrix of linear combination scalers, \code{n * (df_W * p)}.}
#'
#' \item{parameter}{ a list of parameters.}
#'
#'
#' @examples
#'
#' df_beta = 5
#' p = 20
#' Data <- Model2(n = 50, p = p, m = 2, intercept = TRUE, ns = 50, SNR = 1,
#' rho_X = 0.1, rho_W = 0.2, df_W = 10, df_beta = df_beta, obs_spar = 0.5)
#' names(Data$data)
#' Data.test <- Model2(n = 50, p = p, m = 2, intercept = TRUE, ns = 50, SNR = 1,
#' rho_X = 0.1, rho_W = 0.2, df_W = 10, df_beta = df_beta, obs_spar = 0.5,
#' beta_C = Data$beta[1:(p*df_beta)])
#'
#'
#' @export
#'
#' @importFrom MASS mvrnorm
#' @importFrom plyr alply ldply
#' @importFrom utils tail
#'
Model2 <- function(n, p, m = 0, intercept = TRUE,
interval = c(0, 1), ns = 100, obs_spar = 0.6, discrete = FALSE,
SNR = 1, sigma = 2,
rho_X, Corr_X = c("CorrAR", "CorrCS"),
rho_W, Corr_W = c("CorrAR", "CorrCS"),
Nzero_group = 4,
range_beta = c(0.5, 1), range_beta_c = 1,
beta_C ,
theta.add = c(1, 2, 5, 6), gamma = 0.5,
basis_W = c("bs", "OBasis", "fourier"), df_W = 5, degree_W = 3,
basis_beta = c("bs", "OBasis", "fourier"), df_beta = 5, degree_beta = 3,
insert = c("FALSE", "basis"), method = c("trapezoidal", "step")) {
#### Warnings ####
Corr_X <- match.arg(Corr_X)
Corr_W <- match.arg(Corr_W)
basis_W <- match.arg(basis_W)
insert <- match.arg(insert)
method <- match.arg(method)
basis_beta <- match.arg(basis_beta)
this.call <- match.call()
if(obs_spar > 1 || obs_spar < 0) stop("obs_spar is a percentage")
if(basis_W %in% c("bs", "OBasis") && df_W < 4) {
warning("With B-spline basis including intercept DF at least 4")
df_W <- 4 }
if(basis_beta %in% c("bs", "OBasis") && df_beta < 4) {
warning("With B-spline basis including intercept DF at least 4")
df_beta <- 4 }
if(basis_W == "fourier" && df_W %% 2 == 0) {
warning("With Fourier basis, DF needs to be odd")
df_W <- df_W - 1 }
if(basis_beta == "fourier" && df_beta %% 2 == 0) {
warning("With Fourier basis, DF needs to be odd")
df_beta <- df_beta - 1 }
#### Warnings ####
#### Coefficients: beta_C generation.####
## beta_C: df = df_beta, degree = degree_beta
if(missing(beta_C)) {
C <- matrix(0, nrow = p, ncol = df_beta)
for(j in 1:df_beta){
col_input <- round(runif((Nzero_group/2), min = range_beta[1], max = range_beta[2]),
digits = 2)
col_input <- c(col_input, -col_input)
col_permuation <- sample(Nzero_group)
C[1:Nzero_group, j] <- col_input[col_permuation]
}
beta_C <- as.vector(t(C))
}
#### Coefficients: beta and beta_c generation ####
#### Mean pattern #####
add.on <- 0
if (theta.add && !is.numeric(theta.add)) {
## If true, first ceiling(Nzero_group/2) of the None-zero groups are set mean with log(p*gamma)
## first ceiling(Nzero_group/2) of the zero groups are set mean with log(p*gamma)
add.on <- c(1:ceiling(Nzero_group/2), Nzero_group + (1:ceiling(Nzero_group/2)))
} else if (is.numeric(theta.add)) {
if(length(theta.add) > p) stop("None zero theta must be smaller than p")
add.on <- theta.add
}
#### Mean pattern #####
#### time points and basis functions ####
ns_dense <- ns
if(discrete) ns_dense <- max(1000, 5*ns)
sseq <- round(seq(0, 1, length.out = ns_dense), 10)
if(ns * obs_spar < 5) stop("Too Sparse")
W_nknots <- df_W - (degree_W + 1)
if(W_nknots > 0) {
W_knots <- ((1:W_nknots) / (W_nknots + 1)) * diff(interval) + interval[1]
} else {
W_knots <- NULL
}
beta_nknots <- df_beta - (degree_beta + 1)
if(beta_nknots > 0) {
beta_knots <- ((1:beta_nknots) / (beta_nknots + 1)) * diff(interval) + interval[1]
} else {
beta_knots <- NULL
}
beta.basis <- switch(basis_beta,
"bs" = bs(x = sseq, df = df_beta, degree = degree_beta,
Boundary.knots = interval, intercept = TRUE),
"fourier" = eval.basis(sseq, basisobj = create.fourier.basis(rangeval = interval, nbasis = df_beta )),
"OBasis" = evaluate(OBasis(expand.knots(c(interval[1], beta_knots, interval[2])),
order = degree_beta + 1),
sseq)
)
# W.basis <- switch(basis_W,
# "bs" = bs(x = sseq, df = df_W, degree = degree_W,
# Boundary.knots = interval, intercept = TRUE),
#
# "fourier" = eval.basis(sseq, basisobj = create.fourier.basis(rangeval = interval, nbasis = df_W )),
#
# "OBasis" = evaluate(OBasis(expand.knots(c(interval[1], W_knots, interval[2])),
# order = degree_W + 1),
# sseq)
#
# )
#### time points and basis functions ####
#### Correlation matrix: CorrMIX, of linear combination coefficients vector W ####
##X.sigma is between group (Compositional variables) correlation for W vector
##W.sigma is within group (linear combination coeffcients for each compsitional varaible) correlation for W vector
##CovMIX = sigma^2 * kronecker(X.Sigma, W.Sigma,)
X.Sigma <- do.call(Corr_X, list(dim = p, rho = rho_X))
##Now use this for the correlation over time
W.Sigma <- do.call(Corr_W, list(dim = ns_dense, rho = rho_W))
CovMIX <- sigma^2 * kronecker(W.Sigma, X.Sigma) #sigma*kronecker(W.Sigma, X.Sigma)
#### Correlation matrix: CorrMIX ####
#### linear combination coefficients vector: W generation. Mean variation included ####
#=================================================================================#
#== Mean is log(gamma*p)/0 for X_j(t) if basis generated from bs with Intercept ==#
#== since mean(X_j(t))= W.linear[t, ] %*% theat_j= log(gamma*p)/0 * sum(W.linear[t, ]) ==#
#== sum(W.linear[t, ])= 1, other basis might not be true ==#
#=================================================================================#
mean_curve <- rep(0
#log(p*gamma / 2)
, times = p * df_W)
group2 <- matrix(1:(df_W * p), nrow = df_W)
b <- tail(1:p, floor(p/2))
mean_curve[as.vector(group2[, add.on])] <- mean_curve[as.vector(group2[, add.on])] + log(p*gamma)
W.linear <- mvrnorm(n, rep(0, p * ns_dense), CovMIX)
#W.linear <- mvrnorm(n, mean_curve, CovMIX)
#### linear combination coefficients vector: W ####
#### longitinal composition: X.comp_full generation ####
X <- array(dim = c(n, p, ns_dense), NA)
dimnames(X)[[2]] <- paste0("X", 1:p)
X.comp <- X
I <- diag(p)
for(s in 1:n) {
## X curve follows normal distribution
#X[, , s] <- t( apply(W.linear, 1, function(x, W.Phi) W.Phi %*% x, W.Phi = W.Phi) ) #?
X[s, ,] <- matrix(nrow = p, ncol = ns_dense, W.linear[s,],byrow = FALSE)
#X[, add.on , s] <- X[, add.on , s] + log(gamma * p)
## X.comp follows a logistic normal distribution for each time point after exp(x(s))/sum(exp(x(s)))
}
for(s in 1:ns) {
X.comp[, , s] <- exp(X[, , s])
X.comp[, , s] <- X.comp[, , s] / rowSums(X.comp[, , s])
#plot(X.comp[1, , s])
## Under threshold, measurement could not be detected and recorded as 0.
## If threshold=0, all measurement detected.
## 0's are replaced by replace value
##X.comp[, , s][X.comp[, , s] <= threshold] <- replace
}
#### longitinal composition: X.comp_full ####
#### Integration:Z_ITG ####
#X.comp_log <- log(X.comp)
#DD <- alply(X.comp_log, .margins = 1,
# function(x, sseq) data.frame(t(rbind(TIME = sseq, x))),
# sseq = sseq )
D <- alply(X.comp, .margins = 1,
function(x, sseq) data.frame(t(rbind(TIME = sseq, x))),
sseq = sseq )
if(discrete) {
D <- lapply(D, function(x, ns, ns_dense) {
T.obs <- sample(ns_dense, ns)
T.obs <- sort(T.obs)
x <- x[T.obs, ]
return(x)
}, ns = ns, ns_dense = ns_dense)
}
Z_t.full <- ldply(D[1:n], data.frame, .id = "Subject_ID")
Z_ITG <- sapply(D, function(x, sseq, beta.basis, insert, method){
x[, -1] <- log(x[, -1])
ITG(X = x, basis = beta.basis, sseq = sseq, insert = insert, method = method)$integral
} ,sseq = sseq, beta.basis = beta.basis, insert = insert, method = method)
Z_ITG <- t(Z_ITG)
#### Integration:Z_ITG ####
#### Control varaible combining Intercept: Z_c, generation ####
## beta_c and beta0
beta0 <- range_beta_c
if(m > 0) {
Z_control <- matrix(NA, nrow = n, ncol = m)
## ceiling(m/2) is generated by norm;
## (m - ceiling(m/2)) is generated by binary with prob = 0.5
Z_control[, (floor(m/2) + 1):m] <- matrix(rnorm(n * length((floor(m/2) + 1):m)),
nrow = n)
Z_control[, -((floor(m/2) + 1):m)] <- matrix(sample(c(0,1), n * floor(m/2), replace = TRUE, prob = c(0.5, 0.5)),
nrow = n)
## Combine Intercept with control variables
beta_c <- rep(range_beta_c, times = m)
} else {
Z_control <- matrix(0, nrow = n, ncol = 1)
beta_c <- 0
}
#### Control varaible combining Intercept ####
#### Response: y, generation ####
Y.tru <- Z_control %*% beta_c + Z_ITG %*% beta_C + ifelse(intercept, beta0, 0)
sd_ratio <- sd( Z_control * beta_c) / sd( Z_ITG %*% beta_C)
error <- rnorm(n, 0, 1)
sigma_e <- sd(Y.tru) / (sd(error) * SNR)
error <- sigma_e*error
Y.obs <- Y.tru + error
y_sd <- sd(Y.obs)
#### Response: y ####
#### Compositioanal Data output: Z_t.full and Z_t.obs ####
## Each time point is with prob obs_spar to be observed, #obs follows bin(n, obs_spar)
## Different Subject with varying #observations and observed time points
if(obs_spar == 1) {
Z_t.obs <- Z_t.full
} else {
Z_t.obs <- lapply(D, function(x, obs_spar) {
n <- dim(x)[1]
#lambda <- obs_spar * n
#n.obs <- rpois(1, lambda)
#T.obs <- sample(n, size = n.obs)
T.obs <- replicate(n, sample(c(0,1), 1, prob = c(1 - obs_spar, obs_spar)))
T.obs <- which(T.obs == 1)
x.obs <- x[T.obs, ]
return(x.obs)
}, obs_spar = obs_spar)
Z_t.obs <- plyr::ldply(Z_t.obs, data.frame, .id = "Subject_ID")
}
#### Compositioanal Data output: Z_t.full and Z_t.obs ####
#### Output ####
if(m == 0) {
Z_control <- NULL
beta_c <- NULL}
data <- list(y = Y.obs, Comp = Z_t.obs, Zc = Z_control, intercept = intercept)
beta <- c(beta_C, beta_c, ifelse(intercept, beta0, 0))
data.raw <- list(Z_t.full = Z_t.full, Z_ITG = Z_ITG,
Y.tru = Y.tru, X = X, W = W.linear)
basis.info <- cbind(sseq, beta.basis)
parameter <- list(p = p, n = n, m = m,
k = df_beta, Jx = df_W, ns = ns, basis_W = basis_W, basis_beta = basis_beta,
rho_X = rho_X, rho_W = rho_W, Corr_X = Corr_X, Corr_W = Corr_W, sigma = sigma,
degree_W = degree_W, degree_beta = degree_beta,
SNR = SNR, sigma_e = sigma_e, sd_ratio = sd_ratio, y_sd = y_sd,
theta.add = add.on, obs_spar = obs_spar)
output <- list(data = data, beta = beta, basis.info = basis.info, data.raw = data.raw,
parameter = parameter)
output$call <- this.call
#### Output ####
return(output)
}
#'
#' Simulation for composition data
#'
#' Simulate compositional data.
#'
#' @param n sample size
#'
#' @param p size of compositional predictors falling in \eqn{S^p}
#'
#' @param intercept including intercept or not to generate response variable. Default value is FALSE
#'
#' @param rho a scaler used to generate \code{p*p} autocorrelation matrix
#'
#' @param sigma standard deviation for noise terms, which follows iid norm distribution with mean 0
#'
#' @param gamma,add.on To generate compsotional data, we first generate \code{n*p} matrix \code{X} from multivariate normal distribution and
#' than transform it to compostional data.
#' \code{gamma} is a scaler and add.on is a index vector. Assign \code{log(p)*gamma} to means of \code{X} for columns add.on ,
#' while others with 0's.
#'
#' @param beta coefficients for composition variables
#'
#' @param beta0 coefficient for intercept
#'
#'
#' @details
#' The setup of this simulation follows Variable selection in regression with compositional covariates by WEI LIN, PIXU SHI, RUI FENG AND HONGZHE LI.
#'
#' @examples
#'
#' n = 50
#' p = 30
#' rho = 0.2
#' sigma = 0.5
#' gamma = 0.5
#' add.on = 1:5
#' beta = c(1, -0.8, 0.6, 0, 0, -1.5, -0.5, 1.2)
#' beta = c( beta, rep(0, times = p - length(beta)) )
#' intercept = FALSE
#' Comp_data = comp_simulation(n = n, p = p, rho = rho, sigma = sigma, gamma = gamma, add.on = add.on,
#' beta = beta, intercept = intercept)
#'
#' @export
#'
#' @importFrom MASS mvrnorm
#'
comp_simulation <- function(n, p, rho = 0.2, sigma = 0.5, gamma = 0.5, add.on = 1:5,
beta = c( c(1, -0.8, 0.6, 0, 0, -1.5, -0.5, 1.2), rep(0, times = p - 8) ),
beta0 = 1, intercept = FALSE) {
SIGMA <- CorrAR(p, rho)
theta <- rep(0, times = p)
theta[add.on] <- log(gamma * p)
W <- mvrnorm(n, theta, SIGMA)
X <- exp(W)
X.comp <- X / rowSums(X)
Z <- log(X.comp)
Y.tru <- Z %*% beta + as.integer(intercept)*beta0
y <- Y.tru + rnorm(n, 0, sigma)
data <- list(y = y, Z = Z, Zc = NULL, intercept = intercept, beta = beta, X.comp = X.comp)
return(data)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.