#' Generate data
#'
#' Produces data from different populations with the probability of
#' belonging to a population. Also produces one discrete covariate and one continuous covariate.
#'
#' @param n sample size, must be at least 1.
#' @param p number of populations, must be at least 2.
#' @param m number of different mixture proportions, must be at least 2.
#' @param qvs a numeric matrix of size \code{p} by \code{m} containing all possible
#' mixture proportions (i.e., the probability of belonging to each population k, k=1,...,p.).
#' @param censoring.rate a scalar indicating the censoring proportion. Options are 0 or 40.
#' @param simu.setting Character indicating simulation setting.
#' Options are "Log-Normal-No-Covariates", "Log-Normal-With-Covariates", "HD-No-Covariates","HD-With-Covariates".
#' Setting "Log-Normal-No-Covariates" and "Log-Normal-With-Covariates" refer to simulation setting 1 in Garcia and Parast (2020).
#' "Log-Normal-No-Covariates" means the
#' survival outcomes do NOT depend on the covariates, and "Log-Normal-With-Covariates" means the
#' survival outcomes do depend on the covariates.
#' Setting "HD-No-Covariates" and "HD-With-Covariates" refer to Simulation setting 2 in Garcia and Parast (2020), "HD-No-Covariates" means the
#' survival outcomes do NOT depend on the covariates, and "HD-With-Covariates" means the
#' survival outcomes do depend on the covariates.
#'
#' @return Returns a list containing
#' \itemize{
#' \item{x: }{a numeric vector of length \code{n} containing the observed event times
#' for each person in the sample.}
#' \item{delta: }{a numeric vector of length \code{n} that denotes
#' censoring (1 denotes event is observed, 0 denotes event is censored).}
#' \item{q: }{a numeric matrix of size \code{p} by \code{n} containing the
#' mixture proportions for each person in the sample.}
#' \item{ww: }{a numeric vector of length \code{n} containing the values of the continuous
#' covariate for each person in the sample.}
#' \item{zz: }{ a numeric vector of length \code{n} containing the values of the discrete
#' covariate for each person in the sample.}
#' \item{true.group.identifier: }{numeric vector of length \code{n} denoting the population identifier for each person in the sample.
#' This is only used simulation study to evaluate the prediction accuracy of our methods.}
#' }
#'
#'@references
#'Garcia, T.P. and Parast, L. (2020). Dynamic landmark prediction for mixture data. Biostatistics, doi:10.1093/biostatistics/kxz052.
#'
#'
#' @export
GenerateData <- function(n,p,m,qvs,censoring.rate,simu.setting){
q <- array(0,dim=c(p,n)) ## mixture proportions
x <- rep(0,n) ## observed event time: min(T,C)
delta <- rep(0,n) ## censoring indicator
uset <- rep(0,n) ## indicator of which subgroup qvs
r <- rep(0,m) ## number in each qvs subgroup
true.group.identifier <- rep(0,n) ## to which group each person belongs
###################
## Simulate data ##
###################
## set evenly spaced subgroup sizes
r0 <- seq(0,1,length=m+1)
for(i in 1:n){
a <- runif(1)
for(j in 1:m){
if( (a >= r0[j]) && (a < r0[j+1]) ){
q[,i] <- qvs[,j]
r[j] <- r[j] + 1
uset[i] <- j
}
}
}
## set covariates
zw.data <- gendata.zw(n,simu.setting)
zz <- zw.data$zz
ww <- zw.data$ww
## set dependence on covariates
covariate.dependent <- set.covariate.dependent(simu.setting)
for(i in 1:n){
q1<- q[,i]
negative_value <- TRUE
while(negative_value==TRUE){
t1 <- trueinvFt(p,ww[i],zz[i],simu.setting,covariate.dependent)
if(all(t1>0)){
negative_value <- FALSE
}
}
a <- runif(1)
tmp <- q1[1]
j <- 1
while(j <= p){
if(a <= tmp){
s <- t1[j]
true.group.identifier[i] <- j
j <- p+1
} else {
tmp <- tmp + q1[j+1]
j <- j+1
}
}
cens <- genc(s,censoring.rate,simu.setting)
x[i] <- min(s,cens)
delta[i] <- which.min(c(cens,s)) - 1
}
list(x=x,delta=delta,q=q,ww=ww,zz=zz,true.group.identifier=true.group.identifier)
}
#########################################
## Function to set covariate.dependent ##
########################################
set.covariate.dependent <- function(simu.setting){
if(simu.setting=="Log-Normal-No-Covariates" | simu.setting=="HD-No-Covariates"){
covariate.dependent <- FALSE
} else {
covariate.dependent <- TRUE
}
return(covariate.dependent)
}
################################
## Functions to generate F(t) ##
################################
## function to get sd's used in F(t)
getsd <- function(p){
sd.use <- seq(1,2,length.out=p)
return(sd.use)
}
mult.w <- function(covariate.dependent){
if(covariate.dependent==TRUE){
out <- 1
} else {
out <- 0
}
return(out)
}
mult.z <- function(covariate.dependent){
if(covariate.dependent==TRUE){
out <- 0.5
} else {
out <- 0
}
return(out)
}
#' @import stats
Ft.form <- function(tt,tt0,ww,zz,p,simu.setting){
out <- rep(0,p)
## set dependence on covariates
covariate.dependent <- set.covariate.dependent(simu.setting)
if(simu.setting == "Log-Normal-No-Covariates" | simu.setting=="Log-Normal-With-Covariates"){
sd.use <- getsd(p)
constant.z <- mult.z(covariate.dependent)
constant.w <- mult.w(covariate.dependent)
for(jj in 1:p){
out[jj] <-
(pnorm(log(tt)- constant.w * ww -
constant.z * zz,mean=0,sd=sd.use[jj])-
pnorm(log(tt0)- constant.w * ww -
constant.z * zz,mean=0,sd=sd.use[jj]))/
(1-pnorm(log(tt0)- constant.w * ww -
constant.z * zz, mean=0,sd=sd.use[jj]))
}
}else if(simu.setting=="HD-No-Covariates" | simu.setting=="HD-With-Covariates"){
out[1] <-
( F1(tt,ww,zz,covariate.dependent)- F1(tt0,ww,zz,covariate.dependent) ) /
( 1 - F1(tt0,ww,zz,covariate.dependent) )
out[2] <-
( F2(tt,ww,zz,covariate.dependent)- F2(tt0,ww,zz,covariate.dependent) ) /
( 1 - F2(tt0,ww,zz,covariate.dependent) )
}
return(out)
}
## Function to produce true F(t)
trueFt <- function(z.use,w.use,tval,tval0,
p,real_data,simu.setting,
Ft.null.theta, method.label){
## storage
##out <- Ft.null.theta$null.theta.nomethod
out <- Ft.null.theta$null.theta$estimator
## set dependence on covariates
covariate.dependent <- set.covariate.dependent(simu.setting)
####################
## simu setting 1 ##
####################
## function to generate T from true F(t)
## Since log(T)=W+0.5Z+N, where N~Normal(0,\sigma_u). Then, T=exp(W+0.5Z+N).
## S(t|z,w,sigma_u) = Pr(T > t |z,w,sigma_u) = Pr{ exp(W+0.5Z+N) > t | z, w, sigma_u}
## = Pr(W+0.5Z+N > log(t)|z,w,sigma_u)
## = Pr(N > log(t) - W - 0.5Z|sigma_u)
## = 1- pnorm(log(t)-W-0.5Z,mean=0,sd=sigma_u)
## So F(t|z,w,u)=pnorm(log(t)-W-0.5Z,mean=0,sd=sigma_u)
## note: pnorm() is the distribution function.
## In the general case,
## pr(T <= t | T> t_0,z,w,sigma_u)= (pr(T<=t|z,w,sigma_u) - pr(T<= t_0|z,w,sigma_u))/pr(T>t_0|z,w,sigma_u)
####################
## simu setting 2 ##
####################
## real data
if(real_data==FALSE){
for(ee in 1:length(method.label)){
for(tt in 1:length(tval)){
for(tt0 in 1:length(tval0)){
if(tval[tt]> tval0[tt0]){
if(method.label[ee]!="NPNA_avg"){
for(zz in 1:length(z.use)){
for(ww in 1:length(w.use)){
out[ee,tt,tt0,zz,ww,] <-
Ft.form(tval[tt],tval0[tt0],
w.use[ww],z.use[zz],p,simu.setting)
}
}
} else {
## NPNA-avg
## generate z,w data
zw.data <- gendata.zw(n=100000,simu.setting)
zz.tmp <- zw.data$zz
ww.tmp <- zw.data$ww
## compute average over (z,w) pairs
avg.tmp <- rep(0,p)
for(ii in 1:length(zz.tmp)){
avg.tmp <- avg.tmp + Ft.form(tval[tt],tval0[tt0],
ww.tmp[ii],zz.tmp[ii],
p,simu.setting)
}
## report the avg
for(zz in 1:length(z.use)){
for(ww in 1:length(w.use)){
out[ee,tt,tt0,zz,ww,] <- avg.tmp/100000
}
}
}
}
}
}
}
}
return(out)
}
## randomly choose subgroup sizes
#' @import stats
random.subgroup.sizes <- function(m,n){
rc <- rep(0,m+1)
rc[2:m] <- round(runif((m-1))*(n-m))
rc[m+1] <- n
rc <- sort(rc)
rc[2:m] <- sort(rc[2:m])+2:m-1
return(rc)
}
## function to generate mixture proportions
set.qvs.values <- function(p,m,qvs.setting,mydata=NULL){
qvs <- matrix(0,nrow=p,ncol=m)
colnames(qvs) <- paste("m",1:m,sep="")
rownames(qvs) <- paste("p",1:p,sep="")
if(qvs.setting=="original"){
qvs <- qvs.values(p,m)
} else if(qvs.setting=="HD"){
##qvs[1,1:m] <- unique(mydata$q1) ## probability of being carrier
qvs[,1:m] <- t(as.matrix(unique(mydata[,paste("q",1:p,sep="")])))
}
}
#' Generate finite set of mixture proportions
#'
#' Produces the finite set of mixture proportions for simulated data.
#'
#' @param p number of populations, must be at least 2.
#' @param m number of different mixture proportions, must be at least 2.
#'
#' @return Returns a \code{p} by \code{m} matrix of mixture proportions.
#'
#' @export
qvs.values <- function(p,m){
qvs <- matrix(0,nrow=p,ncol=m)
colnames(qvs) <- paste("m",1:m,sep="")
rownames(qvs) <- paste("p",1:p,sep="")
tmp <- 0.2
qvs[1,1] <- 1
qvs[1,2] <- (1+tmp)/2
qvs[1,3] <- tmp
qvs[1,m] <- 1+tmp*tmp/4-tmp/2-3/4
qvs[1,] <- sort(qvs[1,])
qvs[2,] <- 1-qvs[1,]
return(qvs)
}
## function for censoring
#' @import stats
genc <- function(s,censoring.rate,simu.setting){
if(censoring.rate==0){
out <- s + 1
} else if(censoring.rate==20){
if(simu.setting=="Log-Normal-No-Covariates"){
out <- runif(1,0,25)
} else if(simu.setting=="Log-Normal-With-Covariates"){
out <- runif(1,0,22)
} else if(simu.setting=="HD-No-Covariates"){
#out <- runif(1,50,90)
##out <- runif(1,0,80)
out <- runif(1,0,210)
} else if(simu.setting=="HD-With-Covariates"){
out <- runif(1,0,220)
}
} else if(censoring.rate==40){
if(simu.setting=="Log-Normal-No-Covariates"){
out <- runif(1,0,8)
} else if(simu.setting=="Log-Normal-With-Covariates"){
out <- runif(1,0,4)
} else if(simu.setting=="HD-No-Covariates"){
#out <- runif(1,50,90)
##out <- runif(1,0,80)
out <- runif(1,10,100)
} else if(simu.setting=="HD-With-Covariates"){
out <- runif(1,15,100)
}
}
return(out)
}
############################################
## Functions used to replicate HD setting ##
############################################
F.exp <- function(t,w,z,mu0,mu1,mu2,sigma,r,w.constant){
out <- ( 1+exp( - ((t-mu0)+w.constant * (w-mu1)+mu2*z)/sigma) )^r
return(out)
}
normalizing.constant <- function(t.low,t.high,w,z,mu0,mu1,mu2,sigma,r,a,w.constant){
out <- a*(t.low) + F.exp(t.high,w,z,mu0,mu1,mu2,sigma,r,w.constant)-F.exp(t.low,w,z,mu0,mu1,mu2,sigma,r,w.constant)
return(out)
}
intersection.constant <- function(t.low,w,z,mu0,mu1,mu2,sigma,r,a,p,w.constant){
if(p==2){
out <- a*t.low - F.exp(t.low,w,z,mu0,mu1,mu2,sigma,r,w.constant)
} else {
out <- 0
}
return(out)
}
time.generate <- function(u,t.low,t.high,w,z,mu0,mu1,mu2,sigma,r,a,p,w.constant){
out <- mu0 - w.constant*(w-mu1) - mu2*z -
sigma * log ((u *
normalizing.constant(t.low,t.high,w,z,mu0,mu1,mu2,sigma,r,a,w.constant)-
intersection.constant(t.low,w,z,mu0,mu1,mu2,sigma,r,a,p,w.constant) )^(1/r)-1)
return(out)
}
log.limit <- function(u,t.low,t.high,w,z,mu0,mu1,mu2,sigma,r,a,p,w.constant){
out <- (u * normalizing.constant(t.low,t.high,w,z,mu0,mu1,mu2,sigma,r,
a,w.constant)-
intersection.constant(t.low,w,z,mu0,mu1,mu2,sigma,r,a,
p,w.constant) )^(1/r)-1
return(out)
}
F.constants <- function(p,covariate.dependent=TRUE){
if(p==1){
mu1 <- 40
mu0 <- 43
if(covariate.dependent==TRUE){
## covariates impact onset ages
mu2 <- -0.5
w.constant <- 1
} else {
mu2 <- 0
w.constant <- 0
}
sigma <- 7
a <- 0
t.low <- 0
t.high <- 100
r <- -0.9
} else {
mu1 <- 42
mu0 <- 48
if(covariate.dependent==TRUE){
## covariates impact onset ages
w.constant <- 1
mu2 <- -0.5
} else {
mu2 <- 0
w.constant <- 0
}
sigma <- 10.5
a <- 0.0007
t.low <- 13
t.high <- 100
r <- -2
}
list(mu0=mu0,mu1=mu1,mu2=mu2,sigma=sigma,a=a,t.low=t.low,t.high=t.high,r=r,
w.constant=w.constant)
}
## function to set w
get.w.set <-function(w,mu1,covariate.dependent){
if(covariate.dependent==TRUE){
## survival times depend on covariates
w.set <- w
} else {
## survival times DO NOT depend on covariates
w.set <- mu1
}
return(w.set)
}
F1 <- function(t,w,z,covariate.dependent){
constants <- F.constants(1,covariate.dependent)
mu0 <- constants$mu0
mu1 <- constants$mu1
mu2 <- constants$mu2
w.constant <- constants$w.constant
sigma <- constants$sigma
a <- constants$a
t.low <- constants$t.low
t.high <- constants$t.high
r <- constants$r
## set w-value
w.set <- get.w.set(w,mu1,covariate.dependent)
out <- F.exp(t,w=w.set,z,mu0,mu1,mu2,sigma,r,w.constant)/
normalizing.constant(t.low,t.high,w=w.set,z,mu0,mu1,mu2,sigma,r,a,w.constant)
return(out)
}
F2 <- function(t,w,z,covariate.dependent){
constants <- F.constants(2,covariate.dependent)
mu0 <- constants$mu0
mu1 <- constants$mu1
mu2 <- constants$mu2
w.constant <- constants$w.constant
sigma <- constants$sigma
a <- constants$a
t.low <- constants$t.low
t.high <- constants$t.high
r <- constants$r
## set w-value
w.set <- get.w.set(w,mu1,covariate.dependent)
if(t<=t.low){
out <- a * t
} else {
out <- intersection.constant(t.low,w=w.set,z,mu0,mu1,mu2,sigma,r,a,p=2,w.constant) + F.exp(t,w=w.set,z,mu0,mu1,mu2,sigma,r,w.constant)
}
out <- out/normalizing.constant(t.low,t.high,w=w.set,z,mu0,mu1,mu2,sigma,r,a,w.constant)
return(out)
}
## function to generate T from true F(t)
#' @import stats
trueinvFt <- function(p,w,z,simu.setting,covariate.dependent){
if(simu.setting =="Log-Normal-No-Covariates" | simu.setting=="Log-Normal-With-Covariates"){
out <- rep(0,p)
constant.z <- mult.z(covariate.dependent)
constant.w <- mult.w(covariate.dependent)
sd.use <- getsd(p)
for(jj in 1:p){
## Model: log(T)=W+0.5Z+N, so T=exp(W+0.5Z+N)
out[jj] <- exp(constant.w * w + constant.z * z + rnorm(1,0,sd.use[jj]))
}
} else if(simu.setting=="HD-No-Covariates" | simu.setting=="HD-With-Covariates"){
out <- rep(0,p) ## p=2 (fixed)
## get constants needed
constants.F1 <- F.constants(1,covariate.dependent)
mu0.F1 <- constants.F1$mu0
mu1.F1 <- constants.F1$mu1
mu2.F1 <- constants.F1$mu2
w.constant.F1 <- constants.F1$w.constant
sigma.F1 <- constants.F1$sigma
a.F1 <- constants.F1$a
t.low.F1 <- constants.F1$t.low
t.high.F1 <- constants.F1$t.high
r.F1 <- constants.F1$r
## set w-value
w.set.F1 <- get.w.set(w,mu1.F1,covariate.dependent)
use.value <- FALSE
while(use.value==FALSE){
u <- runif(1)
if( log.limit(u,t.low.F1,t.high.F1,w=w.set.F1,z,mu0.F1,
mu1.F1,mu2.F1,sigma.F1,r.F1,a.F1,p=1,w.constant.F1) > 0 ){
use.value <- TRUE
}
}
out[1] <- time.generate(u,t.low.F1,t.high.F1,w=w.set.F1,z,mu0.F1,
mu1.F1,mu2.F1,sigma.F1,r.F1,a.F1,p=1,w.constant.F1)
## get constants needed
constants.F2 <- F.constants(2,covariate.dependent)
mu0.F2 <- constants.F2$mu0
mu1.F2 <- constants.F2$mu1
mu2.F2 <- constants.F2$mu2
w.constant.F2 <- constants.F2$w.constant
sigma.F2 <- constants.F2$sigma
a.F2 <- constants.F2$a
t.low.F2 <- constants.F2$t.low
t.high.F2 <- constants.F2$t.high
r.F2 <- constants.F2$r
## set w-value
w.set.F2 <- get.w.set(w,mu1.F2,covariate.dependent)
use.value <- FALSE
while(use.value==FALSE){
u <- runif(1)
if(u < a.F2 * t.low.F2 / normalizing.constant(t.low.F2,
t.high.F2,w=w.set.F2,z,mu0.F2,mu1.F2,mu2.F2,sigma.F2,r.F2,
a.F2,w.constant.F2) |
log.limit(u,t.low.F2,t.high.F2,w=w.set.F2,z,mu0.F2,
mu1.F2,mu2.F2,sigma.F2,r.F2,a.F2,p=2,w.constant.F2) > 0 ) {
use.value <- TRUE
}
}
if(u < a.F2 * t.low.F2 / normalizing.constant(t.low.F2,
t.high.F2,w=w.set.F2,z,mu0.F2,mu1.F2,mu2.F2,sigma.F2,r.F2,
a.F2,w.constant.F2)){
out[2] <- u * normalizing.constant(t.low.F2,
t.high.F2,w=w.set.F2,z,mu0.F2,mu1.F2,mu2.F2,sigma.F2,
r.F2,a.F2,w.constant.F2)/ a.F2
} else {
out[2] <- time.generate(u,t.low.F2,t.high.F2,w=w.set.F2,z,mu0.F2,
mu1.F2,mu2.F2,sigma.F2,r.F2,a.F2,p=2,w.constant.F2)
}
}
return(out)
}
################################
## Function to generate (z,w) ##
################################
## function to get w-distribution
w.sample <- function(n){
cag.values <- seq(30,60,by=1)
out <- sample(x=cag.values,n,replace=TRUE,prob=rep(1,length(cag.values))/length(cag.values))
return(out)
}
#' @import stats
gendata.zw <- function(n,simu.setting){
zz <- rbinom(n,1,0.5)
if(simu.setting=="Log-Normal-No-Covariates" | simu.setting=="Log-Normal-With-Covariates"){
ww <- runif(n,0,1)
} else if(simu.setting=="HD-No-Covariates" | simu.setting=="HD-With-Covariates"){
ww <- w.sample(n)
##ww <- runif(n,30,60)
}
## setting without covariates: set z=w=0
##zz <- rep(0,n)
##ww <- rep(0,n)
list(zz=zz,ww=ww)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.