Nothing
#' Functions for Simulating Data
#' @import MASS
#'
#' @description When investigating the properties of GEM, the following three data generators are used in various simulations.
#' They are designed to construct three specific types of data sets in the case of two treatment groups. See more detail in
#' \cite{E Petkova, T Tarpey, Z Su, and RT Ogden. Generated effect modifiers (GEMs) in randomized clinical trials. Biostatistics, (First published online: July 27, 2016). doi: 10.1093/biostatistics/kxw035.}
#'
#'
#' @param d A scalar indicating the effect size of the GEM when the data is generated under a GEM model
#' @param R2 A scalar indicating the proportion of explained variance \eqn{R^2} for the entire data set
#' @param v2 A scalar indicating the proportion of explained variance \eqn{R^2} for the first treatment group
#' @param n A scalar indicating the number of observation in each treatment group, assumed to be the same.
#' @param co A \emph{p} by \emph{p} positive semidefinite matrix indicating the covariance matrix of the covariates
#' @param beta1 A vector of length \emph{p} giving the regression coefficients for the first treatment group
#' @param bet A list with two elements, each a vector of length \emph{p}, giving the regression coefficients for the two treatment groups respectively
#' @param inter A vector of length 2 recording the intercepts \eqn{\beta_{10},\beta_{20}} for the two treatment groups respectively
#'
#' @details \code{data_generator1} is used to create data where the outcome is a linear function of the covariates \deqn{y_j = \beta_{j0} + X\beta_j + \epsilon, j = 1, 2, }
#' and the coffcicients of covariates \eqn{\beta} are proportional between two treatment groups: \eqn{\beta_2 = b * \beta_1}.
#' This type of data set matches perfectly with the motivation of GEM algorithm. \eqn{\beta_1} is set as an argument of the function while \eqn{\beta_2 = b * \beta_1}
#' is derived by controling \eqn{R^2} of the whole data and the effect size. See more detail in \cite{Kraemer, H. C. (2013). Discovering, comparing, and combining moderators of treatment
#' on outcome after randomized clinical trials: a parametric approach. Statistics in medicine, 32(11), 1964-1973.}
#'
#' \code{data_generator2} is similar to the first one except that the coefficients of the covariates are not necessarily proportional. Hence two \eqn{\bold{\beta}}'s
#' should be specified as arguments of the function.
#'
#' \code{data_generator3} constructs a data set where the outcome under each treatment condition is given for all subjects. In addition, no error is added to the mean outcome.
#' This generator is useful for obtaining the "true" value of a treatment decision. This data generator is similar to data generator2 \deqn{y_j = \beta_{j0} + X\beta_j, j = 1,2.}
#'
#' @return The output from these functions are different:
#'
#' For the function \code{data_generator1}
#' \enumerate{
#' \item \code{dat} A data frame with first and second column as treatment group index and outcome respectively,
#' and each of the remaining columns as a covariate.
#' \item \code{bet} A list with two elements, each a vector of length \eqn{p}, giving the regression coefficients for the two treatment groups respectively
#' \item \code{error_12} A vector of length three represeting the standard deviation of \eqn{\epsilon}, the explained variance by the linear part for the first
#' and second treatment group respectively.
#' }
#'
#' For the function \code{data_generator2}
#' \enumerate{
#' \item \code{dat} A data frame with first and second column as treatment group index and outcome respectively,
#' and each of the remaining columns as a covariate.
#' \item \code{bet} list with two elements, each a vector of length \eqn{p}, giving the regression coefficients for the two treatment groups respectively
#' \item \code{error} A scalar represeting the standard deviation of \eqn{\epsilon}
#' }
#'
#' For the function \code{data_generator3}
#' \enumerate{
#' \item \code{y0} Outcome vector under the first treatment assignment
#' \item \code{y1} Outcome vector under the second treatment assignment
#' \item \code{X} Design matrix for the covariates
#' \item \code{oracle} Average of the outcome if each subject takes the optimal treatment assignment
#' \item \code{invOracle} Average of the outcome if each subject does not take the optimal treatment assignment
#' }
#' @examples
#' #constructing the covariance matrix
#' co <- matrix(0.2, 30, 30)
#' diag(co) <- 1
#' dataEx <- data_generator1(d = 0.3, R2 = 0.5, v2 = 1, n = 3000,
#' co = co, beta1 = rep(1,30),inter = c(0,0))
#' #check the R squared of the simluated data set
#' dat <- dataEx[[1]]
#' summary(lm(V2~factor(trt)*(V3+V4+V5+V6+V7+V8+V9+V10+V11+V12+V13+V14+V15+V16+
#' V17+V18+V19+V20+V21+V22+V23+V24+V25+V26+V27+V28+V29+V30+V31+V32),data=dat))
#'
#' bigData <- data_generator3(n = 10000,co = co,bet =dataEx[[2]], inter = c(0,0))
#' @name data_generator
#' @export
data_generator1 <- function(d, #effect size for the data set
R2, # r square for the data set
v2, #the SSR for treatment group 1
n, # number of observation for each treatment group
co, # here co is a matrix rather than a list, which assume that all treatment group's design matrix have the same covariance
beta1, #the coefficient of beta1
inter) #the intercept for each treatment group
{
a2 = a0 = 2*d^2+(2-2*R2)*v2*d^2/R2 -1
a1 = 2
b = (-a1 + sqrt(a1^2 - 4*a0*a2))/2/a2
s = 0.5*(1+b^2)*v2*(1-R2)/R2
temp <- v2/(t(beta1) %*% co %*% beta1)
bet <- vector("list",2)#Here we create this list for the mod3 below
beta1 <- beta1 * sqrt(temp)
beta2 <- beta1 * b
bet[[1]] <- beta1
bet[[2]] <- beta2
X <- vector("list",2)#Here we create this list for the mod3 below
X[[1]] <- x1 <- mvrnorm(n,rep(0,ncol(co)),co)
X[[2]] <- x2 <- mvrnorm(n,rep(0,ncol(co)),co)
Y <- vector("list",2)#Here we create this list for the mod3 below
Y[[1]] <- y1 <- x1 %*% beta1 + matrix(rnorm(n)*sqrt(s),n,1) + inter[1]
Y[[2]] <- y2 <- x2 %*% beta2 + matrix(rnorm(n)*sqrt(s),n,1) + inter[2]
trt1<- matrix(0,n,1)
trt2<- matrix(1,n,1)
dat1 <- as.matrix(cbind(y1,x1))
dat2 <- as.matrix(cbind(y2,x2))
dat <- as.data.frame(rbind(cbind(trt1,dat1), cbind(trt2,dat2)))
colnames(dat)[1] <- "trt"
results <- list("dat" = dat,
"bet" = bet,
"error_12" = c(sqrt(s),t(beta1) %*% co %*% beta1,t(beta2) %*% co %*% beta2))
return(results)
}
#' @name data_generator
#' @export
data_generator2 <- function(n, #A number specifying the number of observation for each group
co, #co is the covariate covariance that is identical between treatment groups
R2, #A scalar specifying the proportion of variance explained (R^2)
#by the predictors, same in all groups
bet, #A two element list of legnth p vectors recording covariate coefficients for two treatment groups respectively
inter) #Vector of treatment groups' intercepts
{
bet1 <- as.matrix(bet[[1]])
bet2 <- as.matrix(bet[[2]])
X <- vector("list",2)#Here we create this list for the mod3 below
X[[1]] <- x1 <- mvrnorm(n,rep(0,ncol(co)),co)
X[[2]] <- x2 <- mvrnorm(n,rep(0,ncol(co)),co)
s2 <- 0.5 * (1/R2-1) * (t(bet1) %*% co %*% bet1 + t(bet2) %*% co %*% bet2)
y1 <- x1 %*% bet1 + matrix(rnorm(n)*sqrt(s2),n,1) + inter[1]
y2 <- x2 %*% bet2 + matrix(rnorm(n)*sqrt(s2),n,1) + inter[2]
trt1<- matrix(0,n,1)
trt2<- matrix(1,n,1)
dat1 <- as.matrix(cbind(y1,x1))
dat2 <- as.matrix(cbind(y2,x2))
dat <- as.data.frame(rbind(cbind(trt1,dat1), cbind(trt2,dat2)))
colnames(dat)[1] <- "trt"
results <- list("dat" = dat,
"bet" = bet,
"error" = c(sqrt(s2)))
return(results)
}
#' @name data_generator
#' @export
data_generator3 <- function(n, #the number of oberservations
co, #co is the covariate covariance that is identical between treatment groups
bet, #A two element list of legnth p vectors recording covariate coefficients for two treatment groups respectively
inter) #Vector of treatment groups' intercepts
{
p <- ncol(co) #The number of predictors
bet0 <- bet[[1]]
bet1 <- bet[[2]]
X <- as.matrix(mvrnorm(n,rep(0,ncol(co)),co))
y0 <- as.matrix(X %*% bet0 + inter[1])
y1 <- as.matrix(X %*% bet1 + inter[2])
optTrt <- y0<=y1
oracle <- sum(y0*(1-optTrt)+y1*optTrt)/length(optTrt)
inv_oracle <- sum(y0*optTrt+y1*(1-optTrt))/length(optTrt)
results <- list("y0" = y0, #1
"y1" = y1, #2
"X" = X, #3
"oracle"=oracle, #4
"invOracle" = inv_oracle) #5
return(results)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.