#' @title Data generator function for extendAUG
#'
#' @description Simulates a full data set with one binary exposure, one
#'   continuous exposure (age), a true binary outcome \code{y} drawn from a
#'   logistic model and an error-prone surrogate outcome \code{s}. Part of the
#'   age values is then set to missing and imputed with \code{mice}, and a
#'   random validation subset of size \code{m} is drawn.
#'
#' @param n size of full data: integer, greater than 1000 (i.e., 2,000, 5,000, 10,000)
#' @param m size of validation set (i.e., 100, 200, 400, & 800); must not
#'   exceed \code{n}
#'
#' @return A named list (input for the augmented_est function) with elements
#'   \describe{
#'     \item{x}{full design data: a constant column of 1s, the binary exposure
#'       and the imputed, standardized continuous exposure}
#'     \item{s}{surrogate (error-prone) outcome for all \code{n} records}
#'     \item{y}{true outcome for all \code{n} records}
#'     \item{val_x}{rows of \code{x} belonging to the validation set}
#'     \item{val_s}{elements of \code{s} belonging to the validation set}
#'     \item{val_y}{elements of \code{y} belonging to the validation set}
#'   }
#' @import matlib MASS mice robustHD parallel
#' @importFrom mice complete
#' @importFrom wakefield r_sample_binary
#' @importFrom robustHD standardize
#' @importFrom stats qnorm rbinom
#' @importFrom utils stack
#' @export
data_generator <- function(n = 2000, m = 100) {
  # Fail early with a message that names the offending argument instead of
  # letting sample() complain about the population size later on.
  if (!is.numeric(n) || length(n) != 1L || !is.finite(n) || n < 1) {
    stop("`n` must be a single finite number >= 1", call. = FALSE)
  }
  if (!is.numeric(m) || length(m) != 1L || !is.finite(m) || m < 0 ||
      trunc(m) > trunc(n)) {
    stop("`m` must be a single finite number between 0 and `n`",
         call. = FALSE)
  }

  se1 <- 0.90 # sensitivity, Pr(s = 1 | y = 1)
  sp1 <- 0.95 # specificity, Pr(s = 0 | y = 0)
  # Intercept: the lower the value, the rarer the disease. With the intercept
  # alone the inverse-logit gives 1 / (1 + exp(1)) ~= 0.269.
  b0 <- -1.0
  b1 <- 0.5
  b2 <- 1
  beta <- c(b0, b1, b2)
  print(paste0(
    "data size is ", n, "; betas are ", b0, ", ", b1, ", ", b2,
    "; m is ", m
  ))

  # ---- exposures ----
  # binary exposure, coded 1/2 (family history of breast or ovarian cancer Y/N)
  x1 <- r_sample_binary(n, x = 1:2, prob = NULL, name = "Binary")
  # continuous exposure (age, 12 to 80)
  x2 <- sample(12:80, size = n, replace = TRUE)
  x2_temp <- standardize(x2)
  x_temp <- cbind(x1, x2_temp)

  # ---- true outcome from a logistic model ----
  z <- beta %*% t(cbind(1, x_temp))
  pr <- as.vector(1 / (1 + exp(-z))) # inverse-logit of the linear predictor
  y <- rbinom(n, 1, pr)

  # ---- surrogate outcome: y observed with misclassification ----
  pr_s <- se1 * (y == 1) + (1 - sp1) * (y == 0)
  s <- rbinom(n, 1, pr_s)

  # ---- step I: make part of the age values missing ----
  # Picks floor(fraction * length(idx)) elements of idx. Indexing through
  # sample.int() avoids sample()'s special case for a length-one numeric x.
  pick_fraction <- function(idx, fraction) {
    idx[sample.int(length(idx), size = floor(length(idx) * fraction))]
  }
  x2[pick_fraction(which(x2 > 50), 0.10)] <- NA  # 10% missing for age > 50
  x2[pick_fraction(which(x2 <= 50), 0.05)] <- NA # 5% missing for age <= 50

  # ---- step II: impute the missing values ----
  # The local names x1/x2 become the column names seen by mice and returned
  # in `x`, so they are kept as they are.
  imputed <- mice(cbind(x1, x2), printFlag = FALSE)
  completed <- complete(imputed, 1) # first imputed data set

  # ---- validation subset ----
  id <- sample.int(nrow(completed), m, replace = FALSE)
  x <- cbind(1, completed)
  x[, 3] <- standardize(x[, 3]) # re-standardize age after imputation

  list(
    x = x, s = s, y = y,
    val_x = x[id, ], val_s = s[id], val_y = y[id]
  )
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.