R/generate_data_202203.R

Defines functions generate_Zheng_data_survival_202203

library(dplyr)
library(purrr)

# sample_size <- 100  # n
# tau <- 4  # last time point
# seed <- 202008  # random seed

generate_Zheng_data_survival_202203 <- function(sample_size, tau, seed = NULL, setAM = NULL, if_LY_misspec = F, if_A_misspec = F, event_label = 1,
                                                if_competing_risk = F, if_omit_censoring = F) {
    if (is.null(seed)) {} else set.seed(seed)

    # time point 0
    t <- 0
    L01 <- rbinom(n = sample_size, size = 1, prob = 0.4)
    L02 <- rbinom(n = sample_size, size = 1, prob = 0.6)
    temp_data <- list()
    temp_data[[1]] <- data.frame(L1 = L01, L2 = L02)
    t <- t + 1  # note that t 0 value is associated with the first time point; t is actually the t-1 time point in the paper

    if(is.null(setAM)) {
        # time point 1~tau
        # t = 1, ..., tau; to update the t-th time point in the t+1 slot
        while(t <= tau) {
            if (!if_A_misspec) {
                temp_A_C <- map_dbl(.x = expit(1.5 - 0.8*L02 - 0.4*ifelse_vec(t > 1, temp_data[[t]]$L1, L01) + 0.5*ifelse_vec(t>1, temp_data[[t]]$A_E, 0)),
                                    .f = ~ rbinom(n = 1, size = 1, prob = .x)
                )
                if(if_omit_censoring) temp_A_C <- 1
                temp_A_E <- map_dbl(.x = expit(- 0.1 + 1.2*L02 + 0.7*ifelse_vec(t > 1, temp_data[[t]]$L1, L01) - 0.1*ifelse_vec(t>1, temp_data[[t]]$A_E, 0)),
                                    .f = ~ rbinom(n = 1, size = 1, prob = .x)
                )
            } else {  # all quadratic
                temp_A_C <- map_dbl(.x = expit(1.5 - 0.8*L02^2 - 0.4*ifelse_vec(t > 1, temp_data[[t]]$L1, L01)^2 + 0.5*ifelse_vec(t>1, temp_data[[t]]$A_E, 0)^2),
                                    .f = ~ rbinom(n = 1, size = 1, prob = .x)
                )
                if(if_omit_censoring) temp_A_C <- 1
                temp_A_E <- map_dbl(.x = expit(- 0.1 + 1.2*L02^2 + 0.7*ifelse_vec(t > 1, temp_data[[t]]$L1, L01)^2 - 0.1*ifelse_vec(t>1, temp_data[[t]]$A_E^2, 0)^2),
                                    .f = ~ rbinom(n = 1, size = 1, prob = .x)
                )
            }

            temp_R <- map_dbl(.x = expit(- 0.8 + temp_A_E + 0.1*ifelse_vec(t > 1, temp_data[[t]]$L1, L01) + 0.3*ifelse_vec(t>1, temp_data[[t]]$R, L02)),
                              .f = ~ rbinom(n = 1, size = 1, prob = .x)
            )
            temp_Z <- map_dbl(.x = expit(- 0.5 + 0.8*L02 + 0.8*temp_A_E + temp_R),
                              .f = ~ rbinom(n = 1, size = 1, prob = .x)
            )

            if (!if_LY_misspec) {
                # default, correct data
                temp_L <- map_dbl(.x = expit(- 1 + 0.3*L02 + temp_A_E + 0.7*temp_Z - 0.2*ifelse_vec(t>1, temp_data[[t]]$L1, 0)),
                                  .f = ~ rbinom(n = 1, size = 1, prob = .x)
                )
                temp_Y <- map_dbl(.x = expit((0.2 + 1.5*L02 + temp_R + 0.2*temp_L - 0.3*temp_A_E - 0.3*temp_Z
                                             # - 0.2*temp_A_E*temp_Z
                                             - 0.1*ifelse_vec(t>1, temp_data[[t]]$R, 0))/4 ),
                                  .f = ~ rbinom(n = 1, size = 1, prob = .x)
                )
                # temp_Y <- 1-temp_Y
                if (if_competing_risk) {
                    temp_Y2 <- map_dbl(.x = expit(-0.5 + 0.1*temp_R + 0.1*temp_L - 0.1*temp_A_E - 0.1*temp_Z
                                                  - 0.1*ifelse_vec(t>1, temp_data[[t]]$R, 0) ),
                                       .f = ~ rbinom(n = 1, size = 1, prob = .x)
                    )
                }
            } else {
                # LY misspec with squares
                # temp_L <- map_dbl(.x = scale_bounded_relu(- 1 + 0.3*L02 + temp_A_E^2 + 0.7*temp_Z^2 - 0.2*ifelse_vec(t>1, temp_data[[t]]$L1, 0), upper = 4),
                #                   .f = ~ rbinom(n = 1, size = 1, prob = .x)
                # )
                # temp_Y <- map_dbl(.x = scale_bounded_relu(0.2 + 1.5*L02 + temp_R + 0.2*temp_L - 0.3*temp_A_E - 0.3*temp_Z
                #                                           - 0.2*temp_A_E*temp_Z
                #                                           - 0.1*ifelse_vec(t>1, temp_data[[t]]$R, 0), upper = 4),
                #                   .f = ~ rbinom(n = 1, size = 1, prob = .x)
                # )
                # temp_Y <- 1-temp_Y
                temp_L <- map_dbl(.x = expit(- 1 + 0.3*L02^2 + temp_A_E^2 + 0.7*temp_Z^2 - 0.2*ifelse_vec(t>1, temp_data[[t]]$L1, 0)^2),
                                  .f = ~ rbinom(n = 1, size = 1, prob = .x)
                )
                temp_Y <- map_dbl(.x = expit((0.2 + 1.5*L02^2 + temp_R^2 + 0.2*temp_L^2 - 0.3*temp_A_E^2 - 0.3*temp_Z^2
                                             # - 0.2*temp_A_E*temp_Z^2
                                             - 0.1*ifelse_vec(t>1, temp_data[[t]]$R, 0)^2)/4 ),
                                  .f = ~ rbinom(n = 1, size = 1, prob = .x)
                )
                # temp_Y <- 1-temp_Y
                if (if_competing_risk) {
                    temp_Y2 <- map_dbl(.x = expit(-0.5 + 0.1*temp_R^2 + 0.1*temp_L^2 - 0.1*temp_A_E^2 - 0.1*temp_Z^2
                                                  - 0.1*ifelse_vec(t>1, temp_data[[t]]$R, 0)^2 ),
                                       .f = ~ rbinom(n = 1, size = 1, prob = .x)
                    )
                }

            }
            temp_data[[t + 1]] <- data.frame(A_C = temp_A_C,
                                             A_E = temp_A_E,
                                             R = temp_R,
                                             Z = temp_Z,
                                             L1 = temp_L,
                                             Y = temp_Y
            )
            if (if_competing_risk){
                temp_Y2[which(temp_Y == 1)] <- 0  # if dead, cannot have competing event Y2
                temp_data[[t + 1]]$Y2 <- temp_Y2
            }
            t <- t + 1
        }
    } else {
        # for mediation targets
        # time point 1~tau
        # t = 1, ..., tau; to update the t-th time point in the t+1 slot
        while(t <= tau) {
            # omit censoring for now
            temp_A_E <- rep(setAM[1], sample_size)
            temp_A_C <- 1
            temp_R <- map_dbl(.x = expit(- 0.8 + temp_A_E + 0.1*ifelse_vec(t > 1, temp_data[[t]]$L1, L01) + 0.3*ifelse_vec(t>1, temp_data[[t]]$R, L02)),
                              .f = ~ rbinom(n = 1, size = 1, prob = .x)
            )
            temp_Z <- map_dbl(.x = expit(- 0.5 + 0.8*L02 + 0.8*rep(setAM[2], sample_size) + temp_R),
                              .f = ~ rbinom(n = 1, size = 1, prob = .x)
            )
            if (!if_LY_misspec) {
                # default, correct data
                temp_L <- map_dbl(.x = expit(- 1 + 0.3*L02 + temp_A_E + 0.7*temp_Z - 0.2*ifelse_vec(t>1, temp_data[[t]]$L1, 0)),
                                  .f = ~ rbinom(n = 1, size = 1, prob = .x)
                )
                temp_Y <- map_dbl(.x = expit((0.2 + 1.5*L02 + temp_R + 0.2*temp_L - 0.3*temp_A_E - 0.3*temp_Z
                                             # - 0.2*temp_A_E*temp_Z
                                             - 0.1*ifelse_vec(t>1, temp_data[[t]]$R, 0))/4 ),
                                  .f = ~ rbinom(n = 1, size = 1, prob = .x)
                )
                # temp_Y <- 1-temp_Y
                if (if_competing_risk) {
                    temp_Y[temp_data[[t]]$Y2 == 1] <- 0  # remove cause specific death if there has been a competing condition
                    temp_Y2 <- map_dbl(.x = expit(-1.5 + 0.1*temp_R + 0.1*temp_L - 0.1*temp_A_E - 0.1*temp_Z
                                                  - 0.1*ifelse_vec(t>1, temp_data[[t]]$R, 0) ),
                                       .f = ~ rbinom(n = 1, size = 1, prob = .x)
                    )
                }
            } else {
                # LY misspec
                # temp_L <- map_dbl(.x = scale_bounded_relu(- 1 + 0.3*L02 + temp_A_E^2 + 0.7*temp_Z^2 - 0.2*ifelse_vec(t>1, temp_data[[t]]$L1, 0), upper = 4),
                #                   .f = ~ rbinom(n = 1, size = 1, prob = .x)
                # )
                # temp_Y <- map_dbl(.x = scale_bounded_relu(0.2 + 1.5*L02 + temp_R + 0.2*temp_L - 0.3*temp_A_E - 0.3*temp_Z
                #                                           - 0.2*temp_A_E*temp_Z
                #                                           - 0.1*ifelse_vec(t>1, temp_data[[t]]$R, 0), upper = 4),
                #                   .f = ~ rbinom(n = 1, size = 1, prob = .x)
                # )
                # temp_Y <- 1-temp_Y
                temp_L <- map_dbl(.x = expit(- 1 + 0.3*L02^2 + temp_A_E^2 + 0.7*temp_Z^2 - 0.2*ifelse_vec(t>1, temp_data[[t]]$L1, 0)^2),
                                  .f = ~ rbinom(n = 1, size = 1, prob = .x)
                )
                temp_Y <- map_dbl(.x = expit((0.2 + 1.5*L02^2 + temp_R^2 + 0.2*temp_L^2 - 0.3*temp_A_E^2 - 0.3*temp_Z^2
                                             # - 0.2*temp_A_E*temp_Z^2
                                             - 0.1*ifelse_vec(t>1, temp_data[[t]]$R, 0)^2)/4 ),
                                  .f = ~ rbinom(n = 1, size = 1, prob = .x)
                )
                # temp_Y <- 1-temp_Y
                if (if_competing_risk) {
                    temp_Y[temp_data[[t]]$Y2 == 1] <- 0  # remove cause specific death if there has been a competing condition
                    temp_Y2 <- map_dbl(.x = expit(-1.5 + 0.1*temp_R^2 + 0.1*temp_L^2 - 0.1*temp_A_E^2 - 0.1*temp_Z^2
                                                  - 0.1*ifelse_vec(t>1, temp_data[[t]]$R, 0)^2 ),
                                       .f = ~ rbinom(n = 1, size = 1, prob = .x)
                    )
                }
            }

            temp_data[[t + 1]] <- data.frame(A_C = temp_A_C,
                                             A_E = temp_A_E,
                                             R = temp_R,
                                             Z = temp_Z,
                                             L1 = temp_L,
                                             Y = temp_Y
            )
            if (if_competing_risk){
                temp_Y2[which(temp_Y == 1)] <- 0  # if dead, cannot have competing event Y2
                temp_data[[t + 1]]$Y2 <- temp_Y2
            }
            t <- t + 1
        }
    }

    for (t in 1:tau) {
        temp_if_censored <- temp_data[[t+1]]$A_C == 0
        for (s in t:tau) {
            for (l in 2:ncol(temp_data[[s+1]])) temp_data[[s+1]][[l]][temp_if_censored] <- NA
            temp_data[[s+1]][[1]][temp_if_censored] <- ifelse(s == t, 0, NA)  # censored at the time point t; delete after t
        }
    }
    if (tau > 1) {
        for (t in 1:(tau-1)) {
            temp_if_dead <- temp_data[[t+1]]$Y == event_label & (!is.na(temp_data[[t+1]]$Y))
            if (if_competing_risk) temp_if_compete <- temp_data[[t+1]]$Y2 == event_label & (!is.na(temp_data[[t+1]]$Y2))
            for (s in (t+1):tau) for (l in 1:ncol(temp_data[[s+1]])) {
                if (colnames(temp_data[[s+1]])[l] == "Y"  ) {
                    temp_data[[s+1]][[l]][temp_if_dead] <- 1  # remain dead after death
                } else if (colnames(temp_data[[s+1]])[l] == "Y2") {
                    temp_data[[s+1]][[l]][temp_if_dead] <- 0  # remain no competing condition if dead
                }  else {
                    temp_data[[s+1]][[l]][temp_if_dead] <- NA  # dead at t, delete after t (including censoring node)
                }
                # remain no death if there has been a competing condition
                if (if_competing_risk) {
                    if (colnames(temp_data[[s+1]])[l] == "Y"  ) {
                        temp_data[[s+1]][[l]][temp_if_compete] <- 0  # remain no death after compete
                    } else if (colnames(temp_data[[s+1]])[l] == "Y2") {
                        temp_data[[s+1]][[l]][temp_if_compete] <- 1  # remain compete if already
                    }  else {
                        temp_data[[s+1]][[l]][temp_if_compete] <- NA  # compete at t, delete after t (including censoring node)
                    }
                }

            }
        }
    }

    for (s in 1:(tau + 1)) {
        colnames(temp_data[[s]]) <- paste0(colnames(temp_data[[s]]), "_", s-1)
    }

    return(temp_data)
}


# generate_Zheng_data_survival_202112(sample_size = 100, tau = 2, seed = 123, setAM = NULL, if_LY_misspec = F, if_A_misspec = F, event_label = 1) %>% lapply(function(x) head(x, 10))
# generate_Zheng_data_survival_202112(sample_size = 100, tau = 2, seed = 123, setAM = NULL, if_LY_misspec = F, if_A_misspec = F, event_label = 1)
# generate_Zheng_data_survival_202112(sample_size = 100000, tau = 2, seed = 123, setAM = c(1, 0), if_LY_misspec = F, if_A_misspec = F, event_label = 1)[[2]]$Y_1 %>% table(useNA = "always")
# generate_Zheng_data_survival_202112(sample_size = 100000, tau = 2, seed = 123, setAM = c(1, 0), if_LY_misspec = F, if_A_misspec = F, event_label = 1)[[3]]$Y_2 %>% table(useNA = "always")
# generate_Zheng_data_survival_202112(sample_size = 100000, tau = 3, seed = 123, setAM = c(1, 0), if_LY_misspec = F, if_A_misspec = F, event_label = 1)[[4]]$Y_3 %>% table(useNA = "always")



# generate_Zheng_data_survival_202112(sample_size = 100, tau = 2, seed = 123, setAM = NULL, if_LY_misspec = F, if_A_misspec = F, event_label = 1, if_competing_risk = T) %>% lapply(function(x) head(x, 10))
# generate_Zheng_data_survival_202112(sample_size = 100, tau = 2, seed = 123, setAM = NULL, if_LY_misspec = F, if_A_misspec = F, event_label = 1, if_competing_risk = T)
# generate_Zheng_data_survival_202112(sample_size = 100000, tau = 2, seed = 123, setAM = c(1, 0), if_LY_misspec = F, if_A_misspec = F, event_label = 1, if_competing_risk = T)[[3]]$Y_2 %>% table(useNA = "always")
# generate_Zheng_data_survival_202112(sample_size = 100000, tau = 2, seed = 123, setAM = c(1, 0), if_LY_misspec = F, if_A_misspec = F, event_label = 1, if_competing_risk = T)[[3]]$Y2_2 %>% table(useNA = "always")
zy-wang1/calm documentation built on July 30, 2024, 10:51 a.m.