R/data_generation.R

Defines functions data_gen

Documented in data_gen

#' @title Generation of Artificial Data
#'
#' @description The function generates a set of artificial data, including covariates generated by uniform
#' distribution with an interval \code{[0.5, 0.5]}, survival time and censoring status with measurement error and misclassifications.
#'  In this function, users can specify different degrees of measurement
#'  error that links observed survival time with true survival time, and links observed
#'  censoring status with true censoring status. Moreover, the accelerated functional failure time model considered in
#'  function is given by \code{T=f(X1)+f(X2)+f(X3)+f(X4)+error}, where \code{T} is log failure time and \code{f(X1)=4*x1^2+x1},
#'  \code{f(X2)=sin(6*x2)},\code{f(X3)=cos(6*x3)-1} and \code{f(X4)=4*x4^3+x4^2}.
#'
#'
#' @param n Sample size.
#' @param p The number of covariates.
#' @param pi_01 Misclassifcation probability is P(Observed Censoring Status = 0 | Actual Censoring Status = 1).
#' @param pi_10 Misclassifcation probability is P(Observed Censoring Status = 1 | Actual
#'  Censoring Status = 0).
#' @param gamma0 A scalar that links the observed survival time and true survival time in
#'  the classical additive measurement error model \code{y*=y+gamma0+gamma1*X+v}, where y* is observed survival time and \code{y} is true survival time, and \code{x} is covariates and v is noise term.
#' @param gamma1 A \code{p}-dimensional vector of parameters in the
#'  additive measurement error model \code{y*=y+gamma0+gamma1*X+v}, where \code{y*}
#'  is observed survival time and \code{y} is true survival time, \code{x} is covariates and \code{v} is
#'  noise term.
#' @param e_var The variance of noise term \code{v} in the additive measurement
#'  error model \code{y*=y+gamma0+gamma1*X+v}, where \code{v} is assumed to follow a normal
#'   distribution.
#'
#' @return generated_data \code{c(n,p+2)} dimensional data frame. The first column is observed survival time and
#'  second column is observed censoring status, and the other columns are covariates.
#'
#' @examples
#' ## Set the relationship between observed survival time
#' ## and true survival time equals y*= y+1+X1+v, where the variance is
#' ## 0.75 with n=500 and p=50 and misclassification probability=0.9.
#'
#' a <- matrix(0,ncol=50, nrow = 1); a[1,1] <- 1
#' data <- data_gen(n=500, p=50, pi_01=0.9, pi_10 = 0.9, gamma0=1,
#' gamma1=a, e_var=0.75)
#' @export
#' @importFrom stats "rnorm" "runif"
#'
#'

data_gen <- function(n,p,pi_01,pi_10,gamma0,gamma1,e_var){
  f1 <- function(x1){
    y <- 4*x1^2+x1
    return(y)
  }


  f2 <- function(x2){
    y <- sin(6*x2)
    return(y)
  }


  f3 <- function(x3){
    y <- cos(6*x3)-1
    return(y)
  }


  f4 <- function(x4){
    y <- 4*x4^3+x4^2
    return(y)
  }

  t<- c()
  e <- c()
  d <- c()
  covariates <- c()
  dim = p
  n = n

  for (i in c(1:n)){
    p <- runif(dim,-0.5,0.5)
    error <- rnorm(1,0,1)
    a <- f1(p[1])+f2(p[2])+f3(p[3])+f4(p[4])+error
    t[i] <- a
    e[i] <- error
    d <- t(data.frame(p))
    covariates <- rbind(covariates,d)
  }


  censoring_probability_n <- c()
  coefficients <-matrix(runif(dim,-5,5),nrow = 1)
  for(i in c(1:dim(covariates)[1])){
    variables <- t(matrix(covariates[i,],nrow = 1))
    df4 <- coefficients%*%variables
    censoring_probability <- (exp(df4))/(1+exp(df4))
    censoring_probability_n <- c(censoring_probability_n,censoring_probability)
  }

  censoring_indicator <- (censoring_probability_n >=0.5)*1

  y <- c()
  for (i in c(1:length(censoring_indicator))){
    if (censoring_indicator[i]==1){
      y <- c(y,t[i])
    }else{
      y <- c(y,t[i]-exp(0.003))
    }
  }

  df_for_y_with_measurement_error <- c()
  v <- data.frame(rnorm(length(censoring_indicator),0,e_var))
  constant <- data.frame(rep(gamma0,length(censoring_indicator)))


  b_cov <- c()
  for (i in 1:dim(covariates)[1]){
    b_cov[i] <- gamma1 %*% covariates[i,]
  }

  df_for_y_with_measurement_error <- cbind(y,constant,v,b_cov)
  colnames(df_for_y_with_measurement_error)[2:3] <-c('constant','v')
  y_with_measurement_error <- data.frame(apply(df_for_y_with_measurement_error,1,sum))


  misclassification_proportion  <- matrix(c(1-pi_01,pi_01,pi_10,1-pi_10), ncol = 2)
  censoring_probability_indicator_1 <- censoring_probability_n
  censoring_probability_indicator_0 <- 1 - censoring_probability_n
  df5 <- cbind(censoring_probability_indicator_1,censoring_probability_indicator_0)
  censoring_indicator_with_measurement_error <- NULL


  for (i in c(1:length(censoring_indicator))){
    df6 <- misclassification_proportion %*%df5[i,]
    if (df6[1,]>=0.5){
      censoring_indicator_with_measurement_error <- c(censoring_indicator_with_measurement_error,1)
    }else{
      censoring_indicator_with_measurement_error <- c(censoring_indicator_with_measurement_error,0)
    }
  }

  generated_data <- cbind(y_with_measurement_error, censoring_indicator_with_measurement_error, covariates)
  colnames(generated_data)[1:2] <- c('failure time', 'censoring indicator')
  return(generated_data)
}

Try the AFFECT package in your browser

Any scripts or data that you put into this service are public.

AFFECT documentation built on July 9, 2023, 6:45 p.m.