R/simulateDatav02.R

Defines functions simulateMissDfYorX

Documented in simulateMissDfYorX

#--------Simulating data of size n------------------------
#' Simulate missing covariate or missing response data from input covariates
#'
#' @description Generates a binary response from a logistic model on the
#'   leading columns of \code{dataCov} and then introduces missing values.
#'   With \code{ymiss = FALSE} missing values are created in the second and
#'   third columns of \code{dataCov} using predefined mechanisms; with
#'   \code{ymiss = TRUE} missing values are created in the response using the
#'   supplied \code{truealpha}.
#' @param dataCov input covariate data, one column per covariate. It needs at
#'   least \code{length(truebeta) - 1} columns, at least 3 columns when
#'   \code{ymiss = FALSE}, and at least \code{length(truealpha) - 2} columns
#'   when \code{ymiss = TRUE}.
#' @param truebeta the beta parameter (intercept first) to be used to generate
#'   binary responses 1/0 s \code{logit(y=1)=x1+x2+x3}
#' @param truealpha to be used to generate nonignorable missing values in the
#'   response based on the model \code{logit(R=1)=y+x1+x2+x3+x4+..}, where
#'   \code{R=1} marks a missing response. Only used when \code{ymiss = TRUE}.
#' @param x2Mar length-3 vector (intercept, x1, y) to be used to generate
#'   missing values in x2 based on the model \code{logit(x2=missing)=x1+y}.
#'   Only used when \code{ymiss = FALSE}.
#' @param nsim number of simulated datasets, default is 1
#' @param ymiss to be used for missing responses, default is FALSE
#' @return returns a list with the complete data called \code{dataOriginal}
#'   and the data with missing values called \code{dataMissing}. Both stack
#'   the \code{nsim} replicates and label them in the column \code{nsim} as
#'   "Sim1", "Sim2", ... When \code{ymiss = TRUE}, \code{dataMissing} also
#'   holds the missingness indicator \code{R}.
#' @export
#'
#' @examples
#' demo_df <- simulateCovariateData(100, nCov=6)
#' simulated_df <- simulateMissDfYorX(demo_df, nsim=2)
#' testMissData <- simulated_df$dataMissing
#' head(testMissData)
#'
#--------Simulating data of size n------------------------
simulateMissDfYorX <- function(dataCov, truebeta = c(1, -1, 1, 5),
                               truealpha = c(-1, 5, -1, -1, -1, 0.01),
                               x2Mar = c(1, -1, -1), ymiss = FALSE,
                               nsim = 1) {
  # ---- validate inputs early so failures carry a readable message ----------
  if (is.null(dim(dataCov))) {
    stop("`dataCov` must be a data frame or matrix of covariates.",
         call. = FALSE)
  }
  if (!is.numeric(nsim) || length(nsim) != 1L || is.na(nsim) || nsim < 0) {
    stop("`nsim` must be a single non-negative number.", call. = FALSE)
  }
  if (length(ymiss) != 1L || is.na(ymiss)) {
    stop("`ymiss` must be a single TRUE or FALSE.", call. = FALSE)
  }
  miss_in_y <- isTRUE(as.logical(ymiss))
  n_sim <- as.integer(nsim)
  n_cov <- ncol(dataCov)

  n_beta_cov <- length(truebeta) - 1L
  if (n_beta_cov < 0L || n_beta_cov > n_cov) {
    stop("`truebeta` needs an intercept plus at most ncol(dataCov) slopes.",
         call. = FALSE)
  }

  # Inverse logit, written out as in the linear-predictor models above.
  inv_logit <- function(eta) 1 / (1 + exp(-eta))

  # ---- outcome model: P(y = 1) is the same for every replicate -------------
  x_beta <- as.matrix(dataCov[, seq_len(n_beta_cov), drop = FALSE])
  design_y <- cbind(intercept = 1, x_beta)
  prob_y <- inv_logit(design_y %*% truebeta)
  n_obs <- nrow(design_y)

  # ---- loop-invariant pieces of the missingness models ---------------------
  if (miss_in_y) {
    n_alpha <- length(truealpha)
    if (n_alpha < 1L || n_alpha - 2L > n_cov) {
      stop("`truealpha` needs at most ncol(dataCov) + 2 coefficients.",
           call. = FALSE)
    }
    # Columns of the R-model design after the intercept and y.
    x_alpha <- as.matrix(
      dataCov[, seq_len(max(n_alpha - 2L, 0L)), drop = FALSE]
    )
  } else {
    if (n_cov < 3L) {
      stop("`dataCov` needs at least 3 columns when `ymiss = FALSE`.",
           call. = FALSE)
    }
    if (length(x2Mar) != 3L) {
      stop("`x2Mar` must have 3 coefficients (intercept, x1, y).",
           call. = FALSE)
    }
    # First covariate (x1) drives both covariate-missingness models.
    x1 <- as.matrix(dataCov[, 1L, drop = FALSE])
    # Fixed coefficients (intercept, x1, x2-missing flag, y) for x3.
    x3_coef <- c(1, -1, 1, -2)
  }

  # Preallocate one slot per replicate instead of growing with rbind().
  original_runs <- vector("list", n_sim)
  missing_runs <- vector("list", n_sim)

  for (i in seq_len(n_sim)) {
    sim_label <- rep(paste0("Sim", i), n_obs)

    # One Bernoulli draw per row.
    y <- rbinom(n_obs, size = 1, prob = prob_y)
    original_i <- data.frame(y, dataCov)
    temp_df <- dataCov

    if (miss_in_y) {
      #---------creating missing values in the response---------------------
      # Design is (intercept, y, x1, x2, ...) cut to length(truealpha).
      design_r <- cbind(1, y, x_alpha)[, seq_len(n_alpha), drop = FALSE]
      prob_r <- inv_logit(design_r %*% truealpha)
      R <- rbinom(n_obs, size = 1, prob = prob_r)

      # R == 1 (or an undefined R) marks the response as missing.
      y_obs <- y
      y_obs[is.na(R) | R == 1L] <- NA_integer_
      missing_i <- data.frame(y = y_obs, temp_df, R = R)
    } else {
      #---------creating missing values in x2 (second column)---------------
      # logit P(x2 missing) = x2Mar[1] + x2Mar[2] * x1 + x2Mar[3] * y
      prob_x2 <- inv_logit(cbind(1, x1, y) %*% x2Mar)
      miss_x2 <- rbinom(n_obs, size = 1, prob = prob_x2)
      temp_df[which(miss_x2 == 1L), 2L] <- NA

      #---------creating missing values in x3 (third column)----------------
      # Depends on x1, on whether x2 was set to missing, and on y.
      prob_x3 <- inv_logit(cbind(1, x1, miss_x2, y) %*% x3_coef)
      miss_x3 <- rbinom(n_obs, size = 1, prob = prob_x3)
      temp_df[which(miss_x3 == 1L), 3L] <- NA

      missing_i <- data.frame(y, temp_df)
    }

    missing_i$nsim <- sim_label
    original_i$nsim <- sim_label
    missing_runs[[i]] <- missing_i
    original_runs[[i]] <- original_i
  }

  # Stack the replicates once; with no replicates return empty data frames.
  stack_runs <- function(runs) {
    if (length(runs) == 0L) {
      return(data.frame())
    }
    do.call(rbind, runs)
  }

  #---dataOriginal is the complete data with no missing values, dataMissing
  #---is the same data after missing values were created in y or in the X's.
  list(
    dataOriginal = stack_runs(original_runs),
    dataMissing = stack_runs(missing_runs)
  )
}

Try the glmfitmiss package in your browser

Any scripts or data that you put into this service are public.

glmfitmiss documentation built on June 8, 2025, 1:59 p.m.