R/missing_data.r

Defines functions mar_missing random_missing dropout_locator dropout_helper dropout_missing generate_missing missing_data

Documented in dropout_missing generate_missing mar_missing missing_data random_missing

#' Missing Data Functions
#' 
#' Function that inputs simulated data and returns data frame with 
#' new response variable that includes missing data. Missing data types
#' incorporated include dropout missing data, missing at random, and 
#' random missing data.
#' 
#' @param sim_data Simulated data frame
#' @param resp_var Character string of response variable with complete data.
#' @param new_outcome Character string of new outcome variable name that includes
#'   the missing data.
#' @param clust_var Cluster variable used for the grouping, set to 
#'           NULL by default which means no clustering.
#' @param within_id ID variable within each cluster.
#' @param miss_prop Proportion of missing data overall 
#' @param dropout_location A vector the same length as the number of clusters 
#'   representing the number of data observations for each individual.
#' @param type The type of missing data to generate, currently supports
#'           dropout, random, or missing at random (mar) missing data.
#' @param miss_cov Covariate that the missing values are based on.
#' @param mar_prop Proportion of missing data for each unique value 
#'   specified in the miss_cov argument.
#' @export 
#' @rdname missing
missing_data <- function(sim_data, resp_var = 'sim_data',
                         new_outcome = 'sim_data2',
                         clust_var = NULL, within_id = NULL, miss_prop = NULL,
                         dropout_location = NULL,
                         type = c('dropout', 'random', 'mar'),
                         miss_cov, mar_prop) {
  switch(type,
         dropout = dropout_missing(sim_data, resp_var, new_outcome, clust_var, 
                                   within_id, miss_prop, dropout_location),
         random = random_missing(sim_data, resp_var, new_outcome, miss_prop, 
                                 clust_var, within_id),
         mar = mar_missing(sim_data, resp_var, new_outcome, miss_cov, mar_prop)
         )
}

#' Tidy Missing Data Function
#' 
#' @param data Data simulated from other functions to pass to this function.
#' @param sim_args A named list with special model formula syntax. See details and examples
#'   for more information. The named list may contain the following:
#'   \itemize{
#'     \item fixed: This is the fixed portion of the model (i.e. covariates)
#'     \item random: This is the random portion of the model (i.e. random effects)
#'     \item error: This is the error (i.e. residual term).
#'   }
#' @export 
generate_missing <- function(data, sim_args) {
  
  resp_var <- parse_formula(sim_args)[['outcome']]
  
  purrr::exec(missing_data,
                !!!sim_args[['missing_data']],
                sim_data = data,
                resp_var = resp_var,
                within_id = 'level1_id')
  
  # purrr::invoke("missing_data",
  #               sim_args[['missing_data']],
  #               sim_data = data,
  #               resp_var = resp_var,
  #               within_id = 'level1_id')
}


#' @param sim_data Simulated data frame
#' @param resp_var Character string of response variable with complete data.
#' @param new_outcome Character string of new outcome variable name that includes
#'   the missing data.
#' @param clust_var Cluster variable used for the grouping, set to 
#'           NULL by default which means no clustering.
#' @param within_id ID variable within each cluster.
#' @param miss_prop Proportion of missing data overall 
#' @param dropout_location A vector the same length as the number of clusters 
#'   representing the number of data observations for each individual.
#' @export
#' @rdname missing 
dropout_missing <- function(sim_data, resp_var = 'sim_data', 
                            new_outcome = 'sim_data2', 
                        clust_var = 'clustID', within_id = "withinID", 
                        miss_prop = NULL, dropout_location = NULL) {
  
  if(resp_var %ni% names(sim_data)) {
    stop(paste(resp_var, 'not found in variables of data supplied'))
  }
  if(clust_var %ni% names(sim_data)) {
    stop(paste(clust_var, 'not found in variables of data supplied'))
  }
  
  sim_data <- data.frame(sim_data)
  
  len_groups <- tapply(sim_data[, resp_var], 
                       sim_data[, clust_var], 
                       length)
  
  num_obs <- length(unique(sim_data[, clust_var]))
  
  if(is.null(dropout_location)) {
    if(miss_prop > 1) {
      miss_prop <- miss_prop / 100
    }
    n_groups <- length(unique(sim_data[, clust_var]))
    drop_missing <- data.frame(clust_var = seq_len(n_groups))
    colnames(drop_missing) <- clust_var
    drop_missing['missing_clust'] <- ifelse(runif(n_groups) < miss_prop, 1, 0)
    
    len_groups_people_missing <- len_groups[drop_missing[drop_missing[['missing_clust']] == 1,'id']]
    
    dropout_location <- unlist(lapply(seq_along(drop_missing[['missing_clust']]), function(xx) 
      dropout_helper(drop_missing[['missing_clust']][xx], len_groups[xx]))
    )
  }
    
  num_missing <- round(len_groups - dropout_location, 0)

    
  missing_obs <- lapply(seq_along(num_missing), function(xx) 
    (len_groups[xx] - num_missing[xx]):len_groups[xx])
  
  data_split <- split(sim_data, sim_data[, clust_var])
  
  sim_data['missing'] <- do.call("c", lapply(seq_along(missing_obs), function(xx)
    dropout_locator(missing_number = num_missing[[xx]],
                    num_obs = len_groups[[xx]], missing_time = missing_obs[[xx]])
    )
    )
  
  sim_data[new_outcome] <- sim_data[resp_var]
  sim_data[sim_data['missing'] == 1, new_outcome] <- NA
  
  sim_data
}

dropout_helper <- function(data, num_obs) {
  if(data == 0) {
    num_obs
  } else {
    round(runif(1, min = 2, max = num_obs), 0)
  }
}

dropout_locator <- function(missing_number, num_obs, missing_time) {
  miss <- rep(0, num_obs)
  if(missing_number != 0) {
    miss[missing_time] <- 1
  }
  miss
}


#' @param sim_data Simulated data frame
#' @param resp_var Character string of response variable with complete data.
#' @param new_outcome Character string of new outcome variable name that includes
#'   the missing data.
#' @param miss_prop Proportion of missing data overall 
#' @param clust_var Cluster variable used for the grouping, set to 
#'           NULL by default which means no clustering.
#' @param within_id ID variable within each cluster.
#' @export 
#' @rdname missing
random_missing <- function(sim_data, resp_var = 'sim_data', 
                           new_outcome = 'sim_data2', miss_prop,
                           clust_var = NULL, within_id = "withinID") {
  if(resp_var %ni% names(sim_data)) {
    stop(paste(resp_var, 'not found in variables of data supplied'))
  }
  
  sim_data <- data.frame(sim_data)

  if(is.null(clust_var)) {
    sim_data['miss_prob'] <- round(runif(nrow(sim_data)), 3)
    sim_data['missing'] <- ifelse(sim_data['miss_prob'] < miss_prop, 1, 0)
    sim_data[new_outcome] <- sim_data[resp_var]
    sim_data[sim_data['missing'] == 1, new_outcome] <- NA
  } else {
    if(clust_var %ni% names(sim_data)) {
      stop(paste(clust_var, 'not found in variables of data supplied'))
    }
    
    len_groups <- tapply(sim_data[, resp_var], 
                         sim_data[, clust_var], 
                         length)
    
    num_obs <- nrow(sim_data)
    
    if(length(miss_prop) == 1) {
      if(miss_prop > 1) {
        miss_prop <- miss_prop / 100
      }
      total_missing <- num_obs * miss_prop
      missing_range <- round((total_missing *.98):(total_missing * 1.02))
      n_groups <- with(sim_data, length(unique(eval(parse(text = clust_var)))))
      
      lim <- prop_limits(miss_prop)
      
      num_missing <- 0
      while(sum(num_missing) %ni% missing_range) {
        num_missing <- round(len_groups * round(runif(n_groups, lim[1], 
                                                      lim[2]), 2))
      }
      
    } else {
      num_missing <- round(len_groups * miss_prop, 0)
    }
    
    missing_obs <- lapply(1:length(num_missing), function(xx) 
      sample(1:len_groups[xx], num_missing[xx]))
    
    data_split <- split(sim_data, sim_data[, clust_var])
    
    sim_data['missing'] <- do.call("c", lapply(1:length(missing_obs), function(xx)
      ifelse(data_split[[xx]][, within_id] %in% missing_obs[[xx]], 1, 0)))
    
    sim_data[new_outcome] <- sim_data[resp_var]
    sim_data[sim_data['missing'] == 1, new_outcome] <- NA
  }

  sim_data
}

#' @param sim_data Simulated data frame
#' @param resp_var Character string of response variable with complete data.
#' @param new_outcome Character string of new outcome variable name that includes
#'   the missing data.
#' @param miss_cov Covariate that the missing values are based on.
#' @param mar_prop Proportion of missing data for each unique value 
#'   specified in the miss_cov argument.
#' @importFrom dplyr count select left_join mutate
#' @export 
#' @rdname missing
mar_missing <- function(sim_data, resp_var = 'sim_data', 
                        new_outcome = 'sim_data2', miss_cov, mar_prop) {
  
  if(as.character(resp_var) %ni% names(sim_data)) {
    stop(paste(resp_var, 'not found in variables of data supplied'))
  }
  if(as.character(miss_cov) %ni% names(sim_data)) {
    stop(paste(miss_cov, 'not found in variables of data supplied'))
  }
  
  sim_data <- data.frame(sim_data)
  
  num_obs <- nrow(sim_data)

  var_enq <- rlang::sym(miss_cov)
  uniq_vals <- dplyr::count(sim_data, !!var_enq)
  
  if(nrow(uniq_vals) != length(mar_prop)) {
    uniq_vals[['group']] <- cut(uniq_vals[[miss_cov]], breaks = length(mar_prop), labels = FALSE)
    missing_prop <- data.frame(group = 1:length(mar_prop),
                               miss_prop = mar_prop)
    miss_per <- left_join(dplyr::select(uniq_vals, !!var_enq, group),
                      missing_prop, 
                      by = 'group')
  } else {
    miss_per <- cbind(dplyr::select(uniq_vals, !!var_enq), 
                      miss_prop = mar_prop)
  }
  sim_data <- dplyr::left_join(sim_data, miss_per, by = miss_cov)
  sim_data <- dplyr::mutate(sim_data, miss_prob = runif(nrow(sim_data)),
                            missing = ifelse(miss_prob < miss_prop, 1, 0))
  
  sim_data[new_outcome] <- sim_data[resp_var]
  sim_data[sim_data['missing'] == 1, new_outcome] <- NA
  
  sim_data
}
lebebr01/simglm documentation built on April 8, 2024, 9:03 p.m.