R/Generate_NA.R

Defines functions NA_Generator

Documented in NA_Generator

#' Function to introduce additional NAs for prediction error calculation. 
#'
#' @param SNP_df Original data of SNP. It may contain some SNP
#' @param percent The final percentage of total NA's we want in the dataset. 
#'
#' @return a list of variables:  
#' 1. SNP_NA_df: the dataset with additional SNPs.   
#' 2. NA_percent_orig: original missing percentable in the dataset.   
#' 3. NA_percent_generate: introduced NA percentage.   
#' 4. NP_generate_positions: the positions of the introduced NA's.   
#' @export
#'
#' @examples
#' data("SNP_orig_sub")
#' ## original NA ratio is 0.018
#' SNP_NA_df02 <- NA_Generator(SNP_orig_sub, 0.2)
#' ## Introduced another 18% of NAs. 
#' 
#' SNP_NA_df <- NA_Generator(SNP_orig_sub, 0.01)
#' ## Should report an error: alread has 1.8% NA's, higher than the 
#' ## target percentage. 
#' 
NA_Generator <- function(SNP_df, percent) {
  m <- nrow(SNP_df)
  n <- ncol(SNP_df)
  ## Get the NA positions
  NA_positions <- which(is.na(SNP_df))
  
  ## Get the not NA positions and counts.
  notNA_positions <- which(!is.na(SNP_df))
  notNA_len <- length(notNA_positions)
  
  ## Calculate the original percentage of NA's
  NA_percent_orig <- length(NA_positions) / (m * n)
  
  ## Calculate the percentage and counts of NA we need to generate. 
  NA_percent_generate <- percent - NA_percent_orig
  size = notNA_len * NA_percent_generate
  
  ## If the size of the number of NA to be generated is smaller than 1, then
  ## we cannot introduce any more NA's under current required total percentage.
  if(size < 1) stop(paste("There are already", NA_percent_orig * 100,
                  "% of NAs in the dataset. Cannot generate more."))
  
  ## Introduce random NAs to the original dataset.
  NP_generate_positions <- sample(notNA_positions, size = size)
  SNP_NA_df <- SNP_df
  SNP_NA_df[NP_generate_positions] <- NA
  
  ## Return the list of related information. 
  return(list(SNP_NA_df = SNP_NA_df, NA_percent_orig = NA_percent_orig, 
              NA_percent_generate = NA_percent_generate, NP_generate_positions = NP_generate_positions))
}
GaoGN517/689_SNP_FastImpute documentation built on Jan. 2, 2020, 11:44 a.m.