# R/impute_variables.R

#' @title Impute missing observations for a given dataset
#'
#' @description Impute missing observations for a given dataset.
#' Missing observations can be imputed using the mean, the mode, multiple
#' linear regression or binomial logistic regression.
#' Mean imputation is applied to numeric columns and mode imputation to factor
#' columns; columns of any other type are left unchanged.
#' When using the mice package, the NA values must only be in one column.
#'
#' @param dataset The dataset containing the missing observations to impute.
#' It is coerced to a data frame.
#'
#' @param percentage A single number (a percentage, 0 to 100) or NULL.
#' A column is only imputed when its percentage of missing observations is
#' greater than 0 and strictly less than this value.
#' If NULL (the default), no threshold is applied.
#' Not used when type is "mice".
#'
#' @param y_index A natural number specifying the column in the data frame to
#' be imputed. If NULL (the default), every column is considered.
#' Not used when type is "mice", where the whole dataset is passed to mice.
#'
#' @param type The type of imputation to be used.
#' Either "mean", "mode" or "mice".
#'
#' @param method The method of imputation to be used in conjunction with type
#' equal to "mice". It is passed on to \code{mice::mice}.
#' When not supplied, "cart" is used.
#'
#' @param file_name A character object indicating the file name when saving
#' the data frame.
#' The default is NULL.
#' The name must include the .csv suffix.
#' Required when directory is supplied.
#'
#' @param directory A character object specifying the directory where the data
#' frame is to be saved as a .csv file.
#' If NULL (the default), nothing is written.
#'
#' @return Outputs the imputed data as a data frame
#'
#' @examples
#' df <- data.frame(x = c(1, NA, 3), g = factor(c("a", "a", NA)))
#' impute_variables(df, type = "mean")
#' impute_variables(df, y_index = 2, type = "mode")
#'
#' @import mice
#'
#' @export
#'
#' @seealso \code{\link{remove_variables}}, \code{\link{derive_variables}}, \code{\link{extract_variables}}, \code{\link{standardise_variables}}, \code{\link{transform_variables}}
#'
impute_variables <- function(dataset,
                             percentage = NULL,
                             y_index = NULL,
                             type = c("mean", "mode", "mice"),
                             method = c("cart", "rf", "norm.predict", "norm.boot", "logreg", "logreg.boot"),
                             file_name = NULL,
                             directory = NULL) {

  # Convert the dataset to a data frame
  dataset <- as.data.frame(dataset)

  # Confirm correct choice for type
  type <- match.arg(type)

  # The default for method is the full vector of choices, which is not a
  # usable argument for mice(); resolve it to the first choice. A method
  # supplied by the caller is passed through untouched so that any value
  # accepted by mice() (including one method per column) keeps working.
  if (missing(method)) {
    method <- match.arg(method)
  }

  # Validate the threshold up front: comparing against NULL or NA would
  # otherwise yield a zero-length / NA condition inside if().
  if (!is.null(percentage)) {
    if (!is.numeric(percentage) || length(percentage) != 1L || is.na(percentage)) {
      stop("`percentage` must be NULL or a single non-missing number.",
           call. = FALSE)
    }
  }

  # Decide whether a column qualifies for imputation.
  # A column with no missing values has nothing to impute; a column with no
  # observed values has nothing to impute from.
  needs_imputation <- function(column) {
    n_missing <- sum(is.na(column))
    if (n_missing == 0L || n_missing == length(column)) {
      return(FALSE)
    }
    if (is.null(percentage)) {
      return(TRUE)
    }
    (n_missing * 100) / length(column) < percentage
  }

  # Replace the missing values of a numeric column with its mean.
  impute_mean <- function(column) {
    column[is.na(column)] <- mean(column, na.rm = TRUE)
    column
  }

  # Replace the missing values of a factor column with its most frequent
  # level. table() ignores NA, so the NA count can never be picked as the
  # mode; ties resolve to the first level, as which.max() returns the first
  # maximum.
  impute_mode <- function(column) {
    column[is.na(column)] <- names(which.max(table(column)))
    column
  }

  if (type == "mice") {

    #------------------------------------------------------------------#
    # If type is mice                                                  #
    #------------------------------------------------------------------#

    # mice imputes the dataset as a whole, regardless of y_index
    imputed <- mice::mice(data = dataset, method = method)
    dataset <- mice::complete(imputed)

  } else {

    #------------------------------------------------------------------#
    # If type is mean or mode                                          #
    #------------------------------------------------------------------#

    # Either every column (y_index is NULL) or only the requested one.
    # seq_len() keeps the loop empty for a zero-column data frame.
    targets <- if (is.null(y_index)) seq_len(ncol(dataset)) else y_index

    is_eligible <- if (type == "mean") is.numeric else is.factor
    impute <- if (type == "mean") impute_mean else impute_mode

    for (idx in targets) {
      column <- dataset[[idx]]
      if (is_eligible(column) && needs_imputation(column)) {
        dataset[[idx]] <- impute(column)
      }
    }

  }

  # Optionally save the imputed dataset as a .csv file
  if (!is.null(directory)) {
    if (is.null(file_name)) {
      stop("`file_name` must be supplied when `directory` is given.",
           call. = FALSE)
    }
    utils::write.csv(x = dataset,
                     file = file.path(directory, file_name),
                     row.names = FALSE)
  }

  # return the imputed dataset
  dataset

}
# oislen/BuenaVista documentation built on May 16, 2019, 8:12 p.m.