R/na.cleaner.R
In analytics: Regression Outlier Detection, Stationary Bootstrap, Testing Weak Stationarity, NA Imputation, and Other Tools for Data Analysis

#' @title Missing Value Imputation
#' @description Missing value imputation based on different methods. Can handle continuous and
#' categorical variables.
#' @details Each of the available methods in this function may be the best choice for a particular
#' dataset, but since it is impossible to know which one it is in each particular case, Mode "all"
#' might be a good, robust choice.
#' For categorical variables, the only mode implemented is knn, so argument Mode really refers
#' only to the continuous variables.
#' @param dataset a matrix or data frame. May have continuous and/or categorical variables.
#' @param t1 the threshold value in interval 0-1 beyond which a record is deemed as having a high
#' \% of NAs. Default: 0.5.
#' @param t2 the threshold value in interval 0-1 beyond which a variable is deemed as having a
#' high \% of NAs. Default: 0.5.
#' @param auto If TRUE (the default), it will eliminate those records and/or variables deemed as
#' having a high \% of NAs. If FALSE, one handpicks which records/variables will be deleted.
#' @param maxDel1 the proportion in interval 0-1 of records that can at most be deleted.
#' Default: 0.2.
#' @param maxDel2 the proportion in interval 0-1 of variables that can at most be deleted.
#' Default: 0.3.
#' @param Mode a string specifying the imputation method to be used, among "mean" (default),
#' "median", "mean&lm", "median&lm", "knn".
#' @param neigh the neighbours to be used in knn, both for continuous and categorical variables.
#' Default: interval 3-7.
#' For each value in neigh, knn is run, and then in the case of continuous variables, the outcome
#' of those runs
#' are averaged out. In the case of categorical variables, the imputed value is the most common
#' imputed value across runs.
#' @return the original dataset with imputed missing values.
#'
#' @author Albert Dorador
#' @export
#' @importFrom VIM kNN
#' @seealso
#' \code{\link[VIM]{kNN}}
#' \code{\link[analytics]{rowmean}}
#' @examples
#'
#' mtcars_mod <- mtcars
#' set.seed(1)
#' mtcars_mod <- as.data.frame(lapply(mtcars_mod, function(cc) cc[ sample(c(TRUE, NA),
#' prob = c(0.6, 0.4), size = length(cc), replace = TRUE) ]))
#' rownames(mtcars_mod) <- rownames(mtcars)
#'
#' # Compare methods
#' kNN_dt <- na.cleaner(dataset = mtcars_mod, Mode = "kNN")
#' mean_lm_dt <- na.cleaner(dataset = mtcars_mod, Mode = "mean&lm")
#' median_dt <- na.cleaner(dataset = mtcars_mod, Mode = "median")
#' all_dt <- na.cleaner(dataset = mtcars_mod, Mode = "all")
#' dev_kNN <- norm(as.matrix(mtcars[-c(4,6,8,13,18,20), -6])-as.matrix(kNN_dt))
#' dev_m_ml <- norm(as.matrix(mtcars[-c(4,6,8,13,18,20), -6])-as.matrix(mean_lm_dt))
#' dev_md <- norm(as.matrix(mtcars[-c(4,6,8,13,18,20), -6])-as.matrix(median_dt))
#' dev_all <- norm(as.matrix(mtcars[-c(4,6,8,13,18,20), -6])-as.matrix(all_dt))
#'
#' iris_mod <- iris
#' set.seed(5)
#' iris_mod <- as.data.frame(lapply(iris_mod, function(cc) cc[ sample(c(TRUE, NA),
#' prob = c(0.6, 0.4), size = length(cc), replace = TRUE) ]))
#' rownames(iris_mod) <- rownames(iris)
#' na.cleaner(dataset = iris_mod, neigh = 1, Mode = "all")
#'

na.cleaner <- function(dataset, t1 = 0.5, t2 = 0.5, auto = TRUE, maxDel1 = 0.2, maxDel2 = 0.3, Mode = "mean", neigh = 3:7){
  nrows0 <- nrow(dataset)
  ncols0 <- ncol(dataset)
  rownames(dataset) <- 1:nrows0
  colnames(dataset) <- 1:ncols0
  bNAs <- is.na(dataset)
  if (!any(bNAs))
    stop("There are no missing values in the original dataset provided.")

  if (is.null(rownames(dataset))){
    brow_names <- FALSE
  } else {
    brow_names <- TRUE
    row_names <- rownames(dataset)
  }

  if (is.null(colnames(dataset))){
    bcol_names <- FALSE
  } else {
    bcol_names <- TRUE
    col_names <- colnames(dataset)
  }

  NAs_by_obs <- rowSums(bNAs) # 1 by m
  NAs_by_obs_prop <- rowMeans(bNAs)
  NAs_by_var <- colSums(bNAs) # 1 by n
  NAs_by_var_prop <- colMeans(bNAs)

  info_list <- list("NAs by record" = NAs_by_obs,
                    "NAs by record, proportion" = NAs_by_obs_prop,
                    "NAs by variable" = NAs_by_var,
                    "NAs by variable, proportion" = NAs_by_var_prop)
  print(info_list)

  quick_NAs_by_obs_prop <- which(NAs_by_obs_prop > t1)
  unnamed_quick_NAs_by_obs_prop <- unname(quick_NAs_by_obs_prop)
  quick_NAs_by_variable_prop <- which(NAs_by_var_prop > t2)
  unnamed_quick_NAs_by_variable_prop <- unname(quick_NAs_by_variable_prop)

  quick_info_list <- list("Index of records with high % of NAs" = unnamed_quick_NAs_by_obs_prop,
                          "Index of Variables with high % of NAs" = unnamed_quick_NAs_by_variable_prop)

  print(quick_info_list)

  new_dataset <- dataset

  # If auto enabled, get rid of any rows and/or columns up to maxDel1 and/or maxDel2
  if (auto) {

    sorted_NAs_by_obs <- sort(x = quick_NAs_by_obs_prop, decreasing = TRUE)
    sorted_NAs_by_variable <- sort(x = quick_NAs_by_variable_prop, decreasing = TRUE)
    num_maxDel1 <- floor(maxDel1*nrows0)
    num_maxDel2 <- floor(maxDel2*ncols0)
    num_high_NAs_obs <- length(quick_NAs_by_obs_prop)
    del_rows <- NULL
    if (num_high_NAs_obs > 0){
      num_to_delete_1 <- min(num_high_NAs_obs, num_maxDel1)
      del_rows <- as.integer(names(sorted_NAs_by_obs[1:num_to_delete_1]))
      new_dataset <- new_dataset[-del_rows,]
    }
    num_high_NAs_var <- length(quick_NAs_by_variable_prop)
    del_cols <- NULL
    if (num_high_NAs_var > 0){
      num_to_delete_2 <- min(num_high_NAs_var, num_maxDel2)
      del_cols <- as.integer(names(sorted_NAs_by_variable[1:num_to_delete_2]))
      new_dataset <- new_dataset[,-del_cols,drop = FALSE]
    }
  } else {
    # If manual, first we ask the user if she wants to get rid of any rows or columns

    prompt <- readline("Please provide comma-separatedly the records you want to delete - insert 0 if none \n")

    del_rows <- NULL
    if (prompt != "0"){
      del_rows <- as.integer(strsplit(prompt, ",")[[1]])
      new_dataset <- new_dataset[-del_rows,]
    }

    prompt <- readline("Please provide comma-separatedly the variables you want to delete - insert 0 if none \n")

    del_cols <- NULL
    if (prompt != "0"){
      del_cols <- as.integer(strsplit(prompt, ",")[[1]])
      new_dataset <- new_dataset[,-del_cols,drop = FALSE]
    }
  }

  # After possibly sanitizing the dataset, let's input missing values according to Mode
  # and only if there are any NAs (left) in the dataset
  if (anyNA(new_dataset)){

    cont_var_idx <- which(!sapply(new_dataset, is.factor))
    there_are_cont <- length(cont_var_idx > 0)
    categ_var_idx <- which(sapply(new_dataset, is.factor))
    there_are_categ <- length(categ_var_idx > 0)

    # A) Continuous variables first #
    # ----------------------------- #

    if (there_are_cont){
      cont.new_dataset <- new_dataset[, cont_var_idx, drop = FALSE]

      bNAs <- is.na(cont.new_dataset) #compute again because numrows and/or cols may have changed

      Mode <- tolower(Mode)
      ncols <- ncol(cont.new_dataset)

      if (Mode == "all"){
        ALL <- TRUE
        Mode <- "mean"
      } else {ALL <- FALSE}

      if (Mode == "mean"){
        var_means <- colMeans(cont.new_dataset, na.rm = TRUE)
        cont.new_dataset_final <- cont.new_dataset
        for (j in 1:ncols){
          jcolNAs <- bNAs[,j]
          cont.new_dataset_final[,j][jcolNAs] <- var_means[j]
        }
        if (ALL){Mode <- "median"; cont.new_dataset_1 <- cont.new_dataset_final}
      }

      if (Mode == "median"){
        var_medians <- apply(X = cont.new_dataset, MARGIN = 2, FUN = median, na.rm = TRUE)
        cont.new_dataset_final <- cont.new_dataset
        for (j in 1:ncols){
          jcolNAs <- bNAs[,j]
          cont.new_dataset_final[,j][jcolNAs] <- var_medians[j]
        }
        if (ALL){Mode <- "mean&lm"; cont.new_dataset_2 <- cont.new_dataset_final}
      }
      if (Mode == "mean&lm"){
        var_means <- colMeans(cont.new_dataset, na.rm = TRUE)
        cont.new_dataset_final <- cont.new_dataset
        for (j in 1:ncols){
          rng <- j:ncols
          for (jj in rng){
            if (jj < ncols){
              jjcolNAs <- bNAs[,jj+1]
              cont.new_dataset_final[,jj+1][jjcolNAs] <- var_means[jj+1]
            }
          }
          jcolNAs <- bNAs[,j]
          y <- cont.new_dataset_final[!jcolNAs,j] # we want to predict column j (excluding NAs)...
          X <- as.matrix(cont.new_dataset_final[!jcolNAs,-j]) # ... with the rest of columns
          model <- lm(y ~ X)
          Xpred <- cont.new_dataset_final[jcolNAs,-j] #the records that have an NA in column j
          Xpred <- cbind(rep(1,sum(jcolNAs)), Xpred) #attach 1s for Beta0
          ypred <- as.matrix(Xpred) %*% as.matrix(model$coefficients)
          cont.new_dataset_final[,j][jcolNAs] <- ypred
          print(paste0("Adjusted R^2 for variable ", j, ": ",summary(model)$adj.r.squared))
        }
        if (ALL){Mode <- "median&lm"; cont.new_dataset_3 <- cont.new_dataset_final}
      }
      if (Mode == "median&lm"){
        var_medians <- apply(X = cont.new_dataset, MARGIN = 2, FUN = median, na.rm = TRUE)
        cont.new_dataset_final <- cont.new_dataset
        for (j in 1:ncols){
          rng <- j:ncols
          for (jj in rng){
            if (jj < ncols){
              jjcolNAs <- bNAs[,jj+1]
              cont.new_dataset_final[,jj+1][jjcolNAs] <- var_medians[jj+1]
            }
          }
          jcolNAs <- bNAs[,j]
          y <- cont.new_dataset_final[!jcolNAs,j] # we want to predict column j (excluding NAs)...
          X <- as.matrix(cont.new_dataset_final[!jcolNAs,-j]) # ... with the rest of columns
          model <- lm(y ~ X)
          Xpred <- cont.new_dataset_final[jcolNAs,-j] #the records that have an NA in column j
          Xpred <- cbind(rep(1,sum(jcolNAs)), Xpred) #attach 1s for Beta0
          ypred <- as.matrix(Xpred) %*% as.matrix(model$coefficients)
          cont.new_dataset_final[,j][jcolNAs] <- ypred
          print(paste0("Adjusted R^2 for variable ", j, ": ",summary(model)$adj.r.squared))
        }
        if (ALL){Mode <- "knn"; cont.new_dataset_4 <- cont.new_dataset_final}
      }
      if (Mode == "knn"){
        cont.new_dataset_final <- cont.new_dataset

        # if more than 1 run of kNN
        if (length(neigh) > 1){
          datalist <- list()
          counter <- 0
          for (i in neigh) {
            counter <- counter + 1
            datalist[[counter]] <- kNN(data = cont.new_dataset_final, k = i, imp_var = FALSE)
          }
          cont.Tot_kNN_dataset <- do.call(cbind, datalist)

          group <- rep(1:ncols, length(neigh))

          # Average across kNN repetitions using a different #neigh
          cont.new_dataset_final <- t(rowmean(t(cont.Tot_kNN_dataset), group = group))

        } else {
          cont.new_dataset_final <- kNN(data = cont.new_dataset_final, k = neigh, imp_var = FALSE)
        }

        if (length(del_rows) > 0){
          rownames(cont.new_dataset_final) <- (1:nrows0)[-del_rows]
        } else {
          rownames(cont.new_dataset_final) <- 1:nrows0
        }
        if (length(del_cols) > 0){
          colnames(cont.new_dataset_final) <- (1:ncols0)[-del_cols]
        } else {
          colnames(cont.new_dataset_final) <- 1:ncols
        }
        if (ALL){cont.new_dataset_5 <- cont.new_dataset_final}
      }

      if (ALL){
        cont.Total_new_dataset <- rbind(as.matrix(cont.new_dataset_1),
                                        as.matrix(cont.new_dataset_2),
                                        as.matrix(cont.new_dataset_3),
                                        as.matrix(cont.new_dataset_4),
                                        as.matrix(cont.new_dataset_5))
        cont.new_dataset_final <- rowmean(cont.Total_new_dataset)
      }
    }

    # B) Categorical variables next #
    # ----------------------------- #

    if (there_are_categ){
      if (there_are_cont) {
        cat.new_dataset <- new_dataset
        cat.new_dataset[,cont_var_idx] <- cont.new_dataset_final #We take all vars and AFTER imputing NAs
        colnames(cat.new_dataset) <- colnames(new_dataset)
      } else {
        cat.new_dataset <- new_dataset
      }

      # if more than 1 run of knn
      if (length(neigh) > 1){
        a.mode <- function(x) {
          ux <- unique(x)
          ux[which.max(tabulate(match(x, ux)))]
        }

        datalist <- list()

        counter <- 0
        for (i in neigh) {
          counter <- counter + 1
          datalist[[counter]] <- kNN(data = cat.new_dataset, variable = colnames(cat.new_dataset)[categ_var_idx], k = i, imp_var = FALSE)
        }
        cat.Tot_kNN_dataset <- do.call(cbind, datalist)

        categ_var_idx_2 <- which(sapply(cat.Tot_kNN_dataset, is.factor))
        only_cat_vars <- cat.Tot_kNN_dataset[,categ_var_idx_2,drop = FALSE]
        cat.ncols <- length(categ_var_idx)
        group <- rep(1:cat.ncols, length(neigh))

        # Find mode by variable, across kNN repetitions using a different #neigh
        cat.new_dataset_final <- t(aggregate(t(only_cat_vars), list(group), a.mode))
        cat.new_dataset_final <- as.data.frame(cat.new_dataset_final[-1,]) #remove 1st row 'cause is just title

      } else {
        cat.new_dataset_final <- kNN(data = cat.new_dataset, variable = colnames(cat.new_dataset)[categ_var_idx], k = neigh, imp_var = FALSE)
        categ_var_idx_2 <- which(sapply(cat.new_dataset_final, is.factor))
        cat.new_dataset_final <- cat.new_dataset_final[,categ_var_idx_2,drop = FALSE]
      }

    }

    if (there_are_cont & there_are_categ){

      # Join the continuous and categorical parts of the dataset in original order
      new_dataset_final <- cbind(cont.new_dataset_final, cat.new_dataset_final)
      new_dataset_final <- new_dataset_final[, c(cont_var_idx, categ_var_idx)]
    } else if (there_are_cont){
      new_dataset_final <- cont.new_dataset_final
    } else if (there_are_categ){
      new_dataset_final <- cat.new_dataset_final
    }

    new_dataset <- new_dataset_final # to make it compatible with a df without NAs

  }

  if (brow_names){
    if (length(del_rows) > 0){
      rownames(new_dataset) <- row_names[-del_rows]
    } else {
      rownames(new_dataset) <- row_names
    }

  } else {
    rownames(new_dataset) <- NULL
  }

  if (bcol_names){
    if (length(del_cols) > 0){
      colnames(new_dataset) <- col_names[-del_cols]
    } else {
      colnames(new_dataset) <- col_names
    }

  } else {
    colnames(new_dataset) <- NULL
  }

  return(new_dataset)
}

Any scripts or data that you put into this service are public.

analytics documentation built on May 2, 2019, 3:37 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

analytics
Regression Outlier Detection, Stationary Bootstrap, Testing Weak Stationarity, NA Imputation, and Other Tools for Data Analysis

R/na.cleaner.R
In analytics: Regression Outlier Detection, Stationary Bootstrap, Testing Weak Stationarity, NA Imputation, and Other Tools for Data Analysis

Try the analytics package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

analytics Regression Outlier Detection, Stationary Bootstrap, Testing Weak Stationarity, NA Imputation, and Other Tools for Data Analysis

R/na.cleaner.R In analytics: Regression Outlier Detection, Stationary Bootstrap, Testing Weak Stationarity, NA Imputation, and Other Tools for Data Analysis

Try the analytics package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

analytics
Regression Outlier Detection, Stationary Bootstrap, Testing Weak Stationarity, NA Imputation, and Other Tools for Data Analysis

R/na.cleaner.R
In analytics: Regression Outlier Detection, Stationary Bootstrap, Testing Weak Stationarity, NA Imputation, and Other Tools for Data Analysis