R/Conan.R

Defines functions Conan

Documented in Conan

#' Removes missing values (rows and column to obtain a large full matrix)
#' 
#' @description Removes missing values (rows and column alternatively) to obtain a large full matrix
#' 
#' @param X the dataset (matrix) with missing values
#' @param nbstep number of cutting steps (may remove several rows or columns at each step)
#' @param std (boolean) remove constant covariates
#' @param verbose (boolean) to print the result
#' @param coercing vector of the covariates to keep (names or index)
#' @param Xout (boolean) to export or not the reduced matrix (if not, indices are sufficient)
#' 
#' @return 
#' \item{individus_restants}{Index of remaining individuals}
#' \item{variables_restantes}{Index of remaining variables}
#' \item{X}{If \code{Xout=TRUE}, the reduced dataset without missing values}
#' 
#' @examples
#' data <- mtcars
#' datamiss = Terminator(target = data, wrath = 0.05) # 5% of missing values
#' datamiss
#' showdata(datamiss) # plot positions of the missing values
#' reduced = Conan(X = datamiss)
#' reduced
#'   
#' @export
Conan <- function(X = X, nbstep = Inf, std = FALSE, verbose = FALSE, coercing = NULL, Xout = TRUE) {
   steploc = 1
   nloc = nrow(X)
   ploc = ncol(X)
   n = nloc
   p = ploc
   individus_restants = 1:n
   variables_restantes = 1:p
   if (!is.null(coercing)) { # on a des variables a garder
      # on va purger les individus manquants sur ces variables (pas le choix)
      for (i in 1:length(coercing)) {
         if (is.numeric(coercing[i])) {
            j = coercing[i]
         } else {
            j = which(names(X) == coercing[i])[1] # si doublon on prend le premier
         }
         if (anyDuplicated(c(1:ploc, j)) > 0) { # si on ne tape pas a cote
            quimank = which(is.na(X[, j]))
            if (length(quimank) > 0) {
               X = X[-quimank, ]
               individus_restants = individus_restants[-quimank]
            }
         }
      }
      nloc = nrow(X)
      ploc = ncol(X)
   }
   # nbmank/prod(dim(basefull))
   if (nbstep == Inf) {
      nbstep = max(c(nloc, ploc))
   }
   if (verbose) {cat(dim(X))}
   if (std) { # nettoyage des constantes si demande
      sd_loc <- function(vect) {
         return(sd(vect[!is.na(vect)]))
      }
      quiconst = which(apply(X, 2, sd_loc) == 0)
      if (length(quiconst) > 0) {
         X = X[, -quiconst]
         variables_restantes = variables_restantes[-quiconst]
      }
   }
   if (verbose) {cat(dim(X))}
   quimank = which(is.na(X), arr.ind = TRUE)
   nbmank = dim(quimank)[1]
   M = Matrix::Matrix(0, nrow = nrow(X), ncol = ncol(X))
   M[quimank] = 1
   while (steploc <= nbstep & nbmank > 0 & nloc > 0 & ploc > 0) {
      if (verbose) {cat(steploc)}
      nbmank_var = colSums(M) / nloc # taux=nbmank colonne/ nombre de ligne
      nbmank_ind = rowSums(M) / ploc
      if (max(nbmank_var) > max(nbmank_ind)) { # on supprime variable
         quisuppr = which(nbmank_var == max(nbmank_var))
         X = X[, -quisuppr]
         M = M[, -quisuppr]
         variables_restantes = variables_restantes[-quisuppr]
      } else { # on supprime ligne
         if (verbose) {cat("lign")}
         quisuppr = which(nbmank_ind == max(nbmank_ind))
         if (length(quisuppr) > 0) { # si il y a quelque chose a supprimer
            X = X[-quisuppr, ]
            M = M[-quisuppr, ]
            individus_restants = individus_restants[-quisuppr]
            if (verbose) {cat(dim(X))}
            if (std) { # nettoyage des constantes si demande
               quiconst = which(apply(X, 2, sd_loc) == 0)
               if (length(quiconst) > 0) {
                  X = X[, -quiconst]
                  variables_restantes = variables_restantes[-quiconst]
               }
            }
         }
      }
      if (verbose) {cat(dim(X)) }
      nloc = nrow(X)
      ploc = ncol(X)
      quimank = which(is.na(X), arr.ind = TRUE)
      nbmank = dim(quimank)[1]
      steploc = steploc + 1
   }
   if (verbose) {
      cat(paste("I've killed ", n - nloc, "individuals and ", p - ploc, " covariates."))
   }
   if (Xout) {
      return(list(X = X, individus_restants = individus_restants, variables_restantes = variables_restantes))
   } else {
      return(list(individus_restants = individus_restants, variables_restantes = variables_restantes))
   }
}

Try the CorReg package in your browser

Any scripts or data that you put into this service are public.

CorReg documentation built on Feb. 20, 2020, 5:07 p.m.