R/imputeRobustChain.R

Defines functions useRobustNumeric useRobustMN imp imputeRobustChain

Documented in imputeRobustChain

#' @title FUNCTION_TITLE
#' @description FUNCTION_DESCRIPTION
#' @param formulas PARAM_DESCRIPTION, Default: vector("list", ncol(data))
#' @param data PARAM_DESCRIPTION
#' @param boot PARAM_DESCRIPTION, Default: TRUE
#' @param robustboot PARAM_DESCRIPTION, Default: TRUE
#' @param method PARAM_DESCRIPTION, Default: 'lts'
#' @param multinom.method PARAM_DESCRIPTION, Default: 'multinom'
#' @param takeAll PARAM_DESCRIPTION, Default: TRUE
#' @param eps PARAM_DESCRIPTION, Default: 0.5
#' @param maxit PARAM_DESCRIPTION, Default: 4
#' @param alpha PARAM_DESCRIPTION, Default: 0.5
#' @param uncert PARAM_DESCRIPTION, Default: 'pmm'
#' @param familiy PARAM_DESCRIPTION, Default: 'Gaussian'
#' @param value_back PARAM_DESCRIPTION, Default: 'matrix'
#' @param trace PARAM_DESCRIPTION, Default: FALSE
#' @return OUTPUT_DESCRIPTION
#' @details DETAILS
#' @examples 
#' \dontrun{
#' if(interactive()){
#'  #EXAMPLE1
#'  }
#' }
#' @seealso 
#'  \code{\link[VIM]{initialise}}
#'  \code{\link[robCompositions]{outCoDa}}
#' @rdname imputeRobustChain
#' @export 
#' @importFrom VIM initialise
imputeRobustChain <- function(formulas = vector("list", ncol(data)), 
                              data, 
                              boot = TRUE,
                              robustboot = TRUE,
                              method = "lts",
                              multinom.method = "multinom",
                              takeAll = TRUE,
                              eps = 0.5,
                              maxit = 4,
                              alpha = 0.5, 
                              uncert = "pmm", 
                              familiy = "Gaussian", 
                              value_back = "matrix", 
                              trace = FALSE){
  supportedMethods <- c("lts", "lm", "gam", "MM")
  #mf <- model.frame(form, data = data)
  #y <- model.extract(mf, "response")
  #x_vars <- all.vars(form[-1]) 
  #y_var <- all.vars(form[1])
  rn <- rownames(data)
  n <- nrow(data)
  p <- ncol(data)
  #ynam <- as.character(form[2])
  #my <- is.na(data[, ynam])
  M <- is.na(data)
  m <- 1 # currently not further considered
  
  class1 <- function(x) class(x)[1]
  types <- lapply(data, class1)
  
  attributes(types)$names <- NULL
  types <- unlist(types)
  if (any(types == "character")) {
    chr_ind <- which(types == "character")
    warning("At least one character variable is converted into a factor")
    for (ind in chr_ind) {
      data[, ind] <- as.factor(data[, ind])
      types[ind] <- "factor"
    }
  }
  
  #determine factor type: dichotomous or polytomous
  #detect problematic factors
  ind_fac <- which(types == "factor")
  for (ind in ind_fac) {
    #get number of levels
    fac_nlevels <- nlevels(data[[ind]])
    if (fac_nlevels < 2)
      stop(sprintf("factor with less than 2 levels detected! - `%s`", names(data)[ind]))
    types[ind] = ifelse(fac_nlevels == 2, "binary", "nominal")
  }
  ind_ord <- which(types == "ordered")
  for (ind in ind_ord) {
    #get number of levels
    fac_nlevels <- nlevels(data[[ind]])
    if (fac_nlevels == 2)
      types[ind] <- "binary"
  }
  
  vars_miss <- apply(data, 2, function(x) any(is.na(x)))
  
  # initialize if takeAll
  if(takeAll){
    data <- VIM::initialise(data, mixed = NULL)
  }
  if(length(ind_fac) > 0){
    outCoDa <- function(x, quantile=0.975, method="robust", 
                        alpha = 0.5, coda = TRUE){
      ## Funktion from the same author from package robCompositions
      if(dim(x)[2] < 2) stop("need data with at least 2 variables")
      if(alpha < 0.5 | alpha > 1) stop("allowed values for h are between 0.5 and 1")
      covEst <- function(x, type) {
        standard <- function(x){
          list(mean=colMeans(x, na.rm=TRUE), varmat=cov(x))  
        }
        robust <- function(x){
          v <- robustbase::covMcd(x, alpha = alpha)
          list(mean=v$center, varmat=v$cov)
        }
        robustHD <- function(x){
          v <- robustbase::covOGK(x, sigmamu = robustbase::s_Qn)
          list(mean=v$center, varmat=v$cov)
        }
        switch(type,
               standard = standard(x),
               robust = robust(x),
               robustHD = robustHD(x))
      }
      if(!is.logical(coda) & !is.function(coda)){
        stop("coda must be logical or function")
      }
      if(!is.logical(coda)){
        x <- coda(x)
      }	else if (coda){ 
        x <- pivotCoord(x)
      }  
      cv <- covEst(x, "robust")
      cvc <- covEst(x, "standard")
      dM <- sqrt(mahalanobis(x, center=cv$mean, cov=cv$varmat))
      dMc <- sqrt(mahalanobis(x, center=cvc$mean, cov=cvc$varmat))
      #	limit <- sqrt(qchisq(p=quantile, df=ncol(x)-1))
      limit <- sqrt(qchisq(p=quantile, df=ncol(x)))	
      res <- list(mahalDist = dM, limit = limit, 
                  outlierIndex = dM > limit, method=method, 
                  om2 = dMc > limit,
                  m2=dMc, coda = coda)
      class(res) <- "outCoDa"
      invisible(res)
    }
    out_index <- outCoDa(data[, types %in% c("integer", "numeric", "count")])$outlierIndex
  } else{
    out_index <- NULL
  }
  
  ## outer loop
  criteria <- 999999
  iterations <- 0
  while(criteria > eps & iterations < maxit){
    iterations <- iterations + 1
    if (trace) {
      if (Sys.info()[1] == "Windows") flush.console()
      message("----------------------")
      message(paste("start of iteration =", iterations))
    }
    data_previous <- data
    ## inner loop
    for(i in (1:p)[vars_miss]){
      if (trace) {
        message(paste("impute variable:", i, "(", colnames(data)[i], ")"))
        if (Sys.info()[1] == "Windows") flush.console()
      }
      
      meth <- switch(
        types[i],
        integer = "numeric",
        numeric = "numeric",
        # mixed = "numeric",
        binary = "bin",
        nominal = "factor",
        count = "count",
        ordered = "ordered",
        logical = "bin",
        stop("unsupported variable type for column ", i)
      )
      
      ## replace initialised missings:
      if (is.null(formulas[[i]])) {
        xvars <- colnames(data)[-i]
        formulas[[i]] <- formula(paste0(colnames(data)[i], "~", 
                                        paste(xvars, collapse = "+")))
      }
      if (trace) {
        message(paste("formula:", format(formulas[[i]])))
        if (Sys.info()[1] == "Windows") flush.console()
      }
      if (trace){
        if(meth == "numeric"){
          message(paste(meth, method))           
        } else{
          message(meth)
        }
      }
      data[M[, i], i] <- imp(
        form = formulas[[i]], 
        data, 
        type = types[i],
        method = method,
        multinom.method = multinom.method,
        index = M[, i], #mixed_tf = mixed_tf, mixed_constant = mixed_constant,
        factors = factors, #step = step, robust = robust, noise = FALSE,
        boot = boot,
        robustboot = robustboot,
        uncert = uncert,
        outIndex = out_index
        #force = force, robMethod, form = active_formula,
        #multinom.method = multinom.method
      )
      #if(!testdigits(x$x5)) stop()
      
    }
    if (any(types %in% c("numeric", "mixed")))
      d <- sum( (data_previous[, types %in% "numeric"] -
                   data[, types %in% "numeric"]) ^ 2,
                na.rm = TRUE) 
    if (any(!types %in% c("numeric")))
      d <- d + 
      sum(data_previous[, !types %in% c("numeric")] != 
            data[, !types %in% c("numeric")])
  }
  if (trace) {
    if (Sys.info()[1] == "Windows") flush.console()
    message(paste("final value of criteria =", d))
    message("----------------------")
  }
  return(data)
}

## switch function to automatically select methods
imp <- function(form, data, type, method, multinom.method, index, factors, boot, robustboot, uncert, outIndex) {
  switch(
    type,
    numeric = useRobustNumeric(form, data, method, index, factors, boot, robustboot, uncert),
    nominal =   useRobustMN(form, data, index, factors, boot, robustboot, uncert, 
                            multinom.method = multinom.method, outIndex),
    bin = useLogistic(x_reg, ndata, index, factors, step, robust, form = form),
    count   = useGLMpoisson(x_reg, ndata, index, factors, step, robust,
                            form = form)#,
    #ordered  = useOrd(x_reg, ndata, index, factors, step, robust, form = form),
  )
}


# categorical response
useRobustMN <- function(form, data, index, factors, boot, robustboot, uncert, multinom.method, outIndex){
  
  n <- nrow(data)
  mf <- model.frame(form, data = data)
  y <- model.extract(mf, "response")
  x_vars <- all.vars(form[-1]) 
  y_var <- all.vars(form[1])
  ynam <- as.character(form[2])
  
  # factors <- Inter(list(colnames(x_reg), factors))
  # if (length(factors) > 0) {
  #   for (f in 1:length(factors)) {
  #     if (any(summary(x_reg[, factors[f]]) == 0)) {
  #       x_reg <- x_reg[, -which(colnames(x_reg) == factors[f])]
  #       ndata <- ndata[, -which(colnames(ndata) == factors[f])]
  #     }
  #   }
  # }
  # form <- form[form %in% names(x_reg)]
  if (multinom.method == "multinom") {
    # browser()
    co <- capture.output(multimod <- nnet::multinom(
      form, data = data, summ = 2, maxit = 50, trace = FALSE, MaxNWts = 50000))
    if(boot){
      if(robustboot){
        boot_idx <- sample(x = which(outIndex), size = n, replace = TRUE)
      } else {
        boot_idx <- sample(x = 1:n, size = n, replace = TRUE)       
      }
      boot_dat <- data[boot_idx, ]
      mod <- lm(form, data = boot_dat)
    } else {
      mod <- lm(form, data = data[mod$best, ])  
    }
    # impnew <- predict(multimod, newdata = data[index, ])
    prob_predictions <- predict(multimod, newdata = data[index, ], type = "probs")
    impnew <- apply(prob_predictions, 1, function(x) sample(names(x), 1, prob = x))
  } else if(multinom.method == "enetLTS"){
    stop("enetLTS not implemented yet.")
    multimod <- enetLTS::enetLTS(xx = mf[, 2:ncol(mf)], yy = mf[, 1])
  } else if(multinom.method == "robmixglm"){
    stop("robmixglm not implemented yet")
    multimod <- robmixglm::robmixglm(form, data = data, family = "poisson")
  }
  return(impnew)
}


useRobustNumeric <- function(form, data, method, index, factors, boot, robustboot, uncert) 
  #            mixed_tf, mixed_constant, factors, step,
  # robust, noise, noise.factor, force, robMethod, form) 
{
  n <- nrow(data)
  mf <- model.frame(form, data = data)
  y <- model.extract(mf, "response")
  x_vars <- all.vars(form[-1]) 
  y_var <- all.vars(form[1])
  ynam <- as.character(form[2])
  
  if(method == "lts"){
    mod <- ltsReg(form, data = data)
    if(boot){
      if(robustboot){
        boot_idx <- sample(x = mod$best, size = n, replace = TRUE)
      } else {
        boot_idx <- sample(x = 1:n, size = n, replace = TRUE)       
      }
      boot_dat <- data[boot_idx, ]
      mod <- lm(form, data = boot_dat)
    } else {
      mod <- lm(form, data = data[mod$best, ])  
    }
    sdev <- sd(resid(mod))
  } else if(method == "lm"){
    mod <- lm(form, data = data)
    if(boot){
      if(robustboot){
        idx <- which(resid(mod) < quantile(resid(mod), alpha))
        boot_idx <- sample(x = idx, size = n, replace = TRUE)
      } else{
        boot_idx <- sample(x = 1:n, size = n, replace = TRUE)       
      }
      boot_dat <- data[boot_idx, ]
      mod <- lm(form, data = boot_dat)
    }
    sdev <- sd(resid(mod))
  } else if(method == "MM"){
    mod <- robustbase::lmrob(form, data = data) 
    if(boot){
      if(robustboot){
        boot_idx <- sample(1:n, size = n, replace = TRUE, prob = mod$rweights)       
      } else {
        boot_idx <- sample(1:n, size = n, replace = TRUE)           
      }
      boot_dat <- data[boot_idx, ]
      mod <- robustbase::lmrob(form, data = boot_dat)     
    }
    sdev <- summary(mod)$scale
  } else if(method == "gam"){
    if (requireNamespace("mgcv", quietly = TRUE)) {
      mgcv::gam(form, data = data)
    } else {
      stop("Package 'mgcv' is required for this function.")
    }
    if(boot){
      idx <- which(resid(mod) < quantile(resid(mod), 0.5))
      if(robustboot){
        boot_idx <- sample(x = idx, size = n, replace = TRUE)       
      } else{
        boot_idx <- sample(x = 1:n, size = n, replace = TRUE)         
      }
      boot_dat <- data[boot_idx, ]
      if (requireNamespace("mgcv", quietly = TRUE)) {
        mgcv::gam(form, data = boot_dat)  
      } else {
        stop("Package 'mgcv' is required for this function.")
      }
    }
    sdev <- summary(mod)$scale
  } else {
    cat("----\n")
    stop(paste("Method not supported, \n   choose one of", paste(supportedMethods, collapse = ", "), "\n----\n"))
  }
  
  
  # beta <- coef(mod)
  # V <- vcov(mod)
  # Cv <- chol(V)
  # 
  # nus <- rnorm(m * length(beta))
  # beta_sims <- beta + 
  #   t(Cv) %*% 
  #   matrix(nus, nrow = length(beta), ncol = m)
  # # covar_sim <- predict(mod, newdata = boot_dat, type = "lpmatrix")
  pred <-  predict(mod, newdata = data)
  # linpred_sim <- covar_sim %*% beta_sims
  # invlink <- function(x) x
  # exp_val_sim <- invlink(linpred_sim)
  if(uncert == "normalerror"){
    ymiss <- pred[index] + rnorm(n = sum(index), 
                                 mean = 0,#exp_val_sim, 
                                 sd = sdev)
  }
  if(uncert == "resid"){
    ymiss <- pred[index] + sample(resid(mod), size = sum(index))   
  }
  if(uncert == "wresid"){
    ymiss <- rep(NA, sum(index))
    cnt <- 0
    for(i in (1:n)[index]){
      cnt <- cnt + 1
      # d <- pdist::pdist(as.matrix(data[i, x_vars, drop = FALSE]), 
      #                   as.matrix(data[, x_vars, drop = FALSE]))
      mod_matrix <- as.matrix(model.matrix(mf, data = data))
      my_distance_function <- function(x, y) {
        if (!requireNamespace("pdist", quietly = TRUE)) {
          stop("Package 'pdist' must be installed to use this function.")
        }
        d <- pdist::pdist(mod_matrix[i, ,drop = FALSE], 
                                 mod_matrix) 
      }
      d <- 1 - d@dist / max(d@dist)
      d <- sqrt(d)
      r <- resid(mod)
      p <- pred[i]
      ymiss[cnt] <- pred[i] + 
        sample(resid(mod), 
               size = 1, prob = d)     
    }
  }
  if(uncert == "pmm"){
    selectDonor <- function(y, val){
      donors <- numeric(length(val))
      for(i in 1:length(val)){
        # donors[i] <- sample(y[sort(abs(val - y), index.return  = TRUE)$ix 
        #                       %in% 1:5], 1)
        #donors[i] <- sample(y[sort(dist(c(val[i], y))[1:sum(index)], index.return  = TRUE)$ix 
        #                      %in% 1:5], 1)
        #donors[i] <- sample(y[sort(pdist::pdist(matrix(val[i], ncol = 1), matrix(y, ncol = 1))@dist,
        #                          index.return  = TRUE)$ix %in% 1:5], 1)
        my_distance_function <- function(x, y) {
          if (!requireNamespace("pdist", quietly = TRUE)) {
            stop("Package 'pdist' must be installed to use this function.")
          }
          donors[i] <- sample(y[order(pdist::pdist(matrix(val[i], ncol = 1), matrix(y, ncol = 1))@dist)[1:5]], 1)        
          
        }

      }
      donors 
    }
    residuals <- selectDonor(na.omit(y), val = pred[index]) - pred[index] 
    ymiss <- pred[index] + residuals  
    
  }
  value_back <- "ymiss" # currently the only option
  
  # backtransform to original scale
  trans <- getTransformation(ynam)
  yname_orig <- getOrigName(ynam)
  data[index, yname_orig] <- ymiss
  ymiss <- backtransform(ynam, ymiss) 
  
  if(value_back == "ymiss"){
    return(ymiss)
  } else{
    rownames(data) <- rn
    return(data) 
  }
}

Try the VIM package in your browser

Any scripts or data that you put into this service are public.

VIM documentation built on Jan. 10, 2026, 9:13 a.m.