R/VIM_kNN.R

Defines functions kNN check_data probCat dist_single lengthL

lengthL <- function(x) {
  if (is.list(x)) {
    return(sapply(x, length))
  } else {
    return(length(x))
  }
}

dist_single <- function(don_dist_var, imp_dist_var, numericalX, factorsX,
                        ordersX, mixedX, levOrdersX, don_index, imp_index,
                        weightsx, k, mixed.constant, provideMins = TRUE) {
  # gd <- distance(don_dist_var,imp_dist_var,weights=weightsx)
  if (is.null(mixed.constant)) {
    mixed.constant <- rep(0, length(mixedX))
  }

  if (provideMins) {
    gd <- VIM::gowerD(don_dist_var, imp_dist_var,
      weights = weightsx, numericalX,
      factorsX, ordersX, mixedX, levOrdersX, mixed.constant = mixed.constant,
      returnIndex = TRUE,
      nMin = as.integer(k), returnMin = TRUE
    )
    colnames(gd$mins) <- imp_index
    erg2 <- as.matrix(gd$mins)
  } else {
    gd <- VIM::gowerD(don_dist_var, imp_dist_var,
      weights = weightsx, numericalX,
      factorsX, ordersX, mixedX, levOrdersX, mixed.constant = mixed.constant,
      returnIndex = TRUE,
      nMin = as.integer(k)
    )
    erg2 <- NA
  }
  colnames(gd$ind) <- imp_index
  gd$ind[, ] <- don_index[gd$ind]
  erg <- as.matrix(gd$ind)

  if (k == 1) {
    erg <- t(erg)
    erg2 <- t(erg2)
  }
  list(erg, erg2)
}

probCat <- function(x, col_names, weights = NULL) {
  # return the probability for each category
  df <- data.frame(matrix(rep(0, length(col_names)), nrow = 1))
  colnames(df) <- col_names
  is_logical <- is.logical(x)
  if (!is.factor(x)) {
    x <- as.factor(x)
  }
  s <- summary(x)
  if (!is.null(weights)) {
    tmpTab <- merge(
      stats::aggregate(weights, list(x), sum),
      data.frame("Group.1" = names(s), prob = s)
    )
    s <- tmpTab$prob * tmpTab$x
    names(s) <- tmpTab$Group.1
  }
  df[names(s)] <- s / sum(s)
  return(df)
}

check_data <- function(data) {
  if (inherits(data, "survey.design")) {
    stop(
      "support for survey.design objects was removed in VIM",
      " 6.0.0. Use data.frame or data.table instead"
    )
  }
}

#' kNN
#' @description
#' k-Nearest Neighbour Imputation based on a variation of the Gower Distance
#' for numerical, categorical, ordered and semi-continous variables.
#' The original function is kNN in package VIM by Alexander Kowarik and
#' Statistik Austria. Here only the difference will be explained. In \code{kNN},
#' not only the imputed result will be returned, but also the disjunctive
#' imputed result. For each observation of one categorical column, the value of
#' the Nearest Neighbors will be recorded, and with this values, the probability
#' vector for each category is constructed.
#' @param data data.frame or matrix
#' @param variable variables where missing values should be imputed
#' @param metric metric to be used for calculating the distances between
#' @param k number of Nearest Neighbours used
#' @param dist_var names or variables to be used for distance calculation
#' @param weights weights for the variables for distance calculation.
#' If `weights = "auto"` weights will be selected based on variable importance
#' from random forest regression, using function [ranger::ranger()].
#' Weights are calculated for each variable seperately.
#' @param numFun function for aggregating the k Nearest Neighbours in the case
#' of a numerical variable
#' @param catFun function for aggregating the k Nearest Neighbours in the case
#' of a categorical variable
#' @param makeNA list of length equal to the number of variables, with values,
#' that should be converted to NA for each variable
#' @param NAcond list of length equal to the number of variables, with a
#' condition for imputing a NA
#' @param impNA TRUE/FALSE whether NA should be imputed
#' @param donorcond condition for the donors e.g. list(">5"), must be NULL or
#' a list of same length as variable
#' @param trace TRUE/FALSE if additional information about the imputation
#' process should be printed
#' @param imp_var TRUE/FALSE if a TRUE/FALSE variables for each imputed
#' variable should be created show the imputation status
#' @param imp_suffix suffix for the TRUE/FALSE variables showing the imputation
#' status
#' @param addRF TRUE/FALSE each variable will be modelled using random forest
#' regression ([ranger::ranger()]) and used as additional distance variable.
#' @param onlyRF TRUE/FALSE if TRUE only additional distance variables created
#' from random forest regression will be used as distance variables.
#' @param addRandom TRUE/FALSE if an additional random variable should be added
#' for distance calculation
#' @param mixed names of mixed variables
#' @param mixed.constant vector with length equal to the number of
#' semi-continuous variables specifying the point of the semi-continuous
#' distribution with non-zero probability
#' @param useImputedDist TRUE/FALSE if an imputed value should be used for
#' distance calculation for imputing another variable.
#' Be aware that this results in a dependency on the ordering of the variables.
#' @param weightDist TRUE/FALSE if the distances of the k nearest neighbors
#' should be used as weights in the
#' aggregation step
#' @param col_cat Column indices for categorical columns
#' @return \code{ximp} The imputed data set.
#' @return \code{ximp.disj} The imputed data set with categorical columns in
#' one-hot form.
#' @return \code{R.mask} The indicator matrix of missingness/imputation.
#' @references A. Kowarik, M. Templ (2016) Imputation with
#' R package VIM.  *Journal of
#' Statistical Software*, 74(7), 1-16.
#' @export
#' @importFrom data.table `:=` .SD
kNN <- function(data, variable = colnames(data), metric = NULL, k = 5,
                dist_var = colnames(data), weights = NULL,
                numFun = stats::median, catFun = VIM::maxCat, makeNA = NULL,
                NAcond = NULL, impNA = TRUE, donorcond = NULL, mixed = vector(),
                mixed.constant = NULL, trace = FALSE, imp_var = TRUE,
                imp_suffix = "imp", addRF = FALSE, onlyRF = FALSE,
                addRandom = FALSE, useImputedDist = TRUE, weightDist = FALSE,
                col_cat = c()) {
  check_data(data)
  is_ranger_package_installed()
  is_VIM_package_installed()
  is_data.table_package_installed()
  data_df <- !data.table::is.data.table(data)
  ## Add:
  exist_cat <- !all(c(0, col_cat) == c(0))
  if (exist_cat) {
    name_cat <- colnames(data)[col_cat]
    # Deal with the problem that nlevels(df[[col]]) > length(unique(df[[col]]))
    for (col in name_cat) {
      data[[col]] <- factor(as.character(data[[col]]))
    }
    # remember the levels for each categorical column
    dict_lev <- dict_level(data, col_cat)
    # preserve colnames for ximp.disj
    dummy <- dummyVars(" ~ .", data = data, sep = "_")
    col_names.disj <- colnames(data.frame(predict(dummy, newdata = data)))
    # represent the factor columns with their ordinal levels
    data <- factor_ordinal_encode(data, col_cat)
    # Create dict_cat with categroical columns
    dict_cat <- dict_onehot(data, col_cat)
  }

  ## Add: data.disj
  dummy <- dummyVars(" ~ .", data = data, sep = "_")
  data.disj <- data.frame(predict(dummy, newdata = data))
  # check for colnames before forcing variable
  if (is.null(colnames(data))) {
    colnames(data) <- colnames(data, do.NULL = FALSE)
  }
  force(variable)
  force(dist_var)
  if (data_df) {
    data <- data.table::as.data.table(data)
  } else {
    data <- data.table::copy(data)
  }
  # basic checks
  if (!is.null(mixed.constant)) {
    if (length(mixed.constant) != length(mixed)) {
      stop("length 'mixed.constant' and length 'mixed' differs")
    }
  }
  startTime <- Sys.time()
  nvar <- length(variable)
  ndat <- nrow(data)
  # impNA==FALSE -> NAs should remain NAs (Routing NAs!?)
  indexNAs <- is.na(data)
  if (!is.null(donorcond)) {
    if (length(donorcond) != nvar) {
      stop("The list 'donorcond' must have the same length as the
           'variable' vector")
    }
  }
  if (!is.null(makeNA)) {
    if (length(makeNA) != nvar) {
      stop("The vector 'variable' must have the same length as the
           'makeNA' list")
    } else {
      for (i in 1:nvar) {
        data[data[, sapply(.SD, function(x) x %in% makeNA[[i]])[, 1],
          .SDcols = variable[i]
        ], variable[i] := NA] # ,with=FALSE]
      }
    }
    if (!impNA) {
      indexNA2s <- is.na(data) & !indexNAs
    } else {
      indexNA2s <- is.na(data)
    }
  } else {
    indexNA2s <- is.na(data)
  }
  if (sum(indexNA2s) <= 0) {
    warning("Nothing to impute, because no NA are present
            (also after using makeNA)")
    invisible(data)
  }
  if (imp_var) {
    imp_vars <- paste(variable, "_", imp_suffix, sep = "")
    index_imp_vars <- which(!imp_vars %in% colnames(data))
    index_imp_vars2 <- which(imp_vars %in% colnames(data))
    if (length(index_imp_vars) > 0) {
      data[, imp_vars[index_imp_vars] := FALSE] # ,with=FALSE]
      for (i in index_imp_vars) {
        data[indexNA2s[, variable[i]], imp_vars[i] := TRUE]
        # if(!any(indexNA2s[,variable[i]]))
        # data<-data[,-which(names(data)==paste(variable[i],"_",
        # imp_suffix,sep=""))]
      }
    }
    if (length(index_imp_vars2) > 0) {
      warning(paste(
        "The following TRUE/FALSE imputation status variables will be updated:",
        paste(imp_vars[index_imp_vars2], collapse = " , ")
      ))
      for (i in index_imp_vars2) {
        data[, imp_vars[i] := as.logical(data[, imp_vars[i]])]
      } # ,with=FALSE]
    }
  }
  for (v in variable) {
    if (data[, sapply(.SD, function(x) all(is.na(x))), .SDcols = v]) {
      warning(paste("All observations of", v, "are missing, therefore the
                    variable will not be imputed!\n"))
      variable <- variable[variable != v]
    }
  }
  if (length(variable) == 0) {
    warning(paste("Nothing is imputed, because all variables to be imputed
                  only contains missings."))
    if (data_df) {
      data <- as.data.frame(data)
    }
    return(data)
  }
  orders <- data[, sapply(.SD, is.ordered)]
  orders <- colnames(data)[orders]
  levOrders <- vector()
  if (length(orders) > 0) {
    levOrders <- data[, sapply(.SD, function(x) length(levels(x))),
      .SDcols = orders
    ]
  }
  factors <- data[, sapply(.SD, function(x) {
    is.factor(x) | is.character(x) | is.logical(x)
  })]
  factors <- colnames(data)[factors]
  factors <- factors[!factors %in% orders]

  numerical <- data[, sapply(.SD, function(x) is.numeric(x) | is.integer(x))]
  numerical <- colnames(data)[numerical]
  numerical <- numerical[!numerical %in% mixed]
  if (trace) {
    message("Detected as categorical variable:\n")
    message(paste(factors, collapse = ","))
    message("Detected as ordinal variable:\n")
    message(paste(orders, collapse = ","))
    message("Detected as numerical variable:\n")
    message(paste(numerical, collapse = ","))
  }

  ### Make an index for selecting donors
  INDEX <- 1:ndat
  ## START DISTANCE IMPUTATION
  ## if(is.null(metric))
  ##   metric <- c("euclidean", "manhattan", "gower")
  ## else if(!metric%in%c("euclidean", "manhattan", "gower"))
  ##   stop("metric is unknown")

  # add features using random forest (ranger)
  if (addRF) {
    features_added <- c()
    dist_var_new <- list()
    weights_new <- list()
    # create data set without missings for regressors
    # seems to be most efficient way
    # can still be improved...?
    dataRF <- suppressWarnings(VIM::kNN(data[, unique(c(
      unlist(dist_var),
      variable
    )), with = FALSE], imp_var = FALSE))

    for (i in 1:nvar) {
      if (any(indexNA2s[, variable[i]])) {
        if (is.list(dist_var)) {
          dist_var_cur <- dist_var[[i]]
        } else {
          dist_var_cur <- dist_var
        }
        regressors <- dist_var_cur[dist_var_cur != variable[i]]
        index.miss <- data[is.na(get(variable[i])), which = TRUE]

        data.mod <- dataRF[-c(index.miss), unique(c(dist_var_cur, variable[i])),
          with = FALSE
        ]

        if (nrow(data.mod) == 0) {
          warning(
            "cannot use random forest for ", variable[i],
            "\n too many missing values in the data"
          )
          next
        }
        ranger.formula <- as.formula(paste(variable[i],
          paste(regressors, collapse = "+"),
          sep = "~"
        ))
        class_data.mod <- sapply(data.mod, function(x) class(x)[1])
        if ("character" %in% class_data.mod) {
          for (cn in colnames(data.mod)[class_data.mod == "character"]) {
            data.mod[[cn]] <- as.factor(data.mod[[cn]])
          }
        }
        ranger.mod <- ranger::ranger(ranger.formula, data = data.mod)

        new_feature <- c(paste0(variable[i], "randomForestFeature"))
        data[, c(new_feature) := predict(ranger.mod, data = dataRF)$predictions]

        features_added <- c(features_added, new_feature)

        if (variable[i] %in% mixed) {
          mixed <- c(mixed, new_feature)
        } else if (variable[i] %in% numerical) {
          numerical <- c(numerical, new_feature)
        } else if (variable[i] %in% orders) {
          orders <- c(orders, new_feature)
        } else if (variable[i] %in% factors) {
          factors <- c(factors, new_feature)
        }

        if (onlyRF) {
          dist_var_new[[i]] <- c(new_feature)
        } else {
          dist_var_new[[i]] <- c(dist_var_cur, new_feature)
          if (!is.null(weights) && weights[1] != "auto") {
            if (is.list(weights)) {
              weights_new[[i]] <- c(weights[[i]], stats::median(weights[[i]]))
            } else {
              weights_new[[i]] <- c(weights, stats::median(weights))
            }
          }
        }
      }
    }
    rm(dataRF)
    # create sets for distance variables
    dist_var <- dist_var_new
    if (!is.null(weights) && weights[1] != "auto") {
      weights <- weights_new
    }
  } else {
    if (onlyRF) {
      onlyRF <- FALSE
      warning("The onlyRF is automatically set to FALSE, because addRF=FALSE.")
    }
    features_added <- NULL
  }
  # set weights vector
  if (is.null(weights)) {
    if (is.list(dist_var)) {
      weights <- lapply(dist_var, function(z) {
        rep(1, length(z))
      })
    } else {
      weights <- rep(1, length(dist_var))
    }
  } else if (weights[1] == "auto") {
    # use random forest and importance values for automatic weighting
    # setup dist_var and weights as lists
    # for each model different weights
    weights_new <- list()
    dist_var_new <- list()

    for (i in 1:nvar) {
      if (any(indexNA2s[, variable[i]])) {
        if (is.list(dist_var)) {
          regressors <- dist_var[[i]][dist_var != variable[i]]
          data.mod <- stats::na.omit(
            subset(data, select = unique(c(variable[i], dist_var[[i]])))
          )
        } else {
          regressors <- dist_var[dist_var != variable[i]]
          data.mod <- stats::na.omit(
            subset(data, select = unique(c(variable[i], dist_var)))
          )
        }

        ranger.formula <- as.formula(
          paste(variable[i], paste(regressors, collapse = "+"), sep = "~")
        )
        ranger.mod <- ranger::ranger(
          ranger.formula,
          data = data.mod, importance = "impurity"
        )
        dist_var_new[[i]] <- regressors
        weights_new[[i]] <- ranger::importance(ranger.mod)
      }
    }
    weights <- weights_new
    dist_var <- dist_var_new
    rm(weights_new, dist_var_new)
  } else if (any(lengthL(weights) != lengthL(dist_var))) {
    stop("length of weights must be equal the number of distance variables")
  }
  if (addRandom) {
    numerical <- c(numerical, "RandomVariableForImputation")
    data[, "RandomVariableForImputation" := stats::rnorm(ndat)] # ,with=FALSE]
    if (is.list(dist_var)) {
      for (i in 1:length(dist_var)) {
        dist_var[[i]] <- c(dist_var[[i]], "RandomVariableForImputation")
        weights[[i]] <- c(
          weights[[i]], min(weights[[i]]) / (sum(weights[[i]]) + 1)
        )
      }
    } else {
      dist_var <- c(dist_var, "RandomVariableForImputation")
      weights <- c(weights, min(weights) / (sum(weights) + 1))
    }
  }
  for (j in 1:nvar) {
    if (any(indexNA2s[, variable[j]])) {
      if (is.list(dist_var)) {
        if (!is.list(weights)) {
          stop("if dist_var is a list weights must be a list")
        }
        dist_varx <- dist_var[[j]]
        weightsx <- weights[[j]]
      } else {
        dist_varx <- dist_var[dist_var != variable[j]]
        weightsx <- weights[dist_var %in% dist_varx]
      }
      if (!is.null(donorcond)) {
        cmd <- paste0(
          "TF <- data[,sapply(.SD,function(x)!is.na(x)&x",
          donorcond[[j]], "),.SDcols=variable[j]][,1]"
        )
        eval(parse(text = cmd))
        don_dist_var <- data[TF, dist_varx, with = FALSE]
        don_index <- INDEX[TF]
      } else {
        TF <- data[, sapply(.SD, function(x) !is.na(x)),
          .SDcols = variable[j]
        ][, 1]
        don_dist_var <- data[TF, dist_varx, with = FALSE]
        don_index <- INDEX[TF]
      }
      TF_imp <- indexNA2s[, variable[j]]
      imp_dist_var <- data[TF_imp, dist_varx, with = FALSE]
      imp_index <- INDEX[TF_imp]

      #
      if (!useImputedDist && any(dist_varx %in% variable)) {
        for (dvar in dist_varx[dist_varx %in% variable]) {
          ## setting the value for original missing variables to NA
          don_dist_var[indexNA2s[TF, dvar], c(dvar) := NA] # ,with=FALSE]
          imp_dist_var[indexNA2s[TF_imp, dvar], c(dvar) := NA] # ,with=FALSE]
        }
      }

      numericalX <- numerical[numerical %in% dist_varx]
      factorsX <- factors[factors %in% dist_varx]
      ordersX <- orders[orders %in% dist_varx]
      levOrdersX <- levOrders[orders %in% dist_varx]
      # print(levOrdersX)
      mixedX <- mixed[mixed %in% dist_varx]
      # dist_single provide the rows of the k nearest neighbours and the
      # corresponding distances
      mindi <- dist_single(as.data.frame(don_dist_var),
        as.data.frame(imp_dist_var),
        numericalX, factorsX, ordersX, mixedX, levOrdersX,
        don_index, imp_index, weightsx, k, mixed.constant,
        provideMins = weightDist
      )
      getI <- function(x) data[x, variable[j], with = FALSE]
      if (trace) {
        message(
          sum(indexNA2s[, variable[j]]),
          "items of", "variable:", variable[j], " imputed\n"
        )
      }
      # Fetching the actual values of the kNNs for the indices provided by
      # dist_single
      getI <- function(x) data[x, variable[j], with = FALSE]
      kNNs <- do.call("cbind", apply(mindi[[1]], 2, getI))
      if (k == 1) {
        kNNs <- t(kNNs)
      }

      if (weightDist & k > 1) {
        if (length(factors) < length(variable) & !"weights" %in% names(
          as.list(args(numFun))
        )) {
          warning("There is no explicit 'weights' argument in your numeric
                  aggregation function.")
        }
        if (length(factors) > 0 && !"weights" %in% names(
          as.list(args(catFun))
        )) {
          warning("There is no explicit 'weights' argument in your
                  categorical aggregation function.")
        }
        # 1-dist because dist is between 0 and 1
        mindi[[2]] <- 1 - mindi[[2]]
        ### warning if there is no argument named weights
        if (variable[j] %in% factors) {
          data[indexNA2s[, variable[j]], variable[j]] <- sapply(
            1:ncol(kNNs),
            function(x) {
              do.call("catFun", list(
                unlist(kNNs[, x, with = FALSE]),
                mindi[[2]][, x]
              ))
            }
          )
          ### Add the disj result###
          tmp <- sapply(1:ncol(kNNs), function(x) {
            do.call(
              "probCat", list(
                unlist(kNNs[, x, with = FALSE]),
                levels(data[[variable[j]]]), mindi[[2]][, x]
              )
            )
          })
          df_tmp <- data.frame(t(data.frame(tmp)))
          data.disj[indexNA2s[, variable[j]], dict_cat[[variable[j]]]] <- df_tmp
          #########################
        } else if (is.integer(data[, variable[j]])) {
          data[indexNA2s[, variable[j]], variable[j]] <-
            round(sapply(1:ncol(kNNs), function(x) {
              do.call(
                "numFun", list(
                  unlist(kNNs[, x, with = FALSE]),
                  mindi[[2]][, x]
                )
              )
            }))
          data.disj[indexNA2s[, variable[j]], variable[j]] <-
            round(sapply(1:ncol(kNNs), function(x) {
              do.call(
                "numFun", list(
                  unlist(kNNs[, x, with = FALSE]),
                  mindi[[2]][, x]
                )
              )
            }))
        } else {
          data[indexNA2s[, variable[j]], variable[j]] <-
            sapply(1:ncol(kNNs), function(x) {
              do.call(
                "numFun", list(unlist(kNNs[, x, with = FALSE]), mindi[[2]][, x])
              )
            })
          data.disj[indexNA2s[, variable[j]], variable[j]] <-
            sapply(1:ncol(kNNs), function(x) {
              do.call(
                "numFun", list(unlist(kNNs[, x, with = FALSE]), mindi[[2]][, x])
              )
            })
        }
      } else {
        if (variable[j] %in% factors) {
          data[indexNA2s[, variable[j]], variable[j]] <- apply(kNNs, 2, catFun)
          ### Add the disj result###
          tmp <- apply(kNNs, 2, probCat, levels(data[[variable[j]]]))
          df_tmp <- data.frame(matrix(unlist(tmp),
            nrow = length(tmp),
            byrow = TRUE
          ))
          data.disj[indexNA2s[, variable[j]], dict_cat[[variable[j]]]] <- df_tmp
          #########################
        } else if (is.integer(data[, variable[j]])) {
          data[indexNA2s[, variable[j]], variable[j]] <- round(apply(
            kNNs, 2,
            numFun
          ))
          data.disj[indexNA2s[, variable[j]], variable[j]] <- round(
            apply(kNNs, 2, numFun)
          )
        } else {
          data[indexNA2s[, variable[j]], variable[j]] <- apply(kNNs, 2, numFun)
          data.disj[indexNA2s[, variable[j]], variable[j]] <- apply(
            kNNs, 2,
            numFun
          )
        }
      }
    } else {
      if (trace) {
        message("0 items of", "variable:", variable[j], " imputed\n")
      }
    }
  }
  if (trace) {
    print(difftime(Sys.time(), startTime))
  }
  if (addRandom) {
    RandomVariableForImputation <- NULL # for satisfying CRAN check
    data <- data[, RandomVariableForImputation := NULL]
  }
  if (!is.null(features_added)) {
    data[, c(features_added) := NULL]
  }
  if (data_df) {
    data <- as.data.frame(data)
  }
  ## Add: change return form
  data <- data.frame(data)
  colnum <- length(colnames(data))
  dataimp <- data[, 1:(colnum / 2)]
  R.mask <- data[, (colnum / 2 + 1):colnum]
  for (col in colnames(data.disj)) {
    data.disj[[col]] <- unlist(data.disj[[col]])
  }
  # return to original levels
  if (exist_cat) {
    for (col in name_cat) {
      levels(dataimp[[col]]) <- dict_lev[[col]]
    }
    colnames(data.disj) <- col_names.disj
  }
  row.names(dataimp) <- row.names(data.disj)
  return(list(ximp = dataimp, ximp.disj = data.disj, R.mask = R.mask))
}
adimajo/MissImp documentation built on April 5, 2022, 6:32 a.m.