R/impute.R
In biokNN: Bi-Objective k-Nearest Neighbors Imputation for Multilevel Data

Documented in biokNN.impute biokNN.impute.mi calibrate

#' Impute multilevel dataset
#'
#' This function returns a dataframe with a complete dataset, where the missing values are imputed
#' using a bi-objective kNN method. It assumes that the class variable is complete and its name is known, and the rest
#' of the variables are numerical.
#'
#' @param data A dataframe with missing values
#' @param className name of the variable that contains the classes
#' @param nIter number of iterations, default = 10
#' @param weight weight of the kNN values in the objective function, default = 0.5
#' @param k number of nearest neighbours, default = 10
#' @param distance distance function used to get the k-nearest neighbors
#' @return A dataframe with the imputed data
#' @export
#'
#' @examples
#' data(data.example)
#' complete.data <- biokNN.impute(data.example,
#'                className = "class",
#'                nIter = 10,
#'                weight = 0.9,
#'                k = 15,
#'                distance = "gower")
biokNN.impute <- function(data, className, nIter = 10, weight = 0.5, k = 10, distance = "gower"){
  df_miss <- data
  if(check.data(df_miss, className)){
    df_miss <- dplyr::select(df_miss, className, dplyr::everything())
    vars <- ncol(df_miss)
    rows <- nrow(df_miss)
    neighbors <- matrix(nrow = rows, ncol = k)
    index_miss <- get.index.miss(df_miss)

    M <- mice::complete(mice::mice(df_miss, meth = "sample", m = 1, print = FALSE))
    clusters <- stats::setNames(as.matrix(M[, 1]), 1:rows)

    iter <-1
    while(iter < nIter){
      dist <- as.matrix(cluster::daisy(M, metric = distance))
      neighbors <- get.neighbors(as.matrix(dist), k)
      clusters <- stats::setNames(as.matrix(M[, 1]), 1:rows)

      for(j in 2:vars){
        for(i in index_miss[, j]){
          if(i != 0){
            clust_val <- clusters[i]
              sum_kNN <- sum(M[neighbors[i,], j])
              has_neigh <- has.x.as.neighbor(i, neighbors)
              sum_neigh <- sum(M[has_neigh, j])
              m_knn <- (sum_kNN + sum_neigh)/(k + length(has_neigh))
              m_cluster <- mean(M[as.numeric(names(clusters[clusters==clust_val])), j])
              M[i, j] <- weight*m_knn + (1-weight)*m_cluster
          }
        }
      }
      iter <- iter + 1
    }
    M
  } else {
    print("The dataframe is not in the correct format.")
  }
}


#' Multiple imputation for a multilevel dataset
#'
#' This function returns a list of m complete datasets, where the missing values are imputed
#' using a bi-objective kNN method. It assumes that the class variable name is known, and the rest
#' of the variables are numerical.
#'
#' @param data A dataframe with missing values
#' @param className name of the variable that contains the classes
#' @param m number of imputations
#' @param nIter number of iterations, default = 10
#' @param weight weight of the kNN values in the objective function, default = 0.5
#' @param k number of nearest neighbours, default = 10
#' @param distance distance function used to get the k-nearest neighbors
#' @return A dataframe with the imputed data
#' @export
#'
#' @examples
#' data(data.example)
#' complete.data.mi <- biokNN.impute.mi(data.example,
#'                className = "class",
#'                m = 3,
#'                nIter = 10,
#'                weight = 0.9,
#'                k = 15,
#'                distance = "gower")
#' # View completed data sets
#' str(complete.data.mi)
biokNN.impute.mi <- function(data, className, m =5, nIter = 10, weight = 0.5, k = 10, distance = "gower"){
  df_miss <- data
  if(check.data(df_miss, className)){

    MI <- vector(mode = "list", length = m)
    for(t in 1:m){
      df_miss <- dplyr::select(df_miss, className, dplyr::everything())
      vars <- ncol(df_miss)
      rows <- nrow(df_miss)
      neighbors <- matrix(nrow = rows, ncol = k)
      index_miss <- get.index.miss(df_miss)

      M <- mice::complete(mice::mice(df_miss, meth = "sample", m = 1, print = FALSE))
      clusters <- stats::setNames(as.matrix(M[, 1]), 1:rows)

      iter <-1
      while(iter < nIter){
        dist <- as.matrix(cluster::daisy(M, metric = distance))
        neighbors <- get.neighbors(as.matrix(dist), k)
        clusters <- stats::setNames(as.matrix(M[, 1]), 1:rows)

        for(j in 2:vars){
          for(i in index_miss[, j]){
            if(i != 0){
              clust_val <- clusters[i]
              sum_kNN <- sum(M[neighbors[i,], j])
              has_neigh <- has.x.as.neighbor(i, neighbors)
              sum_neigh <- sum(M[has_neigh, j])
              m_knn <- (sum_kNN + sum_neigh)/(k + length(has_neigh))
              m_cluster <- mean(M[as.numeric(names(clusters[clusters==clust_val])), j])
              M[i, j] <- weight*m_knn + (1-weight)*m_cluster
            }
          }
        }
        iter <- iter + 1
      }
      MI[[t]] <- M
    }
    MI
  } else {
    print("The dataframe is not in the correct format.")
  }
}


#' Calibrate parameters
#'
#' This function returns a vector with the two parameters requiered by the biokNN method
#' where the first value is the weighting parameter and the second the number of neighbors
#'
#' @param data A dataframe with missing values
#' @param className name of the variable that contains the classes
#' @param distance distance function used to get the k-nearest neighbors
#' @param nIter number of iterations, default = 10
#' @param prop_valid proportion of missing values
#' @param weight_space vector with the calibration values to test for the weight parameter
#' @param k_space vector with the calibration values to test for the number of neighbors
#' @param print option to print  the RMSE values of the parameters used for calibration (print = TRUE).
#' @return A dataframe with the imputed data
#' @export
#'
#' @examples
#' data(data.example)
#' calibrate(data.example,
#'           "class",
#'           prop_valid = 0.3,
#'           weight_space = c(0.5, 0.7, 0.9),
#'           k_space = c(10, 15),
#'           print = TRUE)
calibrate <- function(data, className, prop_valid = 0.1, nIter = 10, distance = "gower", weight_space = NULL, k_space = NULL, print = FALSE){
  df_miss <- data
  if(check.data(df_miss, className)){
    df_miss <- dplyr::select(df_miss, className, dplyr::everything())}

  if(is.null(weight_space)){
    alpha <- seq(0, 1, by = 0.1)
  } else {
    alpha <- weight_space
  }

  if(is.null(k_space)){
    k <- seq(5, 30, by =5)
  } else {
    k <- k_space
  }

  orig_pattern <- get.index.miss(df_miss)
  df_miss_valid <- make.missing(df_miss, make.pattern(df_miss, prop_valid))
  valid_pattern <- get.valid.pattern(orig_pattern, df_miss_valid)
  metrics <- matrix(ncol = length(k), nrow = length(alpha))

  j <- 1
  for(knn in k){
    i <- 1
    for(a in alpha){
      imp_val <- impute.multilevel.num.calibrate(df_miss_valid, distance, valid_pattern, nIter, a, knn)
      RMSE <- get.RMSE(df_miss, imp_val, valid_pattern)
      metrics[i, j] <- RMSE
      i <- i + 1
      if(print){
        print(paste0(a, ",", knn, ",", RMSE))
      }
    }
    j <- j + 1
  }
  best_ind <- which(metrics == min(metrics), arr.ind = TRUE)
  alpha_k <- c(alpha[best_ind[1]], k[best_ind[2]])
}


get.RMSE <- function(x, y, indx){
  RMSE_num <- 0
  vars_num <- 0
  nvar <- ncol(x)

  for(d in 2:(nvar)){
    for(i in indx[,d]){
      if(i != 0){
        RMSE_num <- RMSE_num + (x[i,d] - y[i,d])^2
        vars_num <- vars_num + 1
      }
    }
  }

  RMSE <- sqrt(RMSE_num/vars_num) #+ RMSE_cat/vars_cat)
  RMSE
}



#' @importFrom magrittr %>%
check.data <- function(df, className){
  check <- TRUE
  aux <- df  %>% dplyr::select(-className) %>% dplyr::select_if(~!is.numeric(.x))
  if(ncol(aux) > 1){
    check <- FALSE
  }
  check
}



get.neighbors <- function(dist.matrix, kNN){
  n <- nrow(dist.matrix)
  neigh <- matrix(nrow = n, ncol = kNN)
  for(i in 1:n){
    d <- sort(dist.matrix[i,])
    neigh[i,] <- as.numeric(names(d[2:(kNN+1)]))
  }
  neigh
}


has.x.as.neighbor <- function(i, neighbors){
  vec <- c()
  n <- nrow(neighbors)
  for(j in 1:n){
    if(i %in% neighbors[j,]){
      vec <- c(vec, j)
    }
  }
  vec
}


get.index.miss <- function(W){
  index_miss_vars_W <- matrix(NA, ncol = ncol(W), nrow = nrow(W))
  for(i in 1:ncol(W)){
    index_miss_vars_W[,i] <- 1:nrow(W)*is.na(W[,i])
  }
  index_miss_vars_W
}


make.missing <- function(data, pattern){
  vars <- ncol(data)
  for(i in 1:vars){
    data[pattern[ , i] == 1, i] <- NA
  }
  data
}


#' @importFrom stats rbinom
make.pattern <- function(data, p = 0.1){
  vars <- ncol(data) - 1
  prop <- p/vars
  pattern <- rep(0, nrow(data))
  for(i in 1:vars){
    pattern <- cbind(pattern, rbinom(nrow(data), 1, prop))
  }
  pattern
}


get.valid.pattern <- function(orig_pattern, df_miss_valid){
  n <- nrow(df_miss_valid)
  p <- ncol(df_miss_valid)
  index_miss_valid <- matrix(NA, ncol = p, nrow = n)
  index_miss_joint <- get.index.miss(df_miss_valid)

  for(j in 1:p){
    for(i in 1:n){
      orig <- orig_pattern[i,j]
      joint <- index_miss_joint[i,j]
      if(orig == 0 & joint !=0){
        index_miss_valid[i,j] <- joint
      } else {
        index_miss_valid[i,j] <- 0
      }
    }
  }
  index_miss_valid
}

impute.multilevel.num.calibrate <- function(df_miss, distance, pattern_val, nIter, alpha, k){

  vars <- ncol(df_miss)
  rows <- nrow(df_miss)

  neighbors <- matrix(nrow = rows, ncol = k)
  index_miss <- pattern_val

  M <- mice::complete(mice::mice(df_miss, meth = "sample", m = 1, print = FALSE))
  clusters <- stats::setNames(as.matrix(M[, 1]), 1:rows)

  iter <-1
  while(iter < nIter){
    dist <- as.matrix(cluster::daisy(M, metric = distance))
    neighbors <- get.neighbors(as.matrix(dist), k)
    clusters <- stats::setNames(as.matrix(M[, 1]), 1:rows)

    for(j in 2:vars){
      for(i in index_miss[, j]){
        if(i != 0){
          clust_val <- clusters[i]
          sum_kNN <- sum(M[neighbors[i,], j])
          has_neigh <- has.x.as.neighbor(i, neighbors)
          sum_neigh <- sum(M[has_neigh, j])
          m_knn <- (sum_kNN + sum_neigh)/(k + length(has_neigh))
          m_cluster <- mean(M[as.numeric(names(clusters[clusters==clust_val])), j])
          M[i, j] <- alpha*m_knn + (1-alpha)*m_cluster
        }
      }
    }
    iter <- iter + 1
  }
  M
}

Any scripts or data that you put into this service are public.

biokNN documentation built on April 22, 2021, 9:07 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

biokNN
Bi-Objective k-Nearest Neighbors Imputation for Multilevel Data

R/impute.R
In biokNN: Bi-Objective k-Nearest Neighbors Imputation for Multilevel Data

Defines functions impute.multilevel.num.calibrate get.valid.pattern make.pattern make.missing get.index.miss has.x.as.neighbor get.neighbors check.data get.RMSE calibrate biokNN.impute.mi biokNN.impute

Documented in biokNN.impute biokNN.impute.mi calibrate

Try the biokNN package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

biokNN Bi-Objective k-Nearest Neighbors Imputation for Multilevel Data

R/impute.R In biokNN: Bi-Objective k-Nearest Neighbors Imputation for Multilevel Data

Defines functions impute.multilevel.num.calibrate get.valid.pattern make.pattern make.missing get.index.miss has.x.as.neighbor get.neighbors check.data get.RMSE calibrate biokNN.impute.mi biokNN.impute

Documented in biokNN.impute biokNN.impute.mi calibrate

Try the biokNN package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

biokNN
Bi-Objective k-Nearest Neighbors Imputation for Multilevel Data

R/impute.R
In biokNN: Bi-Objective k-Nearest Neighbors Imputation for Multilevel Data