vimpute.R
In VIM: Visualization and Imputation of Missing Values

## ----include=FALSE------------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5,
  fig.align = "center"
)

## ----echo = FALSE, results='hide', message=FALSE, warning=FALSE---------------
library(VIM)
library(data.table)

## ----setup_2, message = FALSE-------------------------------------------------
data <- as.data.table(VIM::sleep)
a <- aggr(sleep, plot = FALSE)
plot(a, numbers = TRUE, prop = FALSE)

## ----message = FALSE----------------------------------------------------------
dataDS <- sleep[, c("Dream", "Sleep")]
marginplot(dataDS, main = "Missing Values")

## ----include=TRUE, results='hide', message=FALSE, warning=FALSE---------------
result <- vimpute(
  data = data,
  pred_history = TRUE)

## -----------------------------------------------------------------------------
print(head(result$data, 3))

## -----------------------------------------------------------------------------
dataDS <- as.data.frame(result$data[, c("Dream", "Sleep", "Dream_imp", "Sleep_imp")])
marginplot(dataDS, delimiter = "_imp", main = "Imputation with Default Model")

## ----include=TRUE, results='hide', message=FALSE, warning=FALSE---------------
result_mixed <- vimpute(
  data = data,
  method = list(NonD = "robust", Dream = "ranger", Sleep = "xgboost", Span = "ranger" , Gest = "regularized"),
  pred_history = TRUE
  )

## -----------------------------------------------------------------------------
dataDS <- as.data.frame(result_mixed$data[, c("Dream", "Sleep", "Dream_imp", "Sleep_imp")])
marginplot(dataDS, delimiter = "_imp", main = "Imputation with different Models for each Variable")

## ----include=TRUE, results='hide', message=FALSE, warning=FALSE, echo=F-------
result_xgboost <- vimpute(
  data = data,
  method = setNames(as.list(rep("xgboost", ncol(data))), names(data)),
  pred_history = TRUE, verbose = FALSE
  )
dataDS_xgboost <- as.data.frame(result_xgboost$data[, c("Dream", "Sleep", "Dream_imp", "Sleep_imp")])
result_regularized <- vimpute(
  data = data,
  method = setNames(as.list(rep("regularized", ncol(data))), names(data)),
  pred_history = TRUE
  )
dataDS_regularized <- as.data.frame(result_regularized$data[, c("Dream", "Sleep", "Dream_imp", "Sleep_imp")])

## ----echo=F, warning=F--------------------------------------------------------
par(mfrow = c(1, 2))
marginplot(dataDS_xgboost, delimiter = "_imp", main = "Imputation with xgboost")
marginplot(dataDS_regularized, delimiter = "_imp", main = "Imputation with Regularized")
par(mfrow = c(1, 1))

## ----eval = FALSE-------------------------------------------------------------
# result <- vimpute(
#   data = data,
#   method = list(NonD = "robust",
#                 Dream = "ranger",
#                 Sleep = "xgboost",
#                 Span = "ranger" ,
#                 Gest = "regularized"),
#   pmm = list(NonD = FALSE, Dream = TRUE, Sleep = FALSE, Span = FALSE , Gest = TRUE)
#   )

## ----eval = FALSE-------------------------------------------------------------
# result <- vimpute(
#   data = data,
#   method = setNames(as.list(rep("regularized", ncol(data))), names(data))
#   formula = list(
#     NonD ~ Dream + Sleep,              # Linear combination
#     Span ~ Dream:Sleep + Gest,         # With interaction term
#     log(Gest) ~ Sleep + exp(Span)      # With transformations
#   )
# )

## ----eval = FALSE-------------------------------------------------------------
# result <- vimpute(
#   data = data,
#   tune = TRUE
#   )

## ----eval = FALSE-------------------------------------------------------------
# result <- vimpute(
#   data = data,
#   nseq = 20,
#   eps = 0.01
#   )

## ----eval = FALSE-------------------------------------------------------------
# result <- vimpute(
#   data = data,
#   imp_var = TRUE
#   )

## -----------------------------------------------------------------------------
print(tail(result$pred_history, 9))

## -----------------------------------------------------------------------------
library(reactable)

data(iris)
df <- as.data.table(iris)
colnames(df) <- c("S.Length","S.Width","P.Length","P.Width","Species")
# randomly produce some missing values in the data
set.seed(1)
nbr_missing <- 50
y <- data.frame(row=sample(nrow(iris),size = nbr_missing,replace = T),
                col=sample(ncol(iris)-1,size = nbr_missing,replace = T))
y<-y[!duplicated(y),]
df[as.matrix(y)]<-NA

aggr(df)

## -----------------------------------------------------------------------------
sapply(df, function(x)sum(is.na(x)))

## ----results='hide', message=FALSE, warning=FALSE,include=FALSE---------------
library(reactable)

data(iris)
df <- as.data.table(iris)
colnames(df) <- c("S.Length","S.Width","P.Length","P.Width","Species")

# Create complete copy before introducing NAs
complete_data <- df

# Randomly produce missing values
set.seed(1)
nbr_missing <- 50
y <- data.frame(row = sample(nrow(df), size = nbr_missing, replace = TRUE),
                col = sample(ncol(df), size = nbr_missing, replace = TRUE))
y <- y[!duplicated(y),]
df[as.matrix(y)] <- NA

# Perform imputation
result <- vimpute(data = df, pred_history = TRUE)

# Extracting the imputed columns from result$data
imputed_columns <- grep("_imp$", names(result$data), value = TRUE)

# Create a function to compare true and imputed values
compare_values <- function(true_data, pred_data, imputed_data, col_name) {
  comparison <- data.frame(
    True_Value = true_data[[col_name]],
    Imputed_Value = ifelse(imputed_data, pred_data[[col_name]], NA)
  )
  comparison <- comparison[!is.na(comparison$Imputed_Value), ]
  return(comparison)
}

# Initialize an empty list to store the comparison tables
comparison_list <- list()

# Loop through each imputed column and create a comparison table
for (imputed_col in imputed_columns) {
  col_name <- sub("_imp$", "", imputed_col)
  comparison_list[[col_name]] <- compare_values(complete_data, result$data, result$data[[imputed_col]], col_name)
}

# Prepare the results in a combined wide format, ensuring equal row numbers
results <- cbind(
  "TRUE1" = head(comparison_list[["S.Length"]][, "True_Value"], 5),
  "IMPUTED1" = head(comparison_list[["S.Length"]][, "Imputed_Value"], 5),
  "TRUE2" = head(comparison_list[["S.Width"]][, "True_Value"], 5),
  "IMPUTED2" = head(comparison_list[["S.Width"]][, "Imputed_Value"], 5),
  "TRUE3" = head(comparison_list[["P.Length"]][, "True_Value"], 5),
  "IMPUTED3" = head(comparison_list[["P.Length"]][, "Imputed_Value"], 5),
  "TRUE4" = head(comparison_list[["P.Width"]][, "True_Value"], 5),
  "IMPUTED4" = head(comparison_list[["P.Width"]][, "Imputed_Value"], 5)
)

# Print the combined wide format table
print(results)

## ----echo=F,warning=F---------------------------------------------------------
# Load the reactable library
library(reactable)

# Create the reactable
reactable(results, columns = list(
  TRUE1 = colDef(name = "True"),
  IMPUTED1 = colDef(name = "Imputed"),
  TRUE2 = colDef(name = "True"),
  IMPUTED2 = colDef(name = "Imputed"),
  TRUE3 = colDef(name = "True"),
  IMPUTED3 = colDef(name = "Imputed"),
  TRUE4 = colDef(name = "True"),
  IMPUTED4 = colDef(name = "Imputed")
),
columnGroups = list(
  colGroup(name = "S.Length", columns = c("TRUE1", "IMPUTED1")),
  colGroup(name = "S.Width", columns = c("TRUE2", "IMPUTED2")),
  colGroup(name = "P.Length", columns = c("TRUE3", "IMPUTED3")),
  colGroup(name = "P.Width", columns = c("TRUE4", "IMPUTED4"))
),
striped = TRUE,
highlight = TRUE,
bordered = TRUE
)

## ----results='hide', message=FALSE, warning=FALSE,include=FALSE---------------
library(reactable)
library(VIM)
data(iris)

# Create complete copy before introducing NAs
complete_data <- iris
colnames(complete_data) <- c("S.Length","S.Width","P.Length","P.Width","Species")
df <- copy(complete_data)

# Randomly produce missing values
set.seed(1)
nbr_missing <- 50
y <- data.frame(row = sample(nrow(df), size = nbr_missing, replace = TRUE),
                col = sample(ncol(df), size = nbr_missing, replace = TRUE))
y <- y[!duplicated(y),]
df[as.matrix(y)] <- NA

# Perform imputation with proper method specification
result <- vimpute(
  data = df,
  method = setNames(lapply(names(df), function(x) "xgboost"),names(df)),
  pred_history = TRUE
)

# Extracting the imputed columns from result$data
imputed_columns <- grep("_imp$", names(result$data), value = TRUE)

# Create a function to compare true and imputed values
compare_values <- function(true_data, pred_data, imputed_data, col_name) {
  comparison <- data.frame(
    True_Value = true_data[[col_name]],
    Imputed_Value = ifelse(imputed_data, pred_data[[col_name]], NA)
  )
  comparison <- comparison[!is.na(comparison$Imputed_Value), ]
  return(comparison)
}

# Initialize an empty list to store the comparison tables
comparison_list <- list()

# Loop through each imputed column and create a comparison table
for (imputed_col in imputed_columns) {
  col_name <- sub("_imp$", "", imputed_col)
  comparison_list[[col_name]] <- compare_values(complete_data, result$data, result$data[[imputed_col]], col_name)
}

# Prepare the results in a combined wide format, ensuring equal row numbers
results <- cbind(
  "TRUE1" = head(comparison_list[["S.Length"]][, "True_Value"], 5),
  "IMPUTED1" = head(comparison_list[["S.Length"]][, "Imputed_Value"], 5),
  "TRUE2" = head(comparison_list[["S.Width"]][, "True_Value"], 5),
  "IMPUTED2" = head(comparison_list[["S.Width"]][, "Imputed_Value"], 5),
  "TRUE3" = head(comparison_list[["P.Length"]][, "True_Value"], 5),
  "IMPUTED3" = head(comparison_list[["P.Length"]][, "Imputed_Value"], 5),
  "TRUE4" = head(comparison_list[["P.Width"]][, "True_Value"], 5),
  "IMPUTED4" = head(comparison_list[["P.Width"]][, "Imputed_Value"], 5)
)

# Print the combined wide format table
print(results)

## ----echo=F,warning=F---------------------------------------------------------
# Load the reactable library
library(reactable)

# Create the reactable
reactable(results, columns = list(
  TRUE1 = colDef(name = "True"),
  IMPUTED1 = colDef(name = "Imputed"),
  TRUE2 = colDef(name = "True"),
  IMPUTED2 = colDef(name = "Imputed"),
  TRUE3 = colDef(name = "True"),
  IMPUTED3 = colDef(name = "Imputed"),
  TRUE4 = colDef(name = "True"),
  IMPUTED4 = colDef(name = "Imputed")
),
columnGroups = list(
  colGroup(name = "S.Length", columns = c("TRUE1", "IMPUTED1")),
  colGroup(name = "S.Width", columns = c("TRUE2", "IMPUTED2")),
  colGroup(name = "P.Length", columns = c("TRUE3", "IMPUTED3")),
  colGroup(name = "P.Width", columns = c("TRUE4", "IMPUTED4"))
),
striped = TRUE,
highlight = TRUE,
bordered = TRUE
)