#' @title PipeOpSoftImpute
#' @name PipeOpSoftImpute
#'
#' @description
#' Implements the SoftImpute method as an mlr3 pipeline operator; see \code{\link{autotune_softImpute}} for more about SoftImpute.
#'
#' @section Input and Output Channels:
#' Input and output channels are inherited from \code{\link{PipeOpImpute}}.
#'
#'
#' @section Parameters:
#' The parameters include those inherited from \code{\link{PipeOpImpute}}, as well as the following (a short sketch of setting them appears after the list): \cr
#' \itemize{
#' \item \code{id} :: \code{character(1)}\cr
#' Identifier of the resulting object, default \code{"impute_softImpute_B"}.
#' \item \code{lambda} :: \code{integer(1)}\cr
#' Nuclear-norm regularization parameter. If \code{lambda = 0}, the algorithm reverts to "hardImpute", for which convergence is typically slower. If \code{NULL}, lambda is set automatically to the highest possible value, default \code{0}.
#' \item \code{rank.max} :: \code{integer(1)}\cr
#' This parameter restricts the rank of the solution. If set to \code{NULL}, \code{rank.max = min(dim(X)) - 1}, default \code{2}.
#' \item \code{type} :: \code{character(1)}\cr
#' Two algorithms are implemented: \code{type = "svd"} or the default \code{type = "als"}. The "svd" algorithm repeatedly computes the SVD of the completed matrix and soft-thresholds its singular values. Each new soft-thresholded SVD is used to re-impute the missing entries. For large matrices of class "Incomplete", the SVD is achieved by an efficient form of alternating orthogonal ridge regression. The "als" algorithm uses the same alternating ridge regression, but updates the imputation at each step, leading to quite substantial speedups in some cases. The "als" approach does not currently have the same theoretical convergence guarantees as the "svd" approach, default \code{"als"}.
#' \item \code{thresh} :: \code{double(1)}\cr
#' Threshold for convergence, default \code{1e-5}.
#' \item \code{maxit} :: \code{integer(1)}\cr
#' Maximum number of iterations, default \code{100}.
#' \item \code{cat_Fun} :: \code{function(){}}\cr
#' Function for aggregating the k nearest neighbours in the case of categorical variables. It can be any function with input = non-numeric vector and output = atomic object, default \code{VIM::maxCat}.
#' \item \code{out_file} :: \code{character(1)}\cr
#' Output log file location. If the file already exists, the log message will be appended. If \code{NULL}, no log is produced, default \code{NULL}.
#' }
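#'
#' A minimal sketch of overriding these defaults after construction (the values shown are purely illustrative):
#' \preformatted{
#' op <- PipeOpSoftImpute$new()
#' op$param_set$values$rank.max <- 3
#' op$param_set$values$maxit <- 200
#' }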
#'
#' @examples
#' {
#' graph <- PipeOpSoftImpute$new() %>>% mlr3learners::LearnerClassifGlmnet$new()
#' graph_learner <- GraphLearner$new(graph)
#'
#' # Task with NA
#'
#' resample(tsk("pima"), graph_learner, rsmp("cv", folds = 3))
#' }
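#'
#' # A minimal sketch of applying the PipeOp on its own (assumes the "pima"
#' # task from mlr3 is available):
#' op <- PipeOpSoftImpute$new()
#' task_imputed <- op$train(list(tsk("pima")))[[1]]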
#' @export
PipeOpSoftImpute <- R6::R6Class("softImpute_imputation",
lock_objects = FALSE,
inherit = PipeOpImpute, # inherit from PipeOpImpute
public = list(
initialize = function(id = "impute_softImpute_B", cat_Fun = VIM::maxCat, lambda = 0, rank.max = 2, type = "als", thresh = 1e-5, maxit = 100,
out_file = NULL) {
super$initialize(id,
whole_task_dependent = TRUE, packages = "NADIA", param_vals = list(
cat_Fun = cat_Fun, lambda = lambda,
rank.max = rank.max, type = type, thresh = thresh, maxit = maxit,
out_file = out_file),
param_set = ParamSet$new(list(
"cat_Fun" = ParamUty$new("cat_Fun", default = VIM::maxCat, tags = "softImpute"),
"lambda" = ParamUty$new("lambda", default = 0, tags = "softImpute"),
"rank.max" = ParamUty$new("rank.max", default = 2, tags = "softImpute"),
"type" = ParamFct$new("type", levels = c("als", "svd"), default = "als", tags = "softImpute"),
"thresh" = ParamDbl$new("thresh", upper = Inf, lower = 0, default = 1e-5, tags = "softImpute"),
"maxit" = ParamDbl$new("maxit", upper = Inf, lower = 1, default = 100, tags = "softImpute"),
"out_file" = ParamUty$new("out_file", default = NULL, tags = "softImpute")
))
)
self$imputed <- FALSE
self$column_counter <- NULL
self$data_imputed <- NULL
}), private = list(
.train_imputer = function(feature, type, context) {
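# helper: impute the whole data frame (the imputed feature together with its
# context) with NADIA::autotune_softImpute, using the current hyperparameters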
imp_function <- function(data_to_impute) {
data_to_impute <- as.data.frame(data_to_impute)
# preparing arguments for NADIA::autotune_softImpute
col_type <- 1:ncol(data_to_impute)
for (i in seq_len(ncol(data_to_impute))) {
col_type[i] <- class(data_to_impute[, i])
}
percent_of_missing <- 1:ncol(data_to_impute)
for (i in seq_len(ncol(data_to_impute))) {
percent_of_missing[i] <- (sum(is.na(data_to_impute[, i])) / nrow(data_to_impute)) * 100
}
col_miss <- colnames(data_to_impute)[percent_of_missing > 0]
col_no_miss <- colnames(data_to_impute)[percent_of_missing == 0]
data_imputed <- NADIA::autotune_softImpute(data_to_impute,
percent_of_missing = percent_of_missing, col_type = col_type,
cat_Fun = self$param_set$values$cat_Fun,
lambda = self$param_set$values$lambda, rank.max = self$param_set$values$rank.max,
type = self$param_set$values$type, thresh = self$param_set$values$thresh,
maxit = self$param_set$values$maxit, out_file = self$param_set$values$out_file)
return(data_imputed)
}
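# bookkeeping: the whole task is imputed once (when the first feature is
# trained) and cached in self$data_imputed; column_counter counts down the
# remaining features so the cache is refreshed on the next training pass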
self$imputed_predict <- TRUE
self$flag <- "train"
if (!self$imputed) {
self$column_counter <- ncol(context) + 1
self$imputed <- TRUE
data_to_impute <- cbind(feature, context)
self$data_imputed <- imp_function(data_to_impute)
colnames(self$data_imputed) <- self$state$context_cols
}
if (self$imputed) {
self$column_counter <- self$column_counter - 1
}
if (self$column_counter == 0) {
self$imputed <- FALSE
}
self$train_s <- TRUE
self$action <- 3
return(list("data_imputed" = self$data_imputed, "train_s" = self$train_s, "flag" = self$flag, "imputed_predict" = self$imputed_predict, "imputed" = self$imputed, "column_counter" = self$column_counter))
},
.impute = function(feature, type, model, context) {
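# if the bookkeeping fields are missing (e.g. the state was restored into a
# fresh object), rebuild them from the stored model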
if (is.null(self$action)) {
self$train_s <- TRUE
self$flag <- "train"
self$imputed_predict <- TRUE
self$action <- 3
self$data_imputed <- model$data_imputed
self$imputed <- FALSE
self$column_counter <- 0
}
imp_function <- function(data_to_impute) {
data_to_impute <- as.data.frame(data_to_impute)
# preparing arguments for NADIA::autotune_softImpute
col_type <- 1:ncol(data_to_impute)
for (i in seq_len(ncol(data_to_impute))) {
col_type[i] <- class(data_to_impute[, i])
}
percent_of_missing <- 1:ncol(data_to_impute)
for (i in seq_len(ncol(data_to_impute))) {
percent_of_missing[i] <- (sum(is.na(data_to_impute[, i])) / nrow(data_to_impute)) * 100
}
data_imputed <- NADIA::autotune_softImpute(data_to_impute,
percent_of_missing = percent_of_missing, col_type = col_type,
cat_Fun = self$param_set$values$cat_Fun,
lambda = self$param_set$values$lambda, rank.max = self$param_set$values$rank.max,
type = self$param_set$values$type, thresh = self$param_set$values$thresh,
maxit = self$param_set$values$maxit, out_file = self$param_set$values$out_file)
return(data_imputed)
}
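# serve the requested feature from the cached imputation; when new data is
# detected (different number of rows or a prediction call), re-impute the
# feature together with its context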
if (self$imputed) {
feature <- self$data_imputed[, setdiff(colnames(self$data_imputed), colnames(context))]
}
if ((nrow(self$data_imputed) != nrow(context) | !self$train_s) & self$flag == "train") {
self$imputed_predict <- FALSE
self$flag <- "predict"
}
if (!self$imputed_predict) {
data_to_impute <- cbind(feature, context)
self$data_imputed <- imp_function(data_to_impute)
colnames(self$data_imputed)[1] <- setdiff(self$state$context_cols, colnames(context))
self$imputed_predict <- TRUE
}
if (self$imputed_predict & self$flag == "predict") {
feature <- self$data_imputed[, setdiff(colnames(self$data_imputed), colnames(context))]
}
if (self$column_counter == 0 & self$flag == "train") {
feature <- self$data_imputed[, setdiff(colnames(self$data_imputed), colnames(context))]
self$flag <- "predict"
self$imputed_predict <- FALSE
}
self$train_s <- FALSE
return(feature)
}
)
)