Nothing
#' @title pgu.imputation
#'
#' @description
#' Analyses and substitutes imputation sites in a data set.
#'
#' @details
#' Analyses imputation sites in a data set.
#' Replaces imputation sites by missing values and substitutes NAs
#' by classical and ML-powered substitution algorithms.
#' This object is used by the shiny based gui and is not for use in individual R-scripts!
#'
#' @format [R6::R6Class] object.
#'
#' @importFrom dplyr all_of arrange bind_rows filter group_by mutate
#' @importFrom dplyr n_distinct pull rename rowwise select select_if
#' @importFrom dplyr slice summarise sym transmute_all ungroup
#' @importFrom ggplot2 aes aes_string coord_flip element_blank
#' @importFrom ggplot2 element_rect geom_bar geom_boxplot geom_hline
#' @importFrom ggplot2 geom_jitter geom_point geom_text ggplot
#' @importFrom ggplot2 ggtitle layer_scales scale_linetype_manual
#' @importFrom ggplot2 scale_x_continuous scale_y_continuous theme theme_linedraw
#' @importFrom ggplot2 xlab xlim ylab ylim
#' @importFrom ggthemes geom_rangeframe theme_tufte
#' @importFrom gridExtra grid.arrange
#' @importFrom magrittr %>%
#' @importFrom MASS fitdistr
#' @importFrom mice complete flux md.pattern mice quickpred
#' @importFrom psych describe
#' @importFrom R6 R6Class
#' @importFrom RWeka M5P
#' @importFrom shiny Progress
#' @importFrom stats as.formula median rnorm
#' @importFrom tibble add_row as_tibble is_tibble rownames_to_column tibble
#' @importFrom tidyr drop_na expand_grid gather_
#' @importFrom VIM aggr
#'
#' @include pguDMwR.R
#'
#' @author Sebastian Malkusch, \email{malkusch@@med.uni-frankfurt.de}
#'
#' @export
#'
pgu.imputation <- R6::R6Class("pgu.imputation",
####################
# instance variables
####################
private = list(
.imputationStatistics = "tbl_df",
.imputationSites = "tbl_df",
.one_hot_df = "tbl_df",
.imputationSiteDistribution = "matrix",
.imputationAgentAlphabet = "character",
.imputationAgent = "factor",
.nNeighbors = "integer",
.flux_df = "tbl_df",
.outflux_thr = "numeric",
.pred_frac = "numeric",
.pred_mat = "matrix",
.exclude_vec = "character",
.seed = "numeric",
.iterations = "numeric",
.amv = "ANY",
.success = "logical"
),
##################
# accessor methods
##################
active = list(
#' @field imputationStatistics
#' Returns the instance variable imputationStatistics.
#' (tibble::tibble)
imputationStatistics = function(){
return(private$.imputationStatistics)
},
#' @field imputationSites
#' Returns the instance variable imputationSites.
#' (tibble::tibble)
imputationSites = function(){
return(private$.imputationSites)
},
#' @field one_hot_df
#' Returns the positions of missings in one_hot encoding
#' (tibble::tibble)
one_hot_df = function(){
return(private$.one_hot_df)
},
#' @field imputationSiteDistribution
#' Returns the instance variable imputationSiteDistribution.
#' (matrix)
imputationSiteDistribution = function(){
return(private$.imputationSiteDistribution)
},
#' @field imputationAgentAlphabet
#' Returns the instance variable imputationagentAlphabet.
#' (character)
imputationAgentAlphabet = function(){
return(private$.imputationAgentAlphabet)
},
#' @field imputationAgent
#' Returns the instance variable imputationAgent.
#' (character)
imputationAgent = function(){
return(as.character(private$.imputationAgent))
},
#' @field setImputationAgent
#' Sets the instance variable imputationAgent.
#' (character)
setImputationAgent = function(agent = "character") {
private$.imputationAgent <- factor(agent, levels = self$imputationAgentAlphabet)
},
#' @field nNeighbors
#' Returns the instance variable nNeighbors.
#' (integer)
nNeighbors = function(){
return(private$.nNeighbors)
},
#' @field setNNeighbors
#' Sets the instance variable nNeighbors.
#' (integer)
setNNeighbors = function(value = "integer"){
private$.nNeighbors <- abs(as.integer(value))
},
#' @field flux_df
#' Returns the instance variable flux_df
#' (tibble::tibble)
flux_df = function(){
return(private$.flux_df)
},
#' @field outflux_thr
#' Returns the instance variable outflux_thr.
#' (numeric)
outflux_thr = function(){
return(private$.outflux_thr)
},
#' @field setOutflux_thr
#' Sets the instance variable outflux_thr.
#' (numeric)
setOutflux_thr = function(value = numeric){
private$.outflux_thr <- as.numeric(value)
},
#' @field pred_frac
#' Returns the instance variable pred_frac.
#' (numeric)
pred_frac = function(){
return(private$.pred_frac)
},
#' @field setPred_frac
#' Sets the instance variable pred_frac.
#' (numeric)
setPred_frac = function(value = "numeric"){
private$.pred_frac <- abs(as.numeric(value))
},
#' @field pred_mat
#' Returns the instance variable pred_mat.
#' (matrix)
pred_mat = function(){
return(private$.pred_mat)
},
#' @field exclude_vec
#' Returns the instance variable exclude_vec
#' (character)
exclude_vec = function(){
return(private$.exclude_vec)
},
#' @field seed
#' Returns the instance variable seed.
#' (numeric)
seed = function(){
return(private$.seed)
},
#' @field setSeed
#' Sets the instance variable seed.
#' (numeric)
setSeed = function(value = "numeric"){
if(value < 1){
private$.seed <- 1
}
else if (value > 100){
private$.seed <- 100
}
else{
private$.seed <- value
}
},
#' @field iterations
#' Returns the instance variable iterations.
#' (numeric)
iterations = function(){
return(private$.iterations)
},
#' @field setIterations
#' Sets the instance variable iterations.
#' (numeric)
setIterations = function(value = "numeric"){
if(value < 1){
private$.iterations <- 1
}
else if (value > 100){
private$.iterations<- 100
}
else{
private$.iterations <- value
}
},
#' @field amv
#' Returns the instance variable amv.
#' (numeric)
amv = function(){
return(private$.amv)
},
#' @field success
#' Returns the instance variable success.
#' (logical)
success = function(){
return(private$.success)
}
),
###################
# memory management
###################
public = list(
#' @description
#' Creates and returns a new `pgu.imputation` object.
#' @param seed
#' Initially sets the instance variable seed.
#' Default is 42.
#' (integer)
#' @param iterations
#' Initially sets the instance variable iterations.
#' Default is 4.
#' (integer)
#' @param imputationAgent
#' Initially sets the instance variable imputationAgent.
#' Default is "none".
#' Options are: ""none", "median", "mean", "expValue", "monteCarlo", "knn", "pmm", "cart", "randomForest", "M5P".
#' (string)
#' @param nNeighbors
#' Initially sets the instance variable nNeighbors.
#' (integer)
#' @param pred_frac
#' Initially sets the instance variable pred_frac.
#' (numeric)
#' @param outflux_thr
#' Initially sets the instance fariable outflux_thr
#' @return
#' A new `pgu.imputation` object.
#' (pguIMP::pgu.imputation)
initialize = function(seed = 42, iterations = 4, imputationAgent = "none", nNeighbors = 3, pred_frac = 1.0, outflux_thr = 0.5){
private$.imputationAgentAlphabet <- c("none", "median", "mean", "mu", "mc", "knn", "pmm", "cart", "rf", "M5P")
self$setSeed <- seed
self$setIterations <- iterations
self$setImputationAgent <- imputationAgent
self$setNNeighbors <- nNeighbors
self$setPred_frac <- pred_frac
self$setOutflux_thr <- outflux_thr
private$.success <- FALSE
private$.pred_mat <- matrix()
private$.flux_df <- tibble::tibble()
private$.exclude_vec <- character(0)
private$.one_hot_df <- tibble::tibble()
self$gatherImputationSites()
self$gatherImputationSiteStatistics()
self$gatherImputationSiteDistribution()
},#function
#' @description
#' Clears the heap and
#' indicates that instance of `pgu.imputation` is removed from heap.
finalize = function(){
print("Instance of pgu.imputation removed from heap")
},#function
##########################
# print instance variables
##########################
#' @description
#' Prints instance variables of a `pgu.imputation` object.
#' @return
#' string
print = function(){
rString <- sprintf("\npgu.imputation\n")
cat(rString)
uString <- sprintf("\nseed: %i\niterations: %i\nimputationAgent: %s\nimputationStatistics:\n", self$seed, self$iterations, as.character(self$imputationAgent))
cat(uString)
print(self$imputationStatistics)
print("imputationsites:")
print(self$imputationSites)
cat("\n\n")
invisible(self)
}, #function
####################
# public functions #
####################
#' @description
#' Gathers imputation sites from pguIMP's missings and outliers class.
#' @param missings_df
#' Dataframe comprising information about the imputation sites of pguIMP's missings class.
#' (tibble::tibble)
#' @param outliers_df
#' Dataframe comprising information about the imputation sites of pguIMP's outliers class.
#' (tibble::tibble)
gatherImputationSites = function(missings_df = "tbl_df", outliers_df = "tbl_df"){
input_correct <- TRUE
if(!tibble::is_tibble(missings_df)){
input_correct <- FALSE
missings_df <- tibble::tibble(row = integer(0),
features = character(0))
input_correct <- TRUE
}
if(!tibble::is_tibble(outliers_df)){
input_correct <- FALSE
outliers_df <- tibble::tibble(measurement = integer(0),
feature = character(0))
input_correct <- TRUE
}
if(input_correct){
missings_df <- missings_df %>%
dplyr::rename(idx = row) %>%
dplyr::rename(feature = features) %>%
dplyr::select(c("idx", "feature"))
outliers_df <- outliers_df %>%
dplyr::rename(idx = measurement) %>%
dplyr::select(c("idx", "feature"))
private$.imputationSites <- missings_df %>%
dplyr::bind_rows(outliers_df) %>%
dplyr::arrange(feature,idx)
} else{
print("Warning, pguImputation$gatherImputationsites got wrong inut format.")
private$.imputationSites <- tibble::tibble(idx = integer(0),
feature = character(0))
}
}, #function
#' @description
#' Gathers statistical information about imputation sites
#' The information is stored within the classes instance variable `imputationStatistics`
#' @param data_df
#' The data frame to be analyzed.
#' (tibble::tibble)
gatherImputationSiteStatistics = function(data_df = "tbl_df"){
if(tibble::is_tibble(data_df)){
private$.imputationStatistics <- private$.imputationSites %>%
dplyr::group_by(feature) %>%
dplyr::summarise(imputation_sites = dplyr::n_distinct(idx)) %>%
dplyr::arrange(imputation_sites)
for (name in colnames(data_df)){
if(!any(grepl(name, private$.imputationStatistics$feature))){
private$.imputationStatistics <- private$.imputationStatistics %>%
tibble::add_row(feature = !!name, imputation_sites = 0)
}#if
}#for
private$.imputationStatistics <- private$.imputationStatistics %>%
dplyr::mutate(measurements = rep(nrow(data_df), nrow(private$.imputationStatistics))) %>%
dplyr::mutate(trusted = measurements - imputation_sites) %>%
dplyr::mutate(fraction_of_sites = 100.0 * imputation_sites / measurements) %>%
dplyr::select(c("feature", "measurements", "trusted", "imputation_sites", "fraction_of_sites"))
} else{
private$.imputationStatistics <- tibble::tibble(feature = character(0),
measurements = integer(0),
trusted = integer(0),
imputation_sites = integer(0),
fraction_of_sites = numeric(0))
}
}, #function
#' @description
#' Gathers the distribution of imputation sites within the data frame.
#' The information is stored within the classes instance variable imputationSiteDistribution.
#' @param data_df
#' The data frame to be analyzed.
#' (tibble::tibble)
#' @return
#' A data frame
#' (tibble::tibble)
gatherImputationSiteDistribution = function(data_df = "tbl_df"){
if(tibble::is_tibble(data_df)){
d <- data_df %>%
as.data.frame() %>%
mice::md.pattern(plot=FALSE)
# colnames(d)[-1] <- "Sites"
colnames(d)[length(colnames(d))] <- "Sites"
rownames(d)[length(rownames(d))] <- "Sum"
private$.imputationSiteDistribution <- d
} else{
private$.imputationSiteDistribution <- matrix(0)
}
}, #function
#' @description
#' Takes a dataframe, replaces the imputation sites indicated by the instance variable `imputationsites` by NA,
#' and returns the mutated dataframe.
#' @param data_df
#' The data frame to be analyzed.
#' (tibble::tibble)
#' @return
#' A mutated version of data_df.
#' (tibble::tibble)
insertImputationSites = function(data_df = "tbl_df"){
for(name in colnames(data_df)){
temp_idx <- self$imputationSites %>%
dplyr::filter(feature == name) %>%
dplyr::select(idx) %>%
unlist() %>%
as.integer()
temp_values <- data_df %>%
dplyr::select(name) %>%
unlist() %>%
as.numeric()
temp_values[temp_idx] <- NA
data_df <- data_df %>%
dplyr::mutate(!!name := temp_values)
}#for
return(data_df)
}, #function
#' @description
#' Gathers statistical information about missing values
#' in one hot format.
#' The result is stored in the instance variable one_hot_df.
#' @param data_df
#' The data frame to be analyzed.
#' (tibble::tibble)
one_hot = function(data_df = "tbl_df"){
if(!tibble::is_tibble(data_df)){
print("Warning: data_df needs to by of type tibble.")
private$.one_hot_df <- tibble::tibble()
}
private$.one_hot_df <- data_df %>%
dplyr::select_if(is.numeric) %>%
dplyr::transmute_all(list(miss = ~ as.integer(is.na(.))))
}, #function
#' @description
#' Takes a dataframe and analyses the imputation sites.
#' @param data_df
#' The data frame to be analyzed.
#' (tibble::tibble)
analyzeImputationSites = function(data_df = "tbl_df"){
numeric_df <- data_df %>%
dplyr::select_if(is.numeric)
self$gatherImputationSiteStatistics(numeric_df)
numeric_df <- self$insertImputationSites(numeric_df)
self$one_hot(numeric_df)
self$gatherImputationSiteDistribution(numeric_df)
private$.amv <- VIM::aggr(numeric_df, plot=FALSE)
}, #function
#' @description
#' Returns the position of an attribute's imputation sites within a data frame.
#' @param featureName
#' The attribute's name.
#' (character)
#' @return
#' The postion of the imputation sites.
#' (numeric)
imputationSiteIdxByFeature = function(featureName = "character"){
self$imputationSites %>%
dplyr::filter(feature == featureName) %>%
dplyr::pull(idx) %>%
as.integer() %>%
return()
}, #function
#'
#' @description
#' Characterizes each row of the data frame as either `complete`
#' or indicates which attribute are missing within the row.
#' If multiple attributes' row entries are missing, the row is characterized by `multiple`.
#' @param data_df
#' The data frame to be analyzed.
#' (tibble::tibble)
#' @return
#' Vector of row characteristics.
#' (character)
nanFeatureList = function(data_df = "tbl_df"){
nanFeature <- c(rep("complete", nrow(data_df)))
if (nrow(self$imputationSites) > 0) {
for(i in seq(from = 1,to = nrow(self$imputationSites), by =1)){
if (grepl("complete", nanFeature[self$imputationSites[[i,"idx"]]])){
nanFeature[self$imputationSites[[i,"idx"]]] <- self$imputationSites[[i, "feature"]]
}#if
else{
nanFeature[self$imputationSites[[i,"idx"]]] <- "multiple"
}#else
}#for
}#if
return(nanFeature)
}, #function
#####################
# detect predictors #
#####################
#' @description
#' Calculates the average number of predictors for a given dataframe and minpuc and mincor variables
#' using the mice::quickpred routine.
#' @param data_df
#' The dataframe to be analyzed
#' (tibble::tibble)
#' @param minpuc
#' Specifies the minimum threshold for the proportion of usable cases.
#' (numeric)
#' @param mincor
#' Specifies the minimum threshold against which the absolute correlation in the dataframe is compared.
#' (numeric)
#' @return
#' Average_number_of_predictors.
#' (numeric)
average_number_of_predictors = function(data_df = "tbl_df", minpuc = 0, mincor = 0.1){
pred_dist <- data_df %>%
# dplyr::select(-dplyr::all_of(self$exclude_vec)) %>%
mice::quickpred(minpuc = minpuc, mincor = mincor, exclude = self$exclude_vec) %>%
rowSums() %>%
table()
sum(as.numeric(names(pred_dist)) * as.numeric(pred_dist)) / sum(as.numeric(pred_dist)[2:length(pred_dist)]) %>%
return()
}, #function
#' @description
#' Identifies possible predictors for each feature.
#' Analysis results are written to the instance variable pred_mat.
#' Intermediate analysis results are an influx/outflux dataframe
#' that is written to the instance variable flux_df and
#' detect predictors and a list of features that is excluded from
#' the search for possible predictors that is written to the
#' instance variable exclude_vec.
#' @param data_df
#' The dataframe to be analyzed.
#' (tibble::tibble)
detectPredictors = function(data_df = "tbl_df"){
private$.flux_df <- data_df %>%
mice::flux() %>%
tibble::rownames_to_column() %>%
tibble::as_tibble()
private$.exclude_vec <- self$flux_df %>%
dplyr::filter(outflux < self$outflux_thr) %>%
dplyr::select(c("rowname")) %>%
dplyr::pull()
quickpred_df <- tidyr::expand_grid(minpuc = seq(from=0.0, to=0.9, by=0.1),
mincor = seq(from=0.0, to=0.9, by=0.1)) %>%
dplyr::rowwise() %>%
dplyr::mutate(mean = self$average_number_of_predictors(data_df, minpuc, mincor)) %>%
dplyr::ungroup()
quickpred_para <- quickpred_df %>%
tidyr::drop_na() %>%
dplyr::slice(which.min(abs(quickpred_df$mean - trunc(self$pred_frac * ncol(data_df)) )))
minpuc <- quickpred_para$minpuc
mincor <- quickpred_para$mincor
private$.pred_mat <- data_df %>%
mice::quickpred(minpuc = minpuc, mincor = mincor, exclude = self$exclude_vec)
}, #function
###########################
# handle imputation sites #
###########################
#' @description
#' Chooses a cleaning method based upon the instance variable `imputationAgent`
#' and handles the imputation sites in the dataframe.
#' Returns a cleaned data set.
#' Display the progress if shiny is loaded.
#' @param data_df
#' The data frame to be analyzed.
#' (tibble::tibble)
#' @param progress
#' If shiny is loaded, the analysis' progress is stored within this instance of the shiny Progress class.
#' (shiny::Progress)
#' @return
#' Cleaned dataframe.
#' (tibble:tibble)
handleImputationSites = function(data_df = "tbl_df", progress = "Progress"){
if(is.na(self$imputationAgent)){
print("Warning: Error in pgu.imputation imputationAgent is not valid. Will be set to none.")
self$setimputationAgent <- "none"
}#if
private$.success <- FALSE
data_df <- self$insertImputationSites(data_df)
cleanedData <- data_df
tryCatch({
cleanedData <- switch((self$imputationAgent),
"none" = data_df,
"median" = self$imputeByMedian(data_df, progress),
"mean" = self$imputeByMean(data_df, progress),
"mu" = self$imputeByExpectationValue(data_df, progress),
"mc" = self$imputeByMC(data_df, progress),
"knn" = self$imputeByKnn(data_df, progress),
"pmm" = self$imputeByMice(data_df, progress),
"cart" = self$imputeByMice(data_df, progress),
"rf" = self$imputeByMice(data_df, progress),
"M5P" = self$imputeByM5P(data_df, progress)
)
private$.success <- TRUE
},
error = function(e) {
private$.success <- FALSE
errorMesage <- sprintf("\nError in pgu.imputation during handleImputationSites routine:\n%s", e)
cat(errorMesage)
}#error
)#tryCatch
colnames(cleanedData) <- colnames(data_df)
return(cleanedData)
}, #function
#' @description
#' Substitutes imputation sites by the median of the respective attribute.
#' Returns the cleaned dataframe.
#' Display the progress if shiny is loaded.
#' @param data_df
#' The data frame to be analyzed.
#' (tibble::tibble)
#' @param progress
#' If shiny is loaded, the analysis' progress is stored in this instance of the shiny Progress class.
#' (shiny::Progress)
#' @return
#' Cleaned dataframe.
#' (tibble:tibble)
imputeByMedian = function(data_df = "tbl_df", progress = "Progress"){
for (feature in self$imputationStatistics[["feature"]]){
if(("shiny" %in% (.packages())) & (class(progress)[1] == "Progress")){
progress$inc(1.0/ncol(data_df))
}#if
indices <- self$imputationSiteIdxByFeature(feature)
data_df <- data_df %>%
dplyr::mutate(!!feature := replace(!!as.name(feature),
indices,
stats::median(!!as.name(feature), na.rm = TRUE)))
}#for
return(data_df)
}, #function
#' @description
#' Substitutes imputation sites by the aritmertic mean of the respective attribute.
#' Returns the cleaned dataframe.
#' Display the progress if shiny is loaded.
#' @param data_df
#' The data frame to be analyzed.
#' (tibble::tibble)
#' @param progress
#' If shiny is loaded, the analysis' progress is stored in this instance of the shiny Progress class.
#' (shiny::Progress)
#' @return
#' Cleaned dataframe.
#' (tibble:tibble)
imputeByMean = function(data_df = "tbl_df", progress = "Progress"){
for (feature in self$imputationStatistics[["feature"]]){
if(("shiny" %in% (.packages())) & (class(progress)[1] == "Progress")){
progress$inc(1.0/ncol(data_df))
}#if
indices <- self$imputationSiteIdxByFeature(feature)
data_df <- data_df %>%
dplyr::mutate(!!feature := replace(!!as.name(feature),
indices,
mean(!!as.name(feature), na.rm = TRUE)))
}#for
return(data_df)
}, #function
#' @description
#' Substitutes imputation sites by the expectation value of the respective attribute.
#' Returns the cleaned dataframe.
#' Display the progress if shiny is loaded.
#' @param data_df
#' The data frame to be analyzed.
#' (tibble::tibble)
#' @param progress
#' If shiny is loaded, the analysis' progress is stored in this instance of the shiny Progress class.
#' (shiny::Progress)
#' @return
#' Cleaned dataframe.
#' (tibble:tibble)
imputeByExpectationValue = function(data_df = "tbl_df", progress = "Progress"){
for (feature in self$imputationStatistics[["feature"]]){
if(("shiny" %in% (.packages())) & (class(progress)[1] == "Progress")){
progress$inc(1.0/ncol(data_df))
}#if
tryCatch({
fit_obj <- data_df %>%
dplyr::select(feature) %>%
tidyr::drop_na() %>%
dplyr::pull(feature) %>%
as.double() %>%
MASS::fitdistr("normal")
mu <- fit_obj$estimate["mean"] %>%
as.numeric()
}, error = function(e) {
error_string <- sprintf("\nWarning in pgu.imputation$imputeByExpectationValue: Could not determine expectation value of feature %s. Used mean value instead.\n", feature)
cat(error_string)
mu <- data_df %>%
dplyr::select(feature) %>%
tidyr::drop_na() %>%
dplyr::pull(feature) %>%
as.double() %>%
mean()
}
)
indices <- self$imputationSiteIdxByFeature(feature)
data_df <- data_df %>%
dplyr::mutate(!!feature := replace(!!as.name(feature),
indices,
mu))
}#for
return(data_df)
}, #function
#' @description
#' Substitutes imputation sites by values generated by a monte carlo simulation.
#' The procedure runs several times as defined by the instance variable `iterations`.
#' The run with the best result is identified and used for substitution.
#' Returns the cleaned dataframe.
#' Display the progress if shiny is loaded.
#' @param data_df
#' The data frame to be analyzed.
#' (tibble::tibble)
#' @param progress
#' If shiny is loaded, the analysis' progress is stored in this instance of the shiny Progress class.
#' (shiny::Progress)
#' @return
#' Cleaned dataframe.
#' (tibble:tibble)
imputeByMC = function(data_df = "tbl_df", progress = "Progress"){
imputed_df <- data_df
# Calculate Errors
stats0_mat <- data_df %>%
psych::describe(na.rm=TRUE) %>%
as.matrix()
stats_mat_list = list()
diff_mat_list = list()
for (i in seq(from=1, to=self$iterations, by=1)){
if(("shiny" %in% (.packages())) & (class(progress)[1] == "Progress")){
progress$inc(1.0/self$iterations)
}#if
imputed_df <- data_df
set.seed(self$seed + i)
for(feature in colnames(data_df)){
mu <- data_df %>%
dplyr::select(feature) %>%
tidyr::drop_na() %>%
dplyr::pull(feature) %>%
as.double() %>%
mean()
sigma <- data_df %>%
dplyr::select(feature) %>%
tidyr::drop_na() %>%
dplyr::pull(feature) %>%
as.double() %>%
sd()
indices <- self$imputationSiteIdxByFeature(feature)
mcVal <- stats::rnorm(n = length(indices),
mean = mu,
sd = sigma)
imputed_df <- imputed_df %>%
dplyr::mutate(!!feature := replace(!!as.name(feature),
indices,
mcVal))
}#for
stats_mat_list[[i]] <- imputed_df %>%
psych::describe(na.rm=TRUE) %>%
as.matrix()
diff_mat_list[[i]] <- stats0_mat - stats_mat_list[[i]]
}# for
# Calculate Ranks
cumulative_diff_df <- tibble::tibble(statistics = colnames(diff_mat_list[[1]]))
for(i in seq(1, self$iterations)){
diff_sum <- rep(0.0, times=13)
for(j in seq(1, ncol(data_df))){
diff_sum <- diff_sum + (diff_mat_list[[i]][j,])^2
}
feature_name <- sprintf("iter_%i", i)
cumulative_diff_df <- cumulative_diff_df %>%
dplyr::mutate(!!feature_name := sqrt(diff_sum))
}
ranks_mat <- cumulative_diff_df %>%
dplyr::select(-c("statistics")) %>%
apply(MARGIN = 1, FUN = function(x)rank(x, ties.method = "min"))
# determine optimal seed
seed_additive <- ranks_mat %>%
rowSums() %>%
which.min() %>%
as.integer()
# calculate optimized imputation
imputed_df <- data_df
set.seed(self$seed + seed_additive)
for(feature in colnames(data_df)){
mu <- data_df %>%
dplyr::select(feature) %>%
tidyr::drop_na() %>%
dplyr::pull(feature) %>%
as.double() %>%
mean()
sigma <- data_df %>%
dplyr::select(feature) %>%
tidyr::drop_na() %>%
dplyr::pull(feature) %>%
as.double() %>%
sd()
indices <- self$imputationSiteIdxByFeature(feature)
mcVal <- stats::rnorm(n = length(indices),
mean = mu,
sd = sigma)
imputed_df <- imputed_df %>%
dplyr::mutate(!!feature := replace(!!as.name(feature),
indices,
mcVal))
}#for
return(imputed_df)
}, #function
#' @description
#' Substitutes imputation sites by predictions of a KNN analysis of the whole dataframe.
#' Returns the cleaned dataframe.
#' Display the progress if shiny is loaded.
#' @param data_df
#' The data frame to be analyzed.
#' (tibble::tibble)
#' @param progress
#' If shiny is loaded, the analysis' progress is stored in this instance of the shiny Progress class.
#' (shiny::Progress)
#' @return
#' Cleaned dataframe.
#' (tibble:tibble)
imputeByKnn = function(data_df = "tbl_df", progress = "Progress"){
if(("shiny" %in% (.packages())) & (class(progress)[1] == "Progress")){
progress$inc(0.5)
}#if
if(!ncol(data_df) > 2){
e <- simpleError("The number of features needs to be larger than 2.")
stop(e)
}#if
if (nrow(data_df) < self$nNeighbors + 1){
self$setNNeighbors <- nrow(data_df) - 1
sprintf("\nWarning in pgu.imputation$imputeByKnn: nNeighbors set to: %i\n", self$nNeighbors) %>%
cat()
}#if
data_df %>%
as.data.frame() %>%
print()
data_df %>%
as.data.frame() %>%
pguIMP::knnImputation(k=self$nNeighbors,
scale = TRUE,
meth = "weighAvg",
distData = NULL
) %>%
tibble::as_tibble() %>%
return()
}, #function
#' @description
#' Substitutes imputation sites by values generated by a different methods of the mice package.
#' The procedure runs several times as defined by the instance variable `iterations`.
#' The run with the best result is identified and used for substitution.
#' Returns the cleaned dataframe.
#' Display the progress if shiny is loaded.
#' @param data_df
#' The data frame to be analyzed.
#' (tibble::tibble)
#' @param progress
#' If shiny is loaded, the analysis' progress is stored in this instance of the shiny Progress class.
#' (shiny::Progress)
#' @return
#' Cleaned dataframe.
#' (tibble:tibble)
imputeByMice = function(data_df, progress = "Progress") {
nPred <- trunc(ncol(data_df) * self$pred_frac)
if (ncol(data_df) < nPred){
e <- simpleError("nPred needs to be smaller that the number of features.")
stop(e)
}#if
if(ncol(data_df) < 2){
e <- simpleError("The number of features needs to be larger than 2.")
stop(e)
}#if
# determine predictor matrix
self$detectPredictors(data_df)
# Calculate Errors for iterations
stats0_mat <- data_df %>%
psych::describe(na.rm=TRUE) %>%
as.matrix()
stats_mat_list = list()
diff_mat_list = list()
for (i in seq(1, self$iterations)){
if(("shiny" %in% (.packages())) & (class(progress)[1] == "Progress")){
progress$inc(1.0/(self$iterations))
}#if
mice_model <- data_df %>%
mice::mice(method = self$imputationAgent,
pred = self$pred_mat,
seed = self$seed + i,
printFlag = FALSE)
stats_mat_list[[i]] <- mice_model %>%
mice::complete() %>%
psych::describe(na.rm=TRUE) %>%
as.matrix()
diff_mat_list[[i]] <- stats0_mat - stats_mat_list[[i]]
}
# Calculate Ranks
cumulative_diff_df <- tibble::tibble(statistics = colnames(diff_mat_list[[1]]))
for(i in seq(1, self$iterations)){
diff_sum <- rep(0.0, times=13)
for(j in seq(1, ncol(data_df))){
diff_sum <- diff_sum + (diff_mat_list[[i]][j,])^2
}
feature_name <- sprintf("iter_%i", i)
cumulative_diff_df <- cumulative_diff_df %>%
dplyr::mutate(!!feature_name := sqrt(diff_sum))
}
ranks_mat <- cumulative_diff_df %>%
dplyr::select(-c("statistics")) %>%
apply(MARGIN = 1, FUN = function(x)rank(x, ties.method = "min"))
# determine optimal seed
seed_additive <- ranks_mat %>%
rowSums() %>%
which.min() %>%
as.integer()
# impute with optimal seed and return imputed data
mice_model <- data_df %>%
mice::mice(method = self$imputationAgent,
pred = self$pred_mat,
seed = self$seed + seed_additive,
printFlag = FALSE)
mice_model %>%
mice::complete() %>%
tibble::as_tibble() %>%
return()
}, #function
#' @description
#' Substitutes imputation sites by predictions of a M5P tree trained on the whole dataframe.
#' Returns the cleaned dataframe.
#' Display the progress if shiny is loaded.
#' @param data_df
#' The data frame to be analyzed.
#' (tibble::tibble)
#' @param progress
#' If shiny is loaded, the analysis' progress is stored in this instance of the shiny Progress class.
#' (shiny::Progress)
#' @return
#' Cleaned dataframe.
#' (tibble:tibble)
imputeByM5P = function(data_df = "tbl_df", progress = "Progress"){
nPred <- trunc(ncol(data_df) * self$pred_frac)
if (ncol(data_df) < nPred){
e <- simpleError("nPred needs to be smaller that the number of features.")
stop(e)
}#if
if(ncol(data_df) < 2){
e <- simpleError("The number of features needs to be larger than 2.")
stop(e)
}#if
imputed_df <- data_df
# determine predictor matrix
self$detectPredictors(data_df)
for(feature in colnames(data_df)){
if(("shiny" %in% (.packages())) & (class(progress)[1] == "Progress")){
progress$inc(1.0/ncol(data_df))
}
# get imputation candidates
na_idx <- data_df %>%
dplyr::pull(feature) %>%
as.numeric() %>%
is.na() %>%
which()
if((length(na_idx)<1) | length(na_idx) == nrow(data_df)){
next
}#if
# select valid predictors
predictor_idx <- self$pred_mat[feature,] %>%
as.logical()
predictor_names <- colnames(self$pred_mat)[predictor_idx]
#split in train and prediction data
train_df <- data_df %>%
dplyr::select(dplyr::all_of(c(feature, predictor_names))) %>%
dplyr::slice(-na_idx)
na_df <- data_df %>%
dplyr::select(dplyr::all_of(c(feature, predictor_names))) %>%
dplyr::slice(na_idx)
# if predictor selection fails, take all features as predictors
if(ncol(na_df) <2){
train_df <- data_df %>%
dplyr::slice(-na_idx)
na_df <- data_df %>%
dplyr::slice(na_idx)
}
m5p_model <- sprintf("%s ~ .", feature) %>%
stats::as.formula() %>%
RWeka::M5P(data=train_df)
imputed_values <- predict(m5p_model, newdata = na_df)
for (i in 1:length(na_idx)){
imputed_df[[na_idx[i], feature]] <- imputed_values[i]
}#for
}#for
return(imputed_df)
# data_col_names <- colnames(data_df)
# colnames(data_df) <- paste0("F", seq(1:ncol(data_df))) %>%
# as.character()
# imputed_df <- data_df
# for (i in 1:length(colnames(data_df))) {
# if(("shiny" %in% (.packages())) & (class(progress)[1] == "Progress")){
# progress$inc(1.0/ncol(data_df))
# }#if
#
# na_idx <- self$imputationSiteIdxByFeature(featureName = data_col_names[i])
#
# if((length(na_idx)<1) | length(na_idx) == nrow(data_df)){
# next
# }#if
# train_df <- data_df %>%
# dplyr::slice(-na_idx)
#
# na_df <- data_df %>%
# dplyr::slice(na_idx)
#
# m5 <- colnames(data_df)[i] %>%
# paste("~.") %>%
# as.formula() %>%
# RWeka::M5P(data = train_df)
#
# na_values <- predict(m5, newdata = na_df)
#
# for (j in 1:length(na_idx)){
# imputed_df[[na_idx[j], colnames(data_df)[i]]] <- na_values[j]
# }#for
# }#for
# colnames(imputed_df) <- data_col_names
# return(imputed_df)
}, #functions
##################
# plot functions #
##################
#' @description
#' Displays the distribution of missing values in form of a heatmap.
#' @return
#' A heatmap plot.
#' (ggplot2::ggplot)
imputationSiteHeatMap = function(){
p <- plot(self$amv,
col=c('navyblue','red'),
numbers=TRUE,
sortVars=TRUE,
# labels=self$imputationStatistics[["feature"]],
cex.axis=.7,
gap=3,
main = "Histogram of imputation sites",
ylab=c("fraction","fraction"))
return(p)
}, #function
#' @description
#' Displays the distribution of an attribute values as histogram.
#' @param data_df
#' dataframe to be analyzed.
#' (tibble::tibble)
#' @param feature
#' attribute to be shown.
#' (character)
#' @return
#' A histogram.
#' (ggplot2::ggplot)
featureBarPlot = function(data_df = "tbl_df", feature = "character"){
feature <- dplyr::sym(feature)
p <- data_df %>%
ggplot2::ggplot(mapping = ggplot2::aes_string(x=feature), na.rm=TRUE) +
ggplot2::geom_bar(stat = "bin", bins = 30) +
ggplot2::ylab("counts") +
ggplot2::xlab("value") +
ggplot2::theme_linedraw() +
ggplot2::theme(
panel.background = ggplot2::element_rect(fill = "transparent"), # bg of the panel
plot.background = ggplot2::element_rect(fill = "transparent", color = NA), # bg of the plot
legend.background = ggplot2::element_rect(fill = "transparent"),
legend.key = ggplot2::element_rect(fill = "transparent")
)
return(p)
}, #function
#' @description
#' Displays the distribution of an attribute's values as box plot.
#' @param data_df
#' dataframe to be analyzed.
#' (tibble::tibble)
#' @param feature
#' attribute to be shown.
#' (character)
#' @return
#' A box plot.
#' (ggplot2::ggplot)
featureBoxPlotWithSubset = function(data_df = "tbl_df", feature = "character"){
imputation_idx <- self$imputationSites %>%
dplyr::filter(feature == !!feature) %>%
dplyr::select(idx) %>%
dplyr::pull()
data_type <- rep("regular", nrow(data_df))
data_type[imputation_idx] <- "imputed"
# nanFeature <- self$nanFeatureList(data_df)
p <- data_df %>%
dplyr::select(feature) %>%
dplyr::mutate(type = data_type ) %>%
# dplyr::mutate(nanFeature = nanFeature) %>%
tidyr::gather_(key="feature", value="measurement", feature) %>%
ggplot2::ggplot(mapping=ggplot2::aes_string(x="feature",y="measurement"), na.rm=TRUE)+
ggplot2::geom_boxplot(na.rm=TRUE, outlier.shape = NA)+
ggplot2::geom_jitter(ggplot2::aes(colour=type), na.rm=TRUE) +
ggplot2::ylab("value") +
ggplot2::xlab("feature") +
ggplot2::theme_linedraw() +
ggplot2::theme(
panel.background = ggplot2::element_rect(fill = "transparent"), # bg of the panel
plot.background = ggplot2::element_rect(fill = "transparent", color = NA), # bg of the plot
legend.background = ggplot2::element_rect(fill = "transparent"),
legend.key = ggplot2::element_rect(fill = "transparent")
)
return(p)
}, #function
#' @description
#' Displays the distribution of an attribute's values as a composition of a box plot and a histogram.
#' @param data_df
#' dataframe to be analyzed.
#' (tibble::tibble)
#' @param feature
#' attribute to be shown.
#' (character)
#' @return
#' A composite plot.
#' (ggplot2::ggplot)
featurePlot = function(data_df = "tbl_df", feature = "character"){
p1 <- self$featureBoxPlotWithSubset(data_df, feature) +
ggplot2::theme(legend.position = c(0.9, 0.9),
legend.key = ggplot2::element_blank(),
legend.background = ggplot2::element_blank())
# limits1 <- ggplot2::layer_scales(p1)$y$range$range
p2 <- self$featureBarPlot(data_df, feature)
limits <- ggplot2::layer_scales(p2)$x$range$range
# limits <- c(min(c(limits1[1], limits2[1])),
# max(c(limits1[2], limits2[2]))
# )
p1 <- p1 +
ggplot2::scale_y_continuous(limits=limits)
p2 <- p2 +
ggplot2::scale_x_continuous(position = "top") +
ggplot2::coord_flip()
# p <- gridExtra::grid.arrange(p1,p2, layout_matrix = rbind(c(1,2),c(1,2)))
p <- gridExtra::grid.arrange(p1,p2, layout_matrix = rbind(c(1,1,2),c(1,1,2)),
top = textGrob(label = sprintf("Distribution of %s", feature)))
return(p)
},#function
#' @description
#' Displays an influx/outflux plot
#' @return
#' A composite plot.
#' (ggplot2::ggplot)
fluxPlot = function(){
p <- self$flux_df %>%
ggplot2::ggplot(mapping = ggplot2::aes_string(x="influx", y="outflux", label="rowname")) +
ggplot2::geom_point()+
ggplot2::geom_text(mapping = ggplot2::aes(label=ifelse(outflux<self$outflux_thr,as.character(rowname),'')),hjust=0,vjust=0) +
ggplot2::geom_hline(mapping = ggplot2::aes(yintercept = self$outflux_thr, linetype = "threshold")) +
ggplot2::scale_linetype_manual(values = c("threshold" = "dashed")) +
ggplot2::xlim(0,1) +
ggplot2::ylim(0,1) +
ggplot2::ggtitle("Flux plot") +
ggplot2::xlab("influx") +
ggplot2::ylab("outflux") +
ggplot2::theme_linedraw() +
ggplot2::theme(
panel.background = ggplot2::element_rect(fill = "transparent"), # bg of the panel
plot.background = ggplot2::element_rect(fill = "transparent", color = NA), # bg of the plot
legend.background = ggplot2::element_rect(fill = "transparent"),
legend.key = ggplot2::element_rect(fill = "transparent")
)
# ggthemes::geom_rangeframe(size = 1, x=ggplot2::xlim(0,1), y=c(0, 1)) +
# ggthemes::theme_tufte()
# ggplot2::theme(legend.position=c(.9,.75))
return(p)
}#function
)#public
)#class
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.