#' @title pgu.validator
#'
#' @description
#' Validates that the distribution is not significantly altered by the imputation process.
#' This object is used by the shiny based gui and is not for use in individual R-scripts!
#'
#' @details
#' Takes two distributions (before and after imputation).
#' Performs a Wilcoxon-Mann-Whitney U test.
#' Performs a Kolmogorow-Smirnow test.
#'
#' @format [R6::R6Class] object.
#'
#' @importFrom DataVisualizations ParetoDensityEstimation
#' @importFrom dplyr all_of filter mutate pull rename right_join select select_if
#' @importFrom e1071 kurtosis skewness
#' @importFrom ggplot2 aes aes_string element_rect geom_abline geom_boxplot geom_hline geom_jitter
#' @importFrom ggplot2 geom_line geom_point ggplot ggtitle theme theme_linedraw
#' @importFrom ggplot2 xlab ylab
#' @importFrom grid gpar textGrob
#' @importFrom gridExtra grid.arrange
#' @importFrom magrittr %>%
#' @importFrom purrr discard
#' @importFrom R6 R6Class
#' @importFrom shiny Progress
#' @importFrom stats ks.test wilcox.test
#' @importFrom tibble add_row as_tibble tibble
#' @importFrom tidyr drop_na gather
#'
#' @author Sebastian Malkusch, \email{malkusch@@med.uni-frankfurt.de}
#'
#' @export
#'
pgu.validator <- R6::R6Class("pgu.validator",
####################
# instance variables
####################
private = list(
.testStatistics_df = "tbl_df",
.centralMoments_org = "tbl_df",
.centralMoments_imp = "tbl_df",
.centralMoments_delta = "tbl_df",
.features = "character",
.seed = "integer"
), #private
##################
# accessor methods
##################
active = list(
#' @field testStatistics_df
#' Returns the instance variable `testStatistics_df`.
#' (tibble::tibble)
testStatistics_df = function(){
return(private$.testStatistics_df)
},
#' @field centralMoments_org
#' Returns the instance variable `centralMoments_org`
#' (tibble::tibble)
centralMoments_org = function(){
return(private$.centralMoments_org)
},
#' @field centralMoments_imp
#' Returns the instance variable `centralMoments_imp`
#' (tibble::tibble)
centralMoments_imp = function(){
return(private$.centralMoments_imp)
},
#' @field centralMoments_delta
#' Returns the instance variable `centralMoments_delta`
#' (tibble::tibble)
centralMoments_delta = function(){
return(private$.centralMoments_delta)
},
#' @field features
#' Returns the instance variable `features`
#' (character)
features = function(){
return(private$.features)
},
#' @field seed
#' Returns the instance variable seed
#' (integer)
seed = function(){
return(private$.seed)
},
#' @field setSeed
#' Sets the instance variable seed.
#' (numeric)
setSeed = function(value = "numeric"){
private$.seed <- value
}
),#active
###################
# memory management
###################
public = list(
#' @description
#' Creates and returns a new `pgu.validator` object.
#' @param seed
#' Set the instance variable `seed`.
#' (integer)
#' @return
#' A new `pgu.validator` object.
#' (pguIMP::pgu.validator)
initialize = function(seed = 42){
self$setSeed <- seed
self$resetValidator()
},
#' @description
#' Clears the heap and
#' indicates that instance of `pgu.validator` is removed from heap.
finalize = function(){
print("Instance of pgu.validator removed from heap")
},
##########################
# print instance variables
##########################
#' @description
#' Prints instance variables of a `pgu.validator` object.
#' @return
#' string
print = function(){
rString <- sprintf("\npgu.validator:\n")
cat(rString)
print(self$features)
print(self$testStatistics_df)
print(self$centralMoments_org)
print(self$centralMoments_imp)
print(self$centralMoments_delta)
cat("\n\n")
invisible(self)
}, #function
####################
# public functions #
####################
#' @description
#' Resets instance variables
resetValidator = function(){
private$.testStatistics_df <- tibble::tibble(feature = character(0),
d.Kolmogorow = numeric(0),
p.Kolmogorow = numeric(0),
w.Wilcoxon = numeric(0),
p.Wilcoxon = numeric(0))
private$.centralMoments_org <- tibble::tibble(feature = character(0),
m1.mean = numeric(0),
m2.variance = numeric(0),
m3.skewness = numeric(0),
m4.kurtosis = numeric(0))
private$.centralMoments_imp <- tibble::tibble(feature = character(0),
m1.mean = numeric(0),
m2.variance = numeric(0),
m3.skewness = numeric(0),
m4.kurtosis = numeric(0))
private$.centralMoments_delta <- tibble::tibble(feature = character(0),
d1.mean = numeric(0),
d2.variance = numeric(0),
d3.skewness = numeric(0),
d4.kurtosis = numeric(0))
private$.features <- character(0)
}, #resetValidator
#' @description
#' Performs a comparison between the original and the imputated distribution of a given feature
#' using a two-sided Kolmorogow-Smirnow test with simulated p-vaue distribution.
#' @param org
#' Original data to be analzed.
#' (numeric)
#' @param imp
#' Imputed data to be analyzed.
#' (numeric)
#' @param feature
#' Feature name of the analyzed distributions.
#' (character)
#' @return
#' One row dataframe comprising the test results.
#' (tibble::tibble)
kolmogorowTestFeature = function(org = "numeric", imp = "numeric", feature = "character"){
test_obj <- stats::ks.test(x = org,
y = imp,
alternative = "two.sided",
simulate.p.value=TRUE,
B=2000,
exact = FALSE)
tibble::tibble(feature = c(feature),
d.Kolmogorow = c(test_obj$statistic),
p.Kolmogorow = c(test_obj$p.value)) %>%
return()
}, #kolmogorovTestFeature
#' @description
#' Performs a comparison between the original and the imputated distribution of a given feature
#' using a two-sided Wilcoxon/Mann-Whitney test.
#' @param org
#' Original data to be analzed.
#' (numeric)
#' @param imp
#' Imputed data to be analyzed.
#' (numeric)
#' @param feature
#' Feature name of the analyzed distributions.
#' (character)
#' @return
#' One row dataframe comprising the test results.
#' (tibble::tibble)
wilcoxonTestFeature = function(org = "numeric", imp = "numeric", feature = "character"){
test_obj <- stats::wilcox.test(x = org,
y = imp,
alternative = "two.sided")
tibble::tibble(feature = c(feature),
w.Wilcoxon = c(test_obj$statistic),
p.Wilcoxon = c(test_obj$p.value)) %>%
return()
}, #wilcoxonTestFeature
#' @description
#' Estimates estimates the central moments of the given distribution.
#' @param values
#' Data to be analzed.
#' (numeric)
#' @param feature
#' Feature name of the analyzed distributions.
#' (character)
#' @return
#' One row dataframe comprising the statistics.
#' (tibble::tibble)
centralMomentsFeature = function(values = "numeric",feature = "character"){
cMoment_1 <- mean(values, na.rm = TRUE)
cMoment_2 <- sd(values, na.rm = TRUE)^2
cMoment_3 <- e1071::skewness(values, na.rm = TRUE)
cMoment_4 <- e1071::kurtosis(values, na.rm = TRUE)
tibble::tibble(feature = feature,
m1.mean = c(cMoment_1),
m2.variance = c(cMoment_2),
m3.skewness = c(cMoment_3),
m4.kurtosis = c(cMoment_4)) %>%
return()
}, #statisticsFeature
#' @description
#' Validates the feature distributions of the original and the imputated dataframe``
#' using a two-sided Kolmorogow-Smirnow test and a two-sided Wilcoxon/Mann-Whitney test.
#' The result is stored in the instance variables `testStatistics_df`and `distributionStatistics_df`.
#' Displays the progress if shiny is loaded.
#' @param org_df
#' Original dataframe to be analzed.
#' (tibble::tibble)
#' @param imp_df
#' Imputed dataframe to be analyzed.
#' (tibble::tibble)
#' @param progress
#' If shiny is loaded, the analysis' progress is stored in this instance of the shiny Progress class.
#' (shiny::Progress)
validate = function(org_df = "tbl_df", imp_df = "tbl_df", progress = "Progress"){
self$resetValidator()
set.seed(self$seed)
private$.features <- imp_df %>%
dplyr::select_if(is.numeric) %>%
colnames()
for(feature in self$features){
if(("shiny" %in% (.packages())) & (class(progress)[1] == "Progress")){
progress$inc(1.0/length(self$features))
}#if
org_temp <- org_df %>%
dplyr::select(feature) %>%
tidyr::drop_na() %>%
dplyr::pull(feature)
imp_temp <- imp_df %>%
dplyr::select(feature) %>%
tidyr::drop_na() %>%
dplyr::pull(feature)
kolmogorow_df <- self$kolmogorowTestFeature(org_temp, imp_temp, feature)
wilcoxon_df <- self$wilcoxonTestFeature(org_temp, imp_temp, feature)
test_df <- kolmogorow_df %>%
dplyr::right_join(wilcoxon_df, by = "feature")
private$.testStatistics_df <- self$testStatistics_df %>%
tibble::add_row(test_df)
private$.centralMoments_org <- private$.centralMoments_org %>%
tibble::add_row(self$centralMomentsFeature(org_temp, feature))
private$.centralMoments_imp <- private$.centralMoments_imp %>%
tibble::add_row(self$centralMomentsFeature(imp_temp, feature))
}#for
private$.centralMoments_delta <- tibble::tibble(feature = self$features,
d1.mean = abs(self$centralMoments_org[["m1.mean"]] - self$centralMoments_imp[["m1.mean"]]),
d2.variance = abs(self$centralMoments_org[["m2.variance"]] - self$centralMoments_imp[["m2.variance"]]),
d3.skewness = abs(self$centralMoments_org[["m3.skewness"]] - self$centralMoments_imp[["m3.skewness"]]),
d4.kurtosis = abs(self$centralMoments_org[["m4.kurtosis"]] - self$centralMoments_imp[["m4.kurtosis"]]))
}, #validate
#
# ####################
# # output functions #
# ####################
# @description
# Receives a dataframe and plost the feature 'x' against the features 'org_pdf' and 'imp_pdf'.
# Returns the plot
# @param data_df
# dataframe to be plotted
# (tibble::tibble)
# @return
# A ggplot2 object
# (ggplot2::ggplot)
# featurePdf = function(data_df = "tbl_df"){
# title_str <- sprintf("probability density")
# p <- data_df %>%
# dplyr::select(c("x", "org_pdf", "imp_pdf")) %>%
# dplyr::rename(original = org_pdf) %>%
# dplyr::rename(imputed = imp_pdf) %>%
# tidyr::gather(key = "type", value = "pdf", -x, na.rm = TRUE) %>%
# ggplot2::ggplot() +
# ggplot2::geom_line(mapping = ggplot2::aes_string(x="x", y="pdf", color="type", linetype = "type"), size = 1.5) +
# ggplot2::ggtitle(title_str) +
# ggplot2::xlab("value")
# return(p)
# }, #featurePdf
#' @description
#' Receives a dataframe and plots the pareto density of the features 'org_pdf' and 'imp_pdf'.
#' Returns the plot
#' @param data_df
#' dataframe to be plotted
#' (tibble::tibble)
#' @return
#' A ggplot2 object
#' (ggplot2::ggplot)
featurePdf = function(data_df = "tbl_df"){
p <- data_df %>%
ggplot2::ggplot() +
ggplot2::geom_line(mapping = ggplot2::aes_string(x="value", y = "pde", color = "data", linetype = "data"), size = 1.5) +
ggplot2::ylab("pdf") +
ggplot2::theme_linedraw() +
ggplot2::theme(
panel.background = ggplot2::element_rect(fill = "transparent"), # bg of the panel
plot.background = ggplot2::element_rect(fill = "transparent", color = NA), # bg of the plot
legend.background = ggplot2::element_rect(fill = "transparent"),
legend.key = ggplot2::element_rect(fill = "transparent")
)
return(p)
}, #featurePdf
#' @description
#' Receives a dataframe and plost the feature 'x' against the features 'org_cdf' and 'imp_cdf'.
#' Returns the plot
#' @param data_df
#' dataframe to be plotted
#' (tibble::tibble)
#' @return
#' A ggplot2 object
#' (ggplot2::ggplot)
featureCdf = function(data_df = "tbl_df"){
title_str <- sprintf("cumulative density")
p <- data_df %>%
dplyr::select(c("x", "org_cdf", "imp_cdf")) %>%
dplyr::rename(original = org_cdf) %>%
dplyr::rename(imputed = imp_cdf) %>%
tidyr::gather(key = "type", value = "cdf", -x, na.rm = TRUE) %>%
ggplot2::ggplot() +
ggplot2::geom_line(mapping = ggplot2::aes_string(x="x", y="cdf", color="type", linetype = "type"), size = 1.5) +
ggplot2::ggtitle(title_str) +
ggplot2::xlab("value") +
ggplot2::theme_linedraw() +
ggplot2::theme(
panel.background = ggplot2::element_rect(fill = "transparent"), # bg of the panel
plot.background = ggplot2::element_rect(fill = "transparent", color = NA), # bg of the plot
legend.background = ggplot2::element_rect(fill = "transparent"),
legend.key = ggplot2::element_rect(fill = "transparent")
)
return(p)
}, #featureCdf
#' @description
#' Receives two numeric vectors 'org' and 'imp'. Plots the qq-plot of both vectors.
#' Returns the plot
#' @param org
#' Numric vector comprising the original data.
#' (numeric)
#' @param imp
#' Numeric vector comprising the imputed data.
#' (numeric)
#' @return
#' A ggplot2 object
#' (ggplot2::ggplot)
featureVs = function(org = "numeric", imp = "numeric"){
min_val <- min(min(org), min(imp))
max_val <- max(max(org), max(imp))
p <- tibble::as_tibble(qqplot(org, imp, plot.it=FALSE)) %>%
ggplot2::ggplot() +
ggplot2::geom_point(mapping = ggplot2::aes_string(x="x", y="y")) +
ggplot2::geom_abline(intercept = 0, slope = 1) +
ggplot2::ggtitle("Imputation scatter plot") +
ggplot2::xlab("orgiginal") +
ggplot2::ylab("imputed") +
ggplot2::theme_linedraw() +
ggplot2::theme(
panel.background = ggplot2::element_rect(fill = "transparent"), # bg of the panel
plot.background = ggplot2::element_rect(fill = "transparent", color = NA), # bg of the plot
legend.background = ggplot2::element_rect(fill = "transparent"),
legend.key = ggplot2::element_rect(fill = "transparent")
)
return(p)
}, #featureQQ
#' @description
#' Receives a dataframe and information about the lloq and uloq and retuns a boxplot
#' @param data_df
#' Dataframe to be analyzed
#' (tibble::tibble)
#' @param lloq
#' lower limit of quantification
#' (numeric)
#' @param uloq
#' upper limit of quantification
#' (numeric)
#' @param feature
#' Feature name
#' (character)
#' @return
#' A ggplot2 object
#' (ggplot2::ggplot)
featureBoxPlot = function(data_df = "tbl_df", lloq = "numeric", uloq = "numeric", feature = "character"){
p <- data_df %>%
tidyr::gather(key = !!feature, value="value", -type) %>%
ggplot2::ggplot(mapping=ggplot2::aes_string(x=feature,y="value"), na.rm=TRUE)+
ggplot2::geom_boxplot(na.rm=TRUE, outlier.shape = NA)+
ggplot2::geom_jitter(ggplot2::aes(color = type), na.rm=TRUE) +
ggplot2::geom_hline(yintercept=lloq, linetype="dashed", na.rm = TRUE) +
ggplot2::geom_hline(yintercept=uloq, linetype="dashed", na.rm = TRUE) +
ggplot2::theme_linedraw() +
ggplot2::theme(
panel.background = ggplot2::element_rect(fill = "transparent"), # bg of the panel
plot.background = ggplot2::element_rect(fill = "transparent", color = NA), # bg of the plot
legend.background = ggplot2::element_rect(fill = "transparent"),
legend.key = ggplot2::element_rect(fill = "transparent")
)
return(p)
}, #featureBoxPlot
#' @description
#' Receives two numeric dataframes 'org_df' and 'imp_df', and a feature name.
#' Creates a compund plot of the validation results for the given feature..
#' Returns the plot
#' @param org_df
#' Dataframe comprising the original data.
#' (tibble::tibble)
#' @param imp_df
#' Dataframe comprising the imputed data.
#' (tibble::tibble)
#' @param lloq
#' lower limit of quantification
#' (numeric)
#' @param uloq
#' upper limit of quantification
#' (numeric)
#' @param impIdx_df
#' dataframe comprising information about imputation sites
#' (tibble::tibble)
#' @param feature
#' Feature name.
#' (character)
#' @return
#' A ggplot2 object
#' (ggplot2::ggplot)
featurePlot = function(org_df = "tbl_df", imp_df = "tbl_df", lloq = "numeric", uloq = "numeric", impIdx_df = "tbl_df", feature = "character"){
p <- NULL
if((feature %in% colnames(org_df)) & (feature %in% colnames(imp_df)))
{
title_str <- sprintf("Imputation quality analysis of %s", feature)
org <- org_df %>%
dplyr::select(dplyr::all_of(feature)) %>%
# tidyr::drop_na() %>%
dplyr::pull(feature) %>%
as.numeric()
imp <- imp_df %>%
dplyr::select(dplyr::all_of(feature)) %>%
# tidyr::drop_na() %>%
dplyr::pull(feature) %>%
as.numeric()
org_hist <- org %>%
purrr::discard(is.na) %>%
hist(plot = FALSE)
org_min <- org_hist$breaks[1]
org_max <- org_hist$breaks[length(org_hist$breaks)]
imp_hist <- imp %>%
purrr::discard(is.na) %>%
hist(plot = FALSE)
imp_min <- imp_hist$breaks[1]
imp_max <- imp_hist$breaks[length(imp_hist$breaks)]
data_min <- min(c(org_min, imp_min))
data_max <- max(c(org_max, imp_max))
org_pdf_obj <- density(org, bw = "nrd0", adjust = 1, kernel = "gaussian",from = data_min, to = data_max, n=512, na.rm=TRUE)
imp_pdf_obj <- density(imp, bw = "nrd0", adjust = 1, kernel = "gaussian",from = data_min, to = data_max, n=512, na.rm=TRUE)
dist_df <- tibble::tibble(x = org_pdf_obj$x,
org_pdf = org_pdf_obj$y,
imp_pdf = imp_pdf_obj$y)
dist_df <- dist_df %>%
dplyr::mutate(org_cdf = cumsum(org_pdf)*abs(x[2] - x[1])) %>%
dplyr::mutate(imp_cdf = cumsum(imp_pdf)*abs(x[2] - x[1]))
pde_org <- org_df %>%
dplyr::select(dplyr::all_of(feature)) %>%
tidyr::drop_na() %>%
dplyr::pull(feature) %>%
DataVisualizations::ParetoDensityEstimation()
pde_imp <- imp_df %>%
dplyr::select(dplyr::all_of(feature)) %>%
tidyr::drop_na() %>%
dplyr::pull(feature) %>%
DataVisualizations::ParetoDensityEstimation()
pde_df <- tibble::tibble(value = c(pde_org$kernels, pde_imp$kernels),
pde = c(pde_org$paretoDensity, pde_imp$paretoDensity),
data = c(rep("original", times = length(pde_org$kernels)), rep("imputed", times = length(pde_imp$kernels))))
box_df <- tibble::tibble(original = org,
imputed = imp)
impIdx <- impIdx_df %>%
dplyr::filter(feature == !!feature) %>%
dplyr::pull(idx) %>%
as.integer()
type_vec <- rep("original", nrow(box_df))
type_vec[impIdx] <- "imputed"
box_df <- box_df %>%
dplyr::mutate(type = type_vec)
p1 <- self$featurePdf(pde_df) +
ggplot2::theme(legend.position = "none")
p2 <- self$featureBoxPlot(data_df = box_df, lloq = lloq, uloq = uloq, feature = feature) +
ggplot2::theme(legend.position = "none")
p3 <- self$featureCdf(dist_df) +
ggplot2::theme(legend.position = "top")
p4 <- self$featureVs(org, imp)
p <- gridExtra::grid.arrange(p1,p2,p3, p4,
layout_matrix = rbind(c(1,1,2,3),c(1,1,2,4)),
top = grid::textGrob(title_str, gp = grid::gpar(fontsize=20)))
}
return(p)
} #featurePlot
)#public
)# class
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.