#' @title pgu.missings
#'
#' @description
#' Detects and substitutes missing values from data set.
#'
#' @details
#' Detects missing values in the transformed and normalized data set.
#' This object is used by the shiny based gui and is not for use in individual R-scripts!
#'
#' @format [R6::R6Class] object.
#'
#' @importFrom dplyr any_vars bind_cols filter_all mutate select select_if transmute_all
#' @importFrom magrittr %>%
#' @importFrom mice md.pattern
#' @importFrom R6 R6Class
#' @importFrom tibble as_tibble is_tibble tibble
#' @importFrom VIM aggr
#'
#' @author Sebastian Malkusch, \email{malkusch@@med.uni-frankfurt.de}
#'
#' @export
#'
pgu.missings <- R6::R6Class("pgu.missings",
####################
# instance variables
####################
private = list(
.imputationParameter = "tbl_df",
.imputationSites = "tbl_df",
.one_hot_df = "tbl_df",
.amv = "ANY"
),
##################
# accessor methods
##################
active = list(
#' @field imputationParameter
#' Returns the instance variable outliersParameter.
#' (tibble::tibble)
imputationParameter = function(){
return(private$.imputationParameter)
},
#' @field imputationSites
#' Returns the instance variable imputationSites.
#' (tibble::tibble)
imputationSites = function(){
return(private$.imputationSites)
},
#' @field one_hot_df
#' Returns the positions of missings in one_hot encoding
#' (tibble::tibble)
one_hot_df = function(){
return(private$.one_hot_df)
},
#' @field amv
#' Returns the instance variable amv.
#' (numeric)
amv = function(){
return(private$.amv)
}
),
###################
# memory management
###################
public = list(
#' @description
#' Creates and returns a new `pgu.missings` object.
#' @param data_df
#' The data to be cleaned.
#' (tibble::tibble)
#' @return
#' A new `pgu.missings` object.
#' (pguIMP::pgu.missings)
initialize = function(data_df = "tbl_df"){
if(!tibble::is_tibble(data_df)){
data_df <- tibble::tibble(names <- "none",
values <- c(NA))
}
self$resetImputationParameter(data_df)
}, #function
#' @description
#' Clears the heap and
#' indicates that instance of `pgu.missings` is removed from heap.
finalize = function(){
print("Instance of pgu.missings removed from heap")
}, #function
##########################
# print instance variables
##########################
#' @description
#' Prints instance variables of a `pgu.missings` object.
#' @return
#' string
print = function(){
rString <- sprintf("\npgu.missings\n")
cat(rString)
print(self$imputationParameter)
print(self$imputationSites)
cat("\n\n")
invisible(self)
}, #function
####################
# public functions #
####################
#' @description
#' Resets instance variables and
#' identifies missings in the normalized data set.
#' @param data_df
#' Dataframe to be analyzed.
#' (tibble::tibble)
resetImputationParameter = function(data_df = "tbl_df"){
numericData <- data_df %>%
dplyr::select_if(is.numeric)
features <- numericData %>%
colnames()
measurements <- c(rep(0.0, length(features)))
existings <- c(rep(0.0, length(features)))
missings <- c(rep(0.0, length(features)))
fractionOfMissings <- c(rep(0.0, length(features)))
private$.imputationParameter <- tibble::tibble(features, measurements, existings, missings, fractionOfMissings)
private$.amv <- VIM::aggr(numericData, plot=FALSE)
self$gatherImputationStatistics(data_df)
self$detectImputationSites(data_df)
self$one_hot(data_df)
}, #function
####################
# helper functions #
####################
#' @description
#' Returns the position of an attribute within a data frame.
#' @param feature
#' The attribute's name.
#' (character)
#' @return
#' The postion of the attribute.
#' (numeric)
featureIdx = function(feature = "character"){
idx <- match(feature, self$imputationParameter[["features"]])
if(is.na(idx)){
rString <- sprintf("\nWarning in pgu.imputation: feature %s is not known\n",
feature)
cat(rString)
}#if
return(idx)
}, #function
#' @description
#' Selects features cotaining missing values from a dataset.
#' @param data_df
#' Dataframe to be analyzed.
#' (tibble::tibble)
#' @return
#' The filtered data frame.
#' (tibble::tibble)
filterFeatures = function(data_df = "tbl_df"){
data_df %>%
dplyr::select(private$.imputationParameter[["features"]]) %>%
return()
}, #function
#######################
# missings statistics #
#######################
#' @description
#' Calculates the number of values of a vector.
#' @param value
#' A vector comprising numeric data.
#' (numeric)
#' @return
#' The lenght of the vector.
#' (numeric)
gatherMeasurements = function(value = "numeric"){
return(length(value))
}, #function
#' @description
#' Calculates the number of missing values of a vector.
#' @param value
#' A vector comprising numeric data.
#' (numeric)
#' @return
#' The number of missing in the vector.
#' (numeric)
gatherMissings = function(value = "numeric"){
y <- sum(is.na(value))
return(y)
}, #function
#' @description
#' Calculates the number of existing values of a vector.
#' @param value
#' A vector comprising numeric data.
#' (numeric)
#' @return
#' The number of existing values in the vector.
#' (numeric)
gatherExistings = function(value = "numeric"){
y <- sum(!is.na(value))
return(y)
}, #function
#' @description
#' Calculates the fraction of missing values of a vector.
#' @param value
#' A vector comprising numeric data.
#' (numeric)
#' @return
#' The fraction of missing values in the vector.
#' (numeric)
gatherFractionOfMissings = function(value = "numeric"){
y <- 100.0*sum(is.na(value))/length(value)
return(y)
}, #function
#' @description
#' Gathers statistical information about missing values
#' that are provided by the classes public `gather` functions.
#' The information is stored within the classes instance variable `imputationParameter`
#' @param data_df
#' The data frame to be analyzed.
#' (tibble::tibble)
gatherImputationStatistics = function(data_df = "tbl_df"){
filteredData <- data_df %>%
self$filterFeatures()
private$.imputationParameter["measurements"] <- filteredData %>%
apply(MARGIN=2, FUN=self$gatherMeasurements)
private$.imputationParameter["existings"] <- filteredData %>%
apply(MARGIN=2, FUN=self$gatherExistings)
private$.imputationParameter["missings"] <- filteredData %>%
apply(MARGIN=2, FUN=self$gatherMissings)
private$.imputationParameter["fractionOfMissings"] <- filteredData %>%
apply(MARGIN=2, FUN=self$gatherFractionOfMissings)
}, #function
#' @description
#' Gathers statistical information about missing values
#' in one hot format.
#' The result is stored in the instance variable one_hot_df.
#' @param data_df
#' The data frame to be analyzed.
#' (tibble::tibble)
one_hot = function(data_df = "tbl_df"){
if(!tibble::is_tibble(data_df)){
print("Warning: data_df needs to by of type tibble.")
private$.one_hot_df <- tibble::tibble()
}
private$.one_hot_df <- data_df %>%
dplyr::select_if(is.numeric) %>%
dplyr::transmute_all(list(miss = ~ as.integer(is.na(.))))
}, #function
###########################
# detect imputation sites #
###########################
#' @description
#' Detects missing values within the data frame and
#' writes the to the instance variable `imputationsites`.
#' @param data_df
#' The data frame to be analyzed.
#' (tibble::tibble)
detectImputationSites = function(data_df = "tbl_df"){
private$.imputationSites <- data_df %>%
self$filterFeatures() %>%
is.na() %>%
which(arr.ind=TRUE) %>%
tibble::as_tibble() %>%
dplyr::mutate(features = self$imputationParameter[["features"]][col])
}, #function
##########
# output #
##########
#' @description
#' Numeric representation of the distribution of missing values within the data frame.
#' @param data_df
#' The data frame to be analyzed.
#' (tibble::tibble)
#' @return
#' A data frame
#' (tibble::tibble)
imputationSiteDistribution = function(data_df = "tbl_df"){
d <- data_df %>%
self$filterFeatures() %>%
as.data.frame() %>%
mice::md.pattern(plot=FALSE)
# colnames(d)[-1] <- "Sites"
colnames(d)[length(colnames(d))] <- "Sites"
rownames(d)[length(rownames(d))] <- "Sum"
return(d)
}, #function
#' #' @description
#' #' Merges the numeric attributes of the pguIMP data with its metadata.
#' #' @param data_df
#' #' The data frame to be analyzed.
#' #' (tibble::tibble)
#' #' @param metadata_df
#' #' The corresponding metadata.
#' #' (tibble::tibble)
#' #' @return
#' #' A data frame
#' #' (tibble::tibble)
#' mergeImputationSiteData = function(data_df = "tbl_df", metadata_df = "tbl_df"){
#' dfMerge <- data_df
#' if(nrow(data_df) == nrow(metadata_df)){
#' dfMerge <- dplyr::bind_cols(metadata_df, data_df)
#' }#if
#' dfMerge %>%
#' dplyr::filter_all(dplyr::any_vars(is.na(.))) %>%
#' return()
#' }, #function
##################
# plot functions #
##################
#' @description
#' Displays the distribution of missing values in form of a heatmap.
#' @return
#' A heatmap plot.
#' (ggplot2::ggplot)
imputationSiteHeatMap = function(){
p <- plot(self$amv,
col=c('navyblue','red'),
numbers=TRUE,
sortVars=TRUE,
labels=self$imputationParameter[["features"]],
cex.axis=.7,
gap=3,
main = "Missings histogram",
ylab=c("fraction","fraction")) %>%
capture.output(file = tempfile())
return(p)
} #function
)#public
)#class
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.