R/summary_function.R

Defines functions na_drop na_check data_summary

Documented in data_summary na_check na_drop

#' @title Summary
#' @description Brief overview of a data frame: its number of rows and the
#'     names of its columns.
#' @param df (`list`)\cr Data frame containing the performance measure.
#' @return (`list`)\cr A list with the elements `Rows` (number of rows) and
#'     `Columns` (column names).
#' @export
data_summary <- function(df) {
  list(
    Rows = nrow(df),
    Columns = colnames(df)
  )
}


#' @title NA Check
#' @description
#'     Check if the measure column is complete.
#'     For every value of `check_var` present in the data frame (problem sets
#'     by default, or algorithms), this function reports the number of NAs in
#'     the measure column, the number of observations and the resulting NA
#'     ratio. If there are any NAs the user can decide to drop all
#'     observations for that specific value, since the data frame needs to be
#'     complete for testing.
#'     Rows whose `check_var` entry is itself NA cannot be attributed to a
#'     group and are not counted.
#' @param df (`list`)\cr Data frame containing the performance measure.
#' @param measure (`character`)\cr Name of the 'measure' column. If not
#'     defined, the first 'measure' column in the data frame is used.
#' @param check_var (`character`)\cr Column in data frame used to check for NAs.
#'     Either "problem" (default) or "algorithm".
#' @return (`data.frame` | `character`)\cr If `df` contains no NA at all, the
#'     string "data complete". Otherwise a data frame with one row per value of
#'     `check_var` (row names are the values, in order of first appearance) and
#'     the columns `na_number`, `observations` and `na_ratio`.
#' @export
na_check <- function(df, measure = NULL, check_var = NULL) {
  if (is.null(measure)) {
    measure <- get_measure_columns(df)[1]
  }
  if (is.null(check_var)) {
    check_var <- "problem"
  }
  if (!any(is.na(df))) {
    return("data complete")
  }
  # `[[` returns NULL for an unknown column, so fail loudly instead of
  # silently producing an empty result.
  unknown <- setdiff(c(measure, check_var), names(df))
  if (length(unknown) > 0) {
    stop(
      "column(s) not found in `df`: ", paste(unknown, collapse = ", "),
      call. = FALSE
    )
  }
  # `[[` always extracts a plain vector, whereas `df[, col]` keeps a
  # one-column table for tibble-like inputs.
  labels <- as.character(df[[check_var]])
  # Keep first-appearance order. NA labels are excluded because they can be
  # used neither as row names nor in `==` comparisons.
  groups <- unique(labels[!is.na(labels)])
  grouping <- factor(labels, levels = groups)
  # table() on a factor reports every level (zero counts included) and
  # ignores NA entries, so both vectors line up with `groups`.
  na_number <- as.vector(table(grouping[is.na(df[[measure]])]))
  observations <- as.vector(table(grouping))
  data.frame(
    na_number = na_number,
    observations = observations,
    na_ratio = na_number / observations,
    row.names = groups
  )
}


#' @title Drop NAs by groups
#' @description
#'     Remove every group of rows in which the measure column contains at
#'     least one NA. Groups are defined by the values of `check_var`.
#' @param df (`list`)\cr Data frame containing the performance measure.
#' @param measure (`character`)\cr Name of the 'measure' column. If not
#'     defined, the first 'measure' column in the data frame is used.
#' @param check_var (`character`)\cr Column in data frame used to check for NAs.
#'     Either "problem" (default) or "algorithm".
#' @return (`list`)\cr The rows of `df` whose `check_var` value never occurs
#'     together with an NA in the measure column.
#' @export
na_drop <- function(df, check_var = NULL, measure = NULL) {
  if (is.null(measure)) {
    measure <- get_measure_columns(df)[1]
  }
  if (is.null(check_var)) {
    check_var <- "problem"
  }
  group_of_row <- df[, check_var]
  # Group values that appear on at least one row with a missing measure.
  incomplete_groups <- group_of_row[is.na(df[, measure])]
  keep_row <- !(group_of_row %in% incomplete_groups)
  df[keep_row, ]
}
RebeccaGroh/seqbtests documentation built on Nov. 17, 2021, 8:50 a.m.