R/utils.R

Defines functions remove_outliers_REI add_pareto_emoa_column reshape_data replace_values checkAssumptionsForAnova debug_contr_error rFromNPAV rFromWilcoxAdjusted rFromWilcox check_homogeneity_by_group check_normality_by_group normalize stat_sum_df n_fun pathPrep na.zero not_in not_empty

Documented in add_pareto_emoa_column checkAssumptionsForAnova check_homogeneity_by_group check_normality_by_group debug_contr_error na.zero n_fun normalize not_empty not_in pathPrep remove_outliers_REI replace_values reshape_data rFromNPAV rFromWilcox rFromWilcoxAdjusted stat_sum_df

#' Ensure input is not empty
#'
#' Stops execution if x is NULL, empty, or contains only NAs.
#'
#' @param x The object to check
#' @param msg The error message to display
#' @return Invisible TRUE if valid.
#' @export
not_empty <- function(x, msg = "Input must not be empty.") {
  if (is.null(x) || length(x) == 0) {
    stop(msg, call. = FALSE)
  }

  if (is.atomic(x) && all(is.na(x))) {
    stop(msg, call. = FALSE)
  }

  invisible(TRUE)
}

#' Negate `%in%` membership
#'
#' @param x Vector of values to test.
#' @param y Vector of values to match against.
#' @return Logical vector indicating non-membership.
#' @export
not_in <- function(x, y) !(x %in% y)

#' @rdname not_in
#' @export
`%!in%` <- not_in


#' Replace NA values with zero
#'
#' @param x A vector.
#' @return A vector with NAs replaced by zeros.
#' @export
#' @examples
#' na.zero(c(NA, 1, NA, 2))
na.zero <- function(x) {
  x[is.na(x)] <- 0
  return(x)
}


#' Convert Windows paths to R-friendly format
#'
#' @param path Path to convert or the string "clipboard" to read from the clipboard.
#' @param read_fn Optional custom function to read from the clipboard.
#' @param write_fn Optional custom function to write to the clipboard.
#' @return A normalized path string.
#' @export
pathPrep <- function(path = "clipboard", read_fn = NULL, write_fn = NULL) {
  get_clip_reader <- function() {
    if (!is.null(read_fn)) {
      return(read_fn)
    }
    if (requireNamespace("clipr", quietly = TRUE) && clipr::clipr_available()) {
      return(clipr::read_clip)
    }
    if (exists("readClipboard", mode = "function")) {
      return(get("readClipboard", mode = "function"))
    }
    stop("Clipboard is not available. Provide a custom `read_fn` or a direct path.")
  }

  get_clip_writer <- function() {
    if (!is.null(write_fn)) {
      return(write_fn)
    }
    if (requireNamespace("clipr", quietly = TRUE) && clipr::clipr_available()) {
      return(clipr::write_clip)
    }
    if (exists("writeClipboard", mode = "function")) {
      return(get("writeClipboard", mode = "function"))
    }
    return(function(...) invisible(NULL))
  }

  writer <- get_clip_writer()

  y <- if (identical(path, "clipboard")) {
    reader <- get_clip_reader()
    reader()
  } else {
    path
  }

  x <- chartr("\\", "/", y)
  writer(x)
  return(x)
}

#' Build a median/size label for plot annotations
#'
#' @param x A numeric vector.
#' @return A data frame with the median and label.
#' @export
n_fun <- function(x) {
  return(data.frame(y = median(x), label = paste0("n = ", length(x))))
}


#' Generating the sum and adding a crossbar.
#'
#' @param fun function
#' @param geom geom to be shown
#' @param ... Additional arguments passed to stat_summary
#'
#' @return A \code{ggplot2} layer that can be added to a ggplot object.
#' @export
#'
#' @examples \donttest{
#'   # Simple summary function: use the mean as y, ymin, and ymax
#'   mean_fun <- function(x) {
#'     m <- mean(x, na.rm = TRUE)
#'     data.frame(y = m, ymin = m, ymax = m)
#'   }
#'
#'   ggplot2::ggplot(mtcars, ggplot2::aes(x = factor(cyl), y = mpg)) +
#'     stat_sum_df(mean_fun)
#' }
stat_sum_df <- function(fun, geom = "crossbar", ...) {
  ggplot2::stat_summary(fun.data = fun, colour = "red", geom = geom, width = 0.2, ...)
}

#' This function normalizes the values in a vector to the range \[new_min, new_max\]
#' based on their original range \[old_min, old_max\].
#'
#' @param x_vector A numeric vector that you want to normalize.
#' @param old_min The minimum value in the original scale of the data.
#' @param old_max The maximum value in the original scale of the data.
#' @param new_min The minimum value in the new scale to which you want to normalize the data.
#' @param new_max The maximum value in the new scale to which you want to normalize the data.
#' @return A numeric vector with the normalized values.
#' @export
#' @examples
#' normalize(c(1, 2, 3, 4, 5), 1, 5, 0, 1)
normalize <- function(x_vector, old_min, old_max, new_min, new_max) {
  return(new_min + ((x_vector - old_min) / (old_max - old_min)) * (new_max - new_min))
}


#' Check normality for groups
#'
#' @param data the data frame
#' @param x the x column
#' @param y the y column
#'
#' @return TRUE if all groups are normal, FALSE otherwise. For groups with
#'   n > 5000, Shapiro-Wilk is skipped and the function returns FALSE with a warning.
#' @export
check_normality_by_group <- function(data, x, y) {
  # Input validation
  if (missing(data) || missing(x) || missing(y)) stop("Missing arguments")

  # Ensure numeric
  if (!is.numeric(data[[y]])) {
    val <- as.numeric(data[[y]])
    if (all(is.na(val))) {
      return(FALSE)
    } # Non-numeric data
    data[[y]] <- val
  }

  group_sizes <- data |>
    dplyr::group_by(!!dplyr::sym(x)) |>
    dplyr::summarise(n = dplyr::n(), .groups = "drop")

  results <- data |>
    dplyr::group_by(!!dplyr::sym(x)) |>
    dplyr::summarise(
      p_value = {
        values <- stats::na.omit(!!dplyr::sym(y))
        if (length(values) >= 3 && stats::var(values, na.rm = TRUE) > 0) {
          if (length(values) > 5000) {
            sampled <- sample(values, size = 5000)
            stats::shapiro.test(sampled)$p.value
          } else {
            stats::shapiro.test(values)$p.value
          }
        } else {
          NA_real_ # Cannot test
        }
      },
      .groups = "drop"
    )

  if (any(group_sizes$n > 5000)) {
    warning("Groups with n > 5000 were tested using a random sample of 5000 observations.", call. = FALSE)
  }

  # If any group is significant (p < 0.05), data is NOT normal
  all_normal <- !any(results$p_value < 0.05, na.rm = TRUE)

  return(all_normal)
}


#' Check homogeneity of variances across groups
#'
#' @param data the data frame
#' @param x the grouping variable (column name as string)
#' @param y the dependent variable (column name as string)
#'
#' @return TRUE if Levene's test is non-significant (p >= .05), FALSE otherwise
#' @export
check_homogeneity_by_group <- function(data, x, y) {
  not_empty(data)
  not_empty(x)
  not_empty(y)

  if (!requireNamespace("rstatix", quietly = TRUE)) {
    warning("Package 'rstatix' not installed. Assuming unequal variances (var.equal = FALSE).")
    return(FALSE)
  }

  formula_string <- paste(y, "~", x)
  levene_res <- rstatix::levene_test(
    data    = data,
    formula = stats::as.formula(formula_string)
  )

  # rstatix::levene_test returns a tibble with column 'p'
  p_val <- levene_res$p[1L]

  if (is.na(p_val)) {
    return(FALSE)
  }

  return(p_val >= 0.05)
}


#' Calculation based on Rosenthal's formula (1994). N stands for the *number of measurements*.
#'
#' @param wilcoxModel the Wilcox model
#' @param N number of measurements in the experiment
#'
#' @return Invisibly returns a list with components:
#'   \itemize{
#'     \item \code{r}: effect size as a numeric scalar.
#'     \item \code{z}: corresponding z-statistic.
#'     \item \code{text}: character string that is also sent to the console.
#'   }
#' @export
#'
#' @examples
#' set.seed(1)
#' d <- data.frame(
#'   group = rep(c("A", "B"), each = 10),
#'   value = rnorm(20)
#' )
#' w <- stats::wilcox.test(value ~ group, data = d, exact = FALSE)
#' rFromWilcox(w, N = nrow(d))
rFromWilcox <- function(wilcoxModel, N) {
  not_empty(wilcoxModel)
  not_empty(N)

  z <- stats::qnorm(wilcoxModel$p.value / 2)
  r <- z / sqrt(N)

  msg <- sprintf(
    "%s Effect Size, r = %.3f, z = %.3f",
    wilcoxModel$data.name, r, z
  )
  message(msg)

  invisible(list(r = r, z = z, text = msg))
}

#' rFromWilcoxAdjusted
#'
#' @param wilcoxModel the Wilcox model
#' @param N number of measurements in the experiment
#' @param adjustFactor ad adjustment factor
#'
#' @return Invisibly returns a list with components:
#'   \itemize{
#'     \item \code{r}: adjusted effect size as a numeric scalar.
#'     \item \code{z}: adjusted z-statistic.
#'     \item \code{text}: character string that is also sent to the console.
#'   }
#' @export
#'
#' @examples \donttest{
#' set.seed(1)
#' d <- data.frame(
#'   group = rep(c("A", "B"), each = 10),
#'   value = rnorm(20)
#' )
#' w <- stats::wilcox.test(value ~ group, data = d, exact = FALSE)
#' rFromWilcoxAdjusted(w, N = nrow(d), adjustFactor = 2)
#' }
rFromWilcoxAdjusted <- function(wilcoxModel, N, adjustFactor) {
  not_empty(wilcoxModel)
  not_empty(N)
  not_empty(adjustFactor)

  z <- stats::qnorm(wilcoxModel$p.value * adjustFactor / 2)
  r <- z / sqrt(N)

  msg <- sprintf(
    "%s Effect Size, r = %.3f, z = %.3f",
    wilcoxModel$data.name, r, z
  )
  message(msg)
  invisible(list(r = r, z = z, text = msg))
}


#' Calculation based on Rosenthal's formula (1994). N stands for the *number of measurements*.
#' Necessary command:
# \newcommand{\effectsize}{\textit{r=}}
#' @param pvalue p value
#' @param N number of measurements in the experiment
#'
#' @return Invisibly returns a list with components:
#'   \itemize{
#'     \item \code{r}: effect size as a numeric scalar.
#'     \item \code{z}: corresponding z-statistic.
#'     \item \code{text}: LaTeX-formatted character string that is also sent
#'       to the console.
#'   }
#' @export
#'
#' @examples rFromNPAV(0.02, N = 180)
rFromNPAV <- function(pvalue, N) {
  not_empty(pvalue)
  not_empty(N)

  z <- qnorm(pvalue / 2)
  r <- z / sqrt(N)

  stringtowrite <- sprintf(
    "\\effectsize{%s}, Z=%s",
    format(round(r, 3), trim = TRUE, nsmall = 3),
    format(round(z, 2), trim = TRUE, nsmall = 2)
  )
  message(stringtowrite)
  invisible(list(r = r, z = z, text = stringtowrite))
}


#' Debug contrast errors in ANOVA-like models
#'
#' @param dat A data frame of predictors.
#' @param subset_vec Optional logical or numeric index vector used to subset rows before checks.
#'
#' @return A list with two elements:
#' \describe{
#'   \item{nlevels}{Integer vector giving the number of levels for each factor
#'   variable in \code{dat}.}
#'   \item{levels}{List of factor level labels for each factor variable in
#'   \code{dat}.}
#' }

#' @export
#'
#' @examples
#' \donttest{
#' dat <- data.frame(
#'   group = factor(rep(letters[1:3], each = 3)),
#'   score = rnorm(9)
#' )
#'
#' debug_contr_error(dat = dat)
#' }
debug_contr_error <- function(dat, subset_vec = NULL) {
  if (!is.null(subset_vec)) {
    ## step 0
    if (mode(subset_vec) == "logical") {
      if (length(subset_vec) != nrow(dat)) {
        stop("'logical' `subset_vec` provided but length does not match `nrow(dat)`")
      }
      subset_log_vec <- subset_vec
    } else if (mode(subset_vec) == "numeric") {
      ## check range
      ran <- range(subset_vec)
      if (ran[1] < 1 || ran[2] > nrow(dat)) {
        stop("'numeric' `subset_vec` provided but values are out of bound")
      } else {
        subset_log_vec <- logical(nrow(dat))
        subset_log_vec[as.integer(subset_vec)] <- TRUE
      }
    } else {
      stop("`subset_vec` must be either 'logical' or 'numeric'")
    }
    dat <- base::subset(dat, subset = subset_log_vec)
  } else {
    ## step 1
    dat <- stats::na.omit(dat)
  }
  if (nrow(dat) == 0L) warning("no complete cases")
  ## step 2
  var_mode <- sapply(dat, mode)
  if (any(var_mode %in% c("complex", "raw"))) stop("complex or raw not allowed!")
  var_class <- sapply(dat, class)
  if (any(var_mode[var_class == "AsIs"] %in% c("logical", "character"))) {
    stop("matrix variables with 'AsIs' class must be 'numeric'")
  }
  ind1 <- which(var_mode %in% c("logical", "character"))
  dat[ind1] <- lapply(dat[ind1], as.factor)
  ## step 3
  fctr <- which(sapply(dat, is.factor))
  if (length(fctr) == 0L) warning("no factor variables to summary")
  ind2 <- if (length(ind1) > 0L) fctr[-ind1] else fctr
  dat[ind2] <- lapply(dat[ind2], base::droplevels.factor)
  ## step 4
  lev <- lapply(dat[fctr], base::levels.default)
  nl <- lengths(lev)
  ## return
  list(nlevels = nl, levels = lev)
}


#' Check the assumptions for an ANOVA with a variable number of factors: Normality and Homogeneity of variance assumption.
#'
#' @param data the data frame
#' @param y The dependent variable for which assumptions should be checked
#' @param factors A character vector of factor names
#'
#' @return A message indicating whether to use parametric or non-parametric ANOVA
#' @export
#'
#' @examples
#' \donttest{
#' set.seed(123)
#'
#' main_df <- data.frame(
#'   tlx_mental      = rnorm(40),
#'   Video           = factor(rep(c("A", "B"), each = 20)),
#'   DriverPosition  = factor(rep(c("Left", "Right"), times = 20))
#' )
#'
#' checkAssumptionsForAnova(
#'   data    = main_df,
#'   y       = "tlx_mental",
#'   factors = c("Video", "DriverPosition")
#' )
#' }
checkAssumptionsForAnova <- function(data, y, factors) {
  # Ensure data and variables are not empty
  not_empty(data)
  not_empty(y)
  not_empty(factors)

  if (!requireNamespace("rstatix", quietly = TRUE)) {
    stop("Package 'rstatix' is required for checkAssumptionsForAnova(). Please install it.")
  }

  emit_guidance <- function(text) {
    message(text)
    invisible(text)
  }

  extract_p_value <- function(test_result) {
    if ("p" %in% names(test_result)) {
      return(test_result$p)
    }
    if ("p.value" %in% names(test_result)) {
      return(test_result$p.value)
    }
    NA_real_
  }

  # Dynamically construct the formula based on the number of factors
  formula_string <- paste(y, "~", paste(factors, collapse = " * "))
  model <- lm(as.formula(formula_string), data = data)

  # Shapiro-Wilk test of normality on model residuals
  model_results <- rstatix::shapiro_test(stats::residuals(model))
  model_p <- extract_p_value(model_results)
  if (!is.na(model_p) && model_p < 0.05) {
    return(emit_guidance("You must take the non-parametric ANOVA as model is non-normal."))
  }

  # Check normality for each group
  test <- data |>
    dplyr::group_by(dplyr::across(dplyr::all_of(factors))) |>
    rstatix::shapiro_test(!!rlang::sym(y))

  # Check if the normality assumption holds (p > 0.05 for all groups)
  test_p <- extract_p_value(test)
  if (all(is.na(test_p)) || min(test_p, na.rm = TRUE) <= 0.05) {
    return(emit_guidance("You must take the non-parametric ANOVA as normality assumption by groups is violated (one or more p < 0.05)."))
  }

  # Homogeneity of variance assumption using Levene's Test
  levene_formula <- as.formula(paste(y, "~", paste(factors, collapse = " * ")))
  levene_test_result <- rstatix::levene_test(data, levene_formula)
  levene_p <- extract_p_value(levene_test_result)

  if (!is.na(levene_p) && levene_p < 0.05) {
    return(emit_guidance("You must take the non-parametric ANOVA as Levene's test is significant (p < 0.05)."))
  }

  emit_guidance("You may take parametric ANOVA (function anova_test). See https://www.datanovia.com/en/lessons/anova-in-r/#check-assumptions-1 for more information.")
}


#' Replace values across a data frame
#'
#' @description
#' Replace all occurrences of given values in all columns of a data frame.
#'
#' @name data the data frame
#' @description The `data` data frame contains a collection of records, with attributes organized in columns. It may include various types of values, such as numerical, categorical, or textual data.
#' @param data The input data frame to be modified.
#' @param to_replace A vector of values to be replaced within the data frame. This must be the same length as `replace_with`.
#' @param replace_with A vector of corresponding replacement values. This must be the same length as `to_replace`.
#'
#' @return Modified data frame with specified values replaced.
#' @export
#'
#' @examples
#' \donttest{
#' data <- data.frame(
#'   q1 = c("neg2", "neg1", "0"),
#'   q2 = c("1", "neg2", "neg1")
#' )
#'
#' replace_values(
#'   data,
#'   to_replace = c("neg2", "neg1"),
#'   replace_with = c("-2", "-1")
#' )
#' }
replace_values <- function(data, to_replace, replace_with) {
  if (length(to_replace) != length(replace_with)) {
    stop("Length of 'to_replace' and 'replace_with' must be the same.")
  }

  # Create a named vector for replacements
  replace_map <- setNames(replace_with, to_replace)

  # Apply replacements column-wise
  data[] <- lapply(data, function(column) {
    # Convert factors to characters and restore factor levels after replacement
    if (is.factor(column)) {
      column_chr <- as.character(column)
      replaced <- ifelse(!is.na(column_chr) & column_chr %in% names(replace_map),
        replace_map[column_chr],
        column_chr
      )
      new_levels <- unique(c(levels(column), replace_with))
      return(factor(replaced, levels = new_levels))
    }

    # Replace values for character columns
    if (is.character(column)) {
      return(ifelse(!is.na(column) & column %in% names(replace_map),
        replace_map[column],
        column
      ))
    }

    # Replace values for logical/numeric columns only if replacements are compatible
    column_chr <- as.character(column)
    replaced_chr <- ifelse(!is.na(column_chr) & column_chr %in% names(replace_map),
      replace_map[column_chr],
      column_chr
    )

    if (is.logical(column)) {
      coerced <- as.logical(replaced_chr)
      if (any(is.na(coerced) & !is.na(replaced_chr))) {
        stop("Replacement values are incompatible with logical columns.")
      }
      return(coerced)
    }

    if (is.numeric(column) || is.integer(column)) {
      coerced <- suppressWarnings(as.numeric(replaced_chr))
      if (any(is.na(coerced) & !is.na(replaced_chr))) {
        stop("Replacement values are incompatible with numeric columns.")
      }
      if (is.integer(column)) {
        return(as.integer(coerced))
      }
      return(coerced)
    }

    column
  })

  return(data)
}


#' Reshape Excel Data Based on Custom Markers and Include Custom ID Column
#'
#' This function takes an Excel file with data in a wide format and transforms it to a long format.
#' It includes a customizable "ID" column in the first position and repeats it for each slice.
#' The function identifies sections of columns between markers that start with a user-defined string (default is "videoinfo")
#' and appends those sections under the first section, aligning by column index.
#'
#' Relevant if you receive data in wide-format but cannot use built-in functionality due to naming (e.g., in LimeSurvey)
#'
#' @param input_filepath String, the file path of the input Excel file.
#' @param sheetName String, the name of the sheet to read from the Excel file. Default is "Results".
#' @param marker String, the string that identifies the start of a new section of columns. Default is "videoinfo".
#' @param id_col String, the name of the column to use as the ID column. Default is "ID".
#' @param output_filepath String, the file path for the output Excel file.
#'
#' @return None, writes the reshaped data to an Excel file specified by output_filepath.
#' @export
#'
#' @examples
#' \donttest{
#' if (requireNamespace(c("write_xlsx", "readxl"), quietly = TRUE)) {
#'   tmp_in  <- tempfile(fileext = ".xlsx")
#'   tmp_out <- tempfile(fileext = ".xlsx")
#'
#'   # Minimal toy input that includes your required pieces:
#'   # an ID column and something that contains the marker value.
#'   toy <- data.frame(
#'     ID = c(1, 1, 2, 2),
#'     section = c("videoinfo", "videoinfo", "videoinfo", "videoinfo"),
#'     key = c("fps", "duration_s", "fps", "duration_s"),
#'     value = c(30, 12.3, 25, 9.8),
#'     stringsAsFactors = FALSE
#'   )
#'
#'   writexl::write_xlsx(toy, tmp_in)
#'
#'   reshape_data(
#'     input_filepath = tmp_in,
#'     marker = "videoinfo",
#'     id_col = "ID",
#'     output_filepath = tmp_out
#'   )
#'
#'   out <- readxl::read_excel(tmp_out)
#'   print(out)
#' }
#' }
#'
#' @importFrom dplyr select bind_rows bind_cols
#' @importFrom readxl read_excel
#' @importFrom writexl write_xlsx
reshape_data <- function(input_filepath, sheetName = "Results", marker = "videoinfo", id_col = "ID", output_filepath) {
  # Read the Excel file into a data frame. If the requested sheet is missing,
  # fall back to the first available sheet to keep the helper robust for
  # single-sheet workbooks created on the fly (e.g., in tests).
  available_sheets <- readxl::excel_sheets(input_filepath)
  sheet_to_read <- if (sheetName %in% available_sheets) sheetName else available_sheets[[1]]
  df <- readxl::read_excel(input_filepath, sheet = sheet_to_read)

  # Initialize an empty vector to store the current columns for each marker section
  current_columns <- c()

  # Extract the custom "ID" column
  id_column <- df |> dplyr::select(dplyr::all_of(id_col))

  # Loop through each column (excluding ID) to identify given markers and reshape data accordingly
  slices <- list()
  data_columns <- setdiff(names(df), id_col)
  for (col in data_columns) {
    if (startsWith(col, marker)) {
      if (length(current_columns) > 0) {
        sliced_df <- df |> dplyr::select(dplyr::all_of(current_columns))
        slices[[length(slices) + 1]] <- dplyr::bind_cols(id_column, sliced_df)
      }
      current_columns <- c()
    } else {
      current_columns <- c(current_columns, col)
    }
  }

  if (length(current_columns) > 0) {
    sliced_df <- df |> dplyr::select(dplyr::all_of(current_columns))
    slices[[length(slices) + 1]] <- dplyr::bind_cols(id_column, sliced_df)
  }

  if (length(slices) == 0) {
    long_df <- dplyr::bind_cols(id_column, df |> dplyr::select(-dplyr::all_of(id_col)))
  } else {
    base_names <- names(slices[[1]])
    slices <- lapply(slices, function(slice) {
      names(slice) <- base_names
      slice
    })
    long_df <- dplyr::bind_rows(slices, .id = NULL)
  }

  # Check if file exists and modify output_filepath to avoid overwriting
  counter <- 1
  new_output_filepath <- output_filepath
  while (file.exists(new_output_filepath)) {
    new_output_filepath <- paste0(gsub("\\.xlsx$", "", output_filepath), "_", counter, ".xlsx")
    counter <- counter + 1
  }

  # Write the long-form data frame to a new Excel file
  writexl::write_xlsx(long_df, new_output_filepath)
}


#' Add Pareto EMOA Column to a Data Frame
#'
#' This function calculates the Pareto front for a given set of objectives in a data frame and adds a new column, `PARETO_EMOA`, which indicates whether each row in the data frame belongs to the Pareto front.
#'
#' @param data A data frame containing the data, including the objective columns.
#' @param objectives A character vector specifying the names of the objective columns in `data`. These columns should be numeric and will be used to calculate the Pareto front.
#'
#' @return A data frame with the same columns as `data`, along with an additional column, `PARETO_EMOA`, which is `TRUE` for rows that are on the Pareto front and `FALSE` otherwise.
#' @export
#'
#' @examples
#' # Define objective columns
#' objectives <- c("trust", "predictability", "perceivedSafety", "Comfort")
#'
#' # Example data frame
#' main_df <- data.frame(
#'   trust = runif(10),
#'   predictability = runif(10),
#'   perceivedSafety = runif(10),
#'   Comfort = runif(10)
#' )
#'
#' # Add the Pareto front column
#' main_df <- add_pareto_emoa_column(data = main_df, objectives)
#' head(main_df)
add_pareto_emoa_column <- function(data, objectives) {
  if (!requireNamespace("emoa", quietly = TRUE)) {
    stop("Package 'emoa' is required for add_pareto_emoa_column(). Please install it.")
  }

  # Input checks
  not_empty(data)
  not_empty(objectives)

  # Select only the objective columns
  objective_data <- data |> dplyr::select(dplyr::all_of(objectives))

  # If there's only one row, mark it as PARETO_EMOA directly
  if (nrow(objective_data) == 1) {
    data$PARETO_EMOA <- TRUE
    return(data)
  }

  # Transpose and convert to matrix as required by the nondominated_points function
  pareto_points <- emoa::nondominated_points(t(as.matrix(objective_data)))

  # Convert the Pareto points matrix back to a data frame for comparison
  pareto_df <- as.data.frame(t(pareto_points))

  # Initialize the PARETO_EMOA column as FALSE
  data$PARETO_EMOA <- FALSE

  # Mark TRUE for rows in the original data that match any row in the Pareto front
  for (i in 1:nrow(pareto_df)) {
    matching_row <- which(
      apply(objective_data, 1, function(x) all(x == pareto_df[i, ]))
    )
    if (length(matching_row) > 0) {
      data$PARETO_EMOA[matching_row] <- TRUE
    }
  }

  # Return the updated data frame
  return(data)
}


#' Remove outliers and calculate REI
#'
#' This function takes a data frame, optional header information, variables to consider,
#' and a range for a Likert scale. It then calculates the Response Entropy Index (REI)
#' and flags suspicious entries based on percentiles.
#'
#' For more information on the REI method, refer to:
#' [Response Entropy Index Method](https://ojs.ub.uni-konstanz.de/srm/article/view/7832)
#'
#' @param df Data frame containing the data.
#' @param header Logical indicating if the data frame has a header. Defaults to FALSE.
#' @param variables Character string specifying which variables to consider, separated by commas.
#' @param range Numeric vector specifying the range of the Likert scale. Defaults to c(1, 5).
#'
#' @return A data frame with calculated REI, percentile, and a 'Suspicious' flag.
#' @export
#'
#' @examples
#' \donttest{
#' df <- data.frame(var1 = c(1, 2, 3), var2 = c(2, 3, 4))
#' result <- remove_outliers_REI(df, TRUE, "var1,var2", c(1, 5))
#' }
remove_outliers_REI <- function(df, header = FALSE, variables = "", range = c(1, 5)) {
  # Validate and parse variables
  if (variables == "" && header == TRUE) {
    stop("Please input variables to consider!")
  }
  iniVariables <- stringr::str_split(variables, ",")
  variableNames <- unique(trimws(iniVariables[[1]]))

  # Initialize data frame for REI calculation
  testDF <- data.frame(REI = numeric(nrow(df)))

  # Extract specified columns
  if (header == FALSE) {
    testDF <- cbind(testDF, df)
  } else {
    for (i in variableNames) {
      columnMatches <- grep(paste("^", i, "$", sep = ""), colnames(df))
      if (length(columnMatches) > 0) {
        testDF <- cbind(testDF, df[, columnMatches])
      }
    }
  }

  # Check column count for validity
  if (NCOL(testDF) <= 2) {
    stop("Not enough columns found with the given phrase.")
  }

  # Calculate REI and related metrics
  numLevels <- range[2] - range[1] + 1
  numQuestions <- ncol(testDF) - 1
  getResponses <- function(df) {
    recordedResponses <- unique(as.vector(as.matrix(df)))
    tallies <- sapply(recordedResponses, function(x) rowSums(df == x))
    return(tallies)
  }

  tallies <- getResponses(testDF[, -1])
  proportions <- tallies / numQuestions
  logs <- proportions * log10(proportions)
  logs[is.na(logs)] <- 0
  testDF[, "REI"] <- rowSums(logs, na.rm = TRUE) * -1

  # Calculate percentile and flag suspicious entries
  testDF$Percentile <- round(stats::pnorm(testDF$REI, mean = mean(testDF$REI, na.rm = TRUE), sd = stats::sd(testDF$REI, na.rm = TRUE)), digits = 2) * 100
  testDF$Suspicious <- "No"
  testDF$Suspicious[testDF$Percentile <= 10 | testDF$Percentile >= 90] <- "Maybe"
  testDF$Suspicious[testDF$Percentile <= 5 | testDF$Percentile >= 95] <- "Yes"

  return(testDF)
}

Try the colleyRstats package in your browser

Any scripts or data that you put into this service are public.

colleyRstats documentation built on May 3, 2026, 5:07 p.m.