R/inspect.R

#' Check for common errors in multi-environment trial data
#' @description
#' `r badge('stable')`
#'
#' `inspect()` scans a `data.frame` object for errors that may affect the use
#' of functions in `metan`. By default, all variables are checked regarding
#' the class (numeric or factor), missing values, and presence of possible
#' outliers. The function will return a warning if the data looks like
#' unbalanced, has missing values or possible outliers.
#'
#' @param .data The data to be analyzed
#' @param ... The variables in `.data` to check. If no variable is
#'   informed, all the variables in `.data` are used.
#' @param plot Create a plot to show the check? Defaults to `FALSE`.
#' @param threshold Maximum number of levels allowed in a character / factor
#'   column to produce a plot. Defaults to 15.
#' @param verbose Logical argument. If `TRUE` (default) then the results
#'   for checks are shown in the console.
#'
#' @return A tibble with the following variables:
#' * **Variable** The name of variable
#' * **Class** The class of the variable
#' * **Missing** Contains missing values?
#' * **Levels** The number of levels of a factor variable
#' * **Valid_n** Number of valid n (omit NAs)
#' * **Outlier** Contains possible outliers?
#' @md
#' @importFrom GGally wrap
#' @export
#' @author Tiago Olivoto \email{tiagoolivoto@@gmail.com}
#'
#' @examples
#' \donttest{
#' library(metan)
#' inspect(data_ge)
#'
#' # Create a toy example with messy data
#' df <- data_ge2[-c(2, 30, 45, 134), c(1:5)] %>% as.data.frame()
#' df[c(1, 20, 50), 5] <- NA
#' df[40, 4] <- "2..814"
#'
#' inspect(df)
#' }
inspect <- function (.data,
                     ...,
                     plot = FALSE,
                     threshold = 15,
                     verbose = TRUE) {
  if(!missing(...)){
    .data <- select(.data, ...)
  } else{
    .data <- .data
  }
  df <-
    data.frame(
      Class = sapply(.data, class),
      Missing= sapply(.data, function(x){ifelse(any(is.na(x)), "Yes", "No")}),
      Levels = sapply(.data, function(x){ifelse(!is.numeric(x), nlevels(x), "-")}),
      Valid_n = sapply(.data, function(x){length(which(!is.na(x)))}),
      Min = sapply(.data, function(x){ifelse(is.numeric(x), round(min(x, na.rm = TRUE),2), NA)}),
      Median = sapply(.data, function(x){ifelse(is.numeric(x), round(median(x, na.rm = TRUE),2), NA)}),
      Max = sapply(.data, function(x){ifelse(is.numeric(x), round(max(x, na.rm = TRUE),2), NA)}),
      Outlier = sapply(.data, function(x){ifelse(is.numeric(x), find_outliers(x, verbose = F), NA)}),
      Text = sapply(.data, function(x){ifelse(!is.numeric(x) & !is.factor(x), find_text_in_num(x), NA)})
    ) %>%
    rownames_to_column("Variable") %>%
    as_tibble()
  lvls <- as.numeric(as.character(df[which(df[4] != "-"),][4]$Levels))
  esp_nrows <- prod(lvls[lvls!=0])
  if(verbose == TRUE){
    print(df)
    nfactors <- sum(lapply(.data, is.factor) == TRUE)
    if(esp_nrows != nrow(.data)){
      warning("Considering the levels of factors, .data should have ",
              esp_nrows, " rows, but it has ", nrow(.data),
              ". Use 'as_factor()' for coercing a variable to a factor.", call. = F)
    }
    if(any(sapply(.data, grepl, pattern = ":"))){
      warning("Using ':' in labels can result an error in some functions. Use '_' instead.", call. = FALSE)
    }
    if (nfactors < 3){
      warning("Expected three or more factor variables. The data has only ", nfactors, ".", call. = F)
    }
    if(any(df$Missing == "Yes")){
      warning("Missing values in variable(s) ",
              paste(df$Variable[c(which(df$Missing == "Yes"))], collapse = ", "), ".", call. = F)
    }
    if(any(!is.na(df$Text))){
      warning("Possible text fragments in variable(s) ",
              paste(df$Variable[c(which(!is.na(df$Text)))], collapse = ", "), ".", call. = F)
    }
    if(any(df$Outlier[!is.na(df$Outlier)] != 0)){
      warning("Possible outliers in variable(s) ",
              paste(df$Variable[c(which(df$Outlier != 0))], collapse = ", "),
              ". Use 'find_outliers()' for more details.", call. = F)
    }
    if(has_zero(.data)){
      warning("Zero values observed in variable(s) ",
              paste(names(select_cols_zero(.data, verbose = FALSE)),
                    collapse = ", "), ".", call. = FALSE)
    }
    if(nfactors >= 3 && esp_nrows == nrow(.data) && all(df$Missing == "No") && all(df$Outlier[!is.na(df$Outlier)] == 0) == TRUE && !has_zero(.data)){
      message("No issues detected while inspecting data.")
    }
  }
  if(plot == TRUE){
    for (col in names(.data)) {
      data_col <- .data[[col]]
      if (!is.numeric(data_col)) {
        level_length <- length(levels(data_col))
        if (level_length > threshold) {
          stop(
            "Column '", col, "' has more levels (", level_length, ")",
            " than the threshold (", threshold, ") allowed.\n",
            "Please remove the column or increase the 'threshold' argument. Increasing the threshold may produce long processing times",
            call. = FALSE)
        }
      }
    }
    my_smooth <- function(data, mapping, method = "lm", ...){
      ggplot(data = data, mapping = mapping) +
        geom_point(alpha = 0.65) +
        geom_smooth(method=method,
                    se = FALSE,
                    size = 0.5,
                    color = "red")
    }
    ggpair <-
      .data %>%
      ggpairs(lower = NULL,
              cardinality_threshold = threshold,
              diag = list(continuous = wrap("densityDiag",
                                            size = 0.2),
                          discrete = wrap("barDiag",
                                          color = "black",
                                          size = 0.2)),
              upper = list(continuous = my_smooth,
                           discrete = wrap("facetbar",
                                           color = "black",
                                           size = 0.2),
                           combo = wrap("box_no_facet",
                                        outlier.color = "red",
                                        outlier.alpha = 0.7,
                                        outlier.size = 0.8,
                                        size = 0.2,
                                        color = "black")))+
      theme(panel.spacing = unit(0.05, "cm"),
            panel.grid = element_blank(),
            axis.text.x = element_blank(),
            axis.ticks.x = element_blank(),
            axis.text.y = element_text(color = "black"))

    suppressMessages(suppressWarnings(print(ggpair, progress = FALSE)))
  }
  invisible(df)
}
TiagoOlivoto/metan documentation built on April 28, 2024, 10:47 a.m.