Nothing
#' Check for common errors in multi-environment trial data
#' @description
#' `r badge('stable')`
#'
#' `inspect()` scans a `data.frame` object for errors that may affect the use
#' of functions in `metan`. By default, all variables are checked regarding
#' the class (numeric or factor), missing values, and presence of possible
#' outliers. The function will return a warning if the data looks like
#' unbalanced, has missing values or possible outliers.
#'
#' @param .data The data to be analyzed
#' @param ... The variables in `.data` to check. If no variable is
#' informed, all the variables in `.data` are used.
#' @param plot Create a plot to show the check? Defaults to `FALSE`.
#' @param threshold Maximum number of levels allowed in a character / factor
#' column to produce a plot. Defaults to 15.
#' @param verbose Logical argument. If `TRUE` (default) then the results
#' for checks are shown in the console.
#'
#' @return A tibble with the following variables:
#' * **Variable** The name of variable
#' * **Class** The class of the variable
#' * **Missing** Contains missing values?
#' * **Levels** The number of levels of a factor variable
#' * **Valid_n** Number of valid n (omit NAs)
#' * **Outlier** Contains possible outliers?
#' @md
#' @importFrom GGally wrap
#' @export
#' @author Tiago Olivoto \email{tiagoolivoto@@gmail.com}
#'
#' @examples
#' \donttest{
#' library(metan)
#' inspect(data_ge)
#'
#' # Create a toy example with messy data
#' df <- data_ge2[-c(2, 30, 45, 134), c(1:5)] %>% as.data.frame()
#' df[c(1, 20, 50), 5] <- NA
#' df[40, 4] <- "2..814"
#'
#' inspect(df)
#' }
inspect <- function (.data,
...,
plot = FALSE,
threshold = 15,
verbose = TRUE) {
if(!missing(...)){
.data <- select(.data, ...)
} else{
.data <- .data
}
df <-
data.frame(
Class = sapply(.data, class),
Missing= sapply(.data, function(x){ifelse(any(is.na(x)), "Yes", "No")}),
Levels = sapply(.data, function(x){ifelse(!is.numeric(x), nlevels(x), "-")}),
Valid_n = sapply(.data, function(x){length(which(!is.na(x)))}),
Min = sapply(.data, function(x){ifelse(is.numeric(x), round(min(x, na.rm = TRUE),2), NA)}),
Median = sapply(.data, function(x){ifelse(is.numeric(x), round(median(x, na.rm = TRUE),2), NA)}),
Max = sapply(.data, function(x){ifelse(is.numeric(x), round(max(x, na.rm = TRUE),2), NA)}),
Outlier = sapply(.data, function(x){ifelse(is.numeric(x), find_outliers(x, verbose = F), NA)}),
Text = sapply(.data, function(x){ifelse(!is.numeric(x) & !is.factor(x), find_text_in_num(x), NA)})
) %>%
rownames_to_column("Variable") %>%
as_tibble()
lvls <- as.numeric(as.character(df[which(df[4] != "-"),][4]$Levels))
esp_nrows <- prod(lvls[lvls!=0])
if(verbose == TRUE){
print(df)
nfactors <- sum(lapply(.data, is.factor) == TRUE)
if(esp_nrows != nrow(.data)){
warning("Considering the levels of factors, .data should have ",
esp_nrows, " rows, but it has ", nrow(.data),
". Use 'as_factor()' for coercing a variable to a factor.", call. = F)
}
if(any(sapply(.data, grepl, pattern = ":"))){
warning("Using ':' in labels can result an error in some functions. Use '_' instead.", call. = FALSE)
}
if (nfactors < 3){
warning("Expected three or more factor variables. The data has only ", nfactors, ".", call. = F)
}
if(any(df$Missing == "Yes")){
warning("Missing values in variable(s) ",
paste(df$Variable[c(which(df$Missing == "Yes"))], collapse = ", "), ".", call. = F)
}
if(any(!is.na(df$Text))){
warning("Possible text fragments in variable(s) ",
paste(df$Variable[c(which(!is.na(df$Text)))], collapse = ", "), ".", call. = F)
}
if(any(df$Outlier[!is.na(df$Outlier)] != 0)){
warning("Possible outliers in variable(s) ",
paste(df$Variable[c(which(df$Outlier != 0))], collapse = ", "),
". Use 'find_outliers()' for more details.", call. = F)
}
if(has_zero(.data)){
warning("Zero values observed in variable(s) ",
paste(names(select_cols_zero(.data, verbose = FALSE)),
collapse = ", "), ".", call. = FALSE)
}
if(nfactors >= 3 && esp_nrows == nrow(.data) && all(df$Missing == "No") && all(df$Outlier[!is.na(df$Outlier)] == 0) == TRUE && !has_zero(.data)){
message("No issues detected while inspecting data.")
}
}
if(plot == TRUE){
for (col in names(.data)) {
data_col <- .data[[col]]
if (!is.numeric(data_col)) {
level_length <- length(levels(data_col))
if (level_length > threshold) {
stop(
"Column '", col, "' has more levels (", level_length, ")",
" than the threshold (", threshold, ") allowed.\n",
"Please remove the column or increase the 'threshold' argument. Increasing the threshold may produce long processing times",
call. = FALSE)
}
}
}
my_smooth <- function(data, mapping, method = "lm", ...){
ggplot(data = data, mapping = mapping) +
geom_point(alpha = 0.65) +
geom_smooth(method=method,
se = FALSE,
size = 0.5,
color = "red")
}
ggpair <-
.data %>%
ggpairs(lower = NULL,
cardinality_threshold = threshold,
diag = list(continuous = wrap("densityDiag",
size = 0.2),
discrete = wrap("barDiag",
color = "black",
size = 0.2)),
upper = list(continuous = my_smooth,
discrete = wrap("facetbar",
color = "black",
size = 0.2),
combo = wrap("box_no_facet",
outlier.color = "red",
outlier.alpha = 0.7,
outlier.size = 0.8,
size = 0.2,
color = "black")))+
theme(panel.spacing = unit(0.05, "cm"),
panel.grid = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.text.y = element_text(color = "black"))
suppressMessages(suppressWarnings(print(ggpair, progress = FALSE)))
}
invisible(df)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.