# check_data.R
# ::rtemis::
# 2022 E.D. Gennatas rtemis.org
#' Check Data
#'
#' @param x data.frame, data.table or similar structure
#' @param name Character: Name of dataset
#' @param get_duplicates Logical: If TRUE, check for duplicate cases
#' @param get_na_case_pct Logical: If TRUE, calculate percent of NA values per
#' case
#' @param get_na_feature_pct Logical: If TRUE, calculate percent of NA values
#' per feature
#'
#' @author E.D. Gennatas
#' @export
#'
#' @examples
#' \dontrun{
#' n <- 1000
#' x <- rnormmat(n, 50, return.df = TRUE)
#' x$char1 <- sample(letters, n, TRUE)
#' x$char2 <- sample(letters, n, TRUE)
#' x$fct <- factor(sample(letters, n, TRUE))
#' x <- rbind(x, x[1, ])
#' x$const <- 99L
#' x[sample(nrow(x), 20), 3] <- NA
#' x[sample(nrow(x), 20), 10] <- NA
#' x$fct[30:35] <- NA
#' check_data(x)
#' }
#'
check_data <- function(x,
name = NULL,
get_duplicates = TRUE,
get_na_case_pct = FALSE,
get_na_feature_pct = FALSE) {
if (is.null(name)) name <- deparse(substitute(x))
x <- as.data.table(x)
n_rows <- NROW(x)
n_cols <- NCOL(x)
# Data Types ----
classes <- sapply(x, \(v) class(v)[1])
counts <- table(classes)
## Numeric ----
n_numeric <- max0(counts["numeric"])
## Integers ----
# index_integer <- which(sapply(x, is.integer))
# n_integer <- length(index_integer)
n_integer <- max0(counts["integer"])
## Characters ----
# index_character <- which(sapply(x, is.character))
# n_character <- length(index_character)
n_character <- max0(counts["character"])
## Factors ----
index_factor <- which(sapply(x, is.factor))
n_factor <- length(index_factor)
index_ordered <- which(sapply(x, is.ordered))
n_ordered <- length(index_ordered)
# index_gt2levels_nonordered <- which(
# sapply(
# x[, setdiff(index_factor, index_ordered), drop = FALSE],
# \(x) length(levels(x))
# ) > 2
# )
# n_gt2levels_nonordered <- length(index_gt2levels_nonordered)
## Dates ----
# index_date <- which(
# sapply(x, \(col) inherits(col, "Date"))
# )
# n_date <- length(index_date)
n_date <- sum(
max0(counts["Date"]),
max0(counts["IDate"]),
max0(counts["POSIXct"]),
max0(counts["POSIXlt"])
)
# Issues ----
## Constants ----
index_constant <- which(sapply(x, is_constant))
n_constant <- length(index_constant)
## Duplicates ----
# cindex_dups <- which(duplicated(x))
# n_dups <- length(cindex_dups)
n_dups <- if (get_duplicates) {
n_rows - uniqueN(x)
} else {
NA
}
## NAs ----
cols_anyna <- which(sapply(x, anyNA))
n_cols_anyna <- length(cols_anyna)
index_na <- which(is.na(x))
n_na <- length(index_na)
## Get percent of NA values per feature and per case
if (n_cols_anyna > 0) {
na_feature_pct <- if (get_na_feature_pct) {
data.frame(
Feature = names(cols_anyna),
Pct_NA = sapply(seq_len(n_cols_anyna), \(i) {
sum(is.na(x[[cols_anyna[1]]])) / n_cols
})
)
} else {
NULL
}
index_incomplete <- which(!complete.cases(x))
n_incomplete <- length(index_incomplete)
na_case_pct <- if (get_na_case_pct) {
data.frame(
Case = index_incomplete,
Pct_NA = sapply(seq_len(n_incomplete), \(i) {
sum(is.na(x[index_incomplete[i], ])) / n_rows
})
)
} else {
NULL
}
# Get types of features with NA
classes_na <- table(classes[cols_anyna])
} else {
classes_na <- NULL
na_feature_pct <- na_case_pct <- rep(0, n_cols)
}
# Output ----
cd <- list(
class = class(x)[1],
name = name,
n_rows = n_rows,
n_cols = n_cols,
n_numeric = n_numeric,
n_integer = n_integer,
n_character = n_character,
n_factor = n_factor,
n_ordered = n_ordered,
n_date = n_date,
n_constant = n_constant,
n_dups = n_dups,
n_cols_anyna = n_cols_anyna,
n_na = n_na,
classes_na = classes_na,
na_feature_pct = na_feature_pct,
na_case_pct = na_case_pct
)
class(cd) <- c("CheckData", "list")
cd
} # rtemis::check_data
# chck <- function(x) {
# setDT(x)
# cat("Input has", NROW(x), "rows\n")
# }
# x <- iris
# class(x)
# chck(x)
# class(x)
# x <- data.frame(
# ID = c(101L, 102L, 103L),
# V1 = rnorm(3),
# V3 = c(3L, 5L, 7L)
# )
# sapply(x, is.double)
max0 <- function(x) max(x, 0, na.rm = TRUE)
#' Generate `CheckData` object description in HTML
#'
#' @param x `CheckData` object
#' @param name Character: Name of the data set
#' @param css List: CSS styles
#'
#' @author E.D. Gennatas
#' @export
tohtml <- function(x,
name = NULL,
css = list(
font.family = "Helvetica",
color = "#fff",
background.color = "#242424"
)) {
n_rows <- x$n_rows
n_cols <- x$n_cols
n_numeric <- x$n_numeric
n_integer <- x$n_integer
n_character <- x$n_character
n_factor <- x$n_factor
n_ordered <- x$n_ordered
n_date <- x$n_date
n_constant <- x$n_constant
n_dups <- x$n_dups
n_cols_anyna <- x$n_cols_anyna
n_na <- x$n_na
classes_na <- x$classes_na
na_feature_pct <- x$na_feature_pct
na_case_pct <- x$na_case_pct
## Data Types ----
numeric <- HTML(paste(
strong(n_numeric), "numeric",
ngettext(n_numeric, "feature", "features")
))
integer <- HTML(paste(
strong(n_integer), "integer",
ngettext(n_integer, "feature", "features")
))
categorical <- HTML(paste0(
strong(n_factor),
ngettext(n_factor, " factor", " factors"),
if (n_factor == 1) {
paste(", which", ngettext(n_ordered, "is", "is not"), "ordered")
} else if (n_factor > 1) {
paste(
", of which", strong(n_ordered),
ngettext(n_ordered, "is", "are"), "ordered"
)
}
))
# .col <- if (n_character > 0) html_orange else strong
.col <- strong
characters <- HTML(paste(
.col(n_character), "character",
ngettext(n_character, "feature", "features")
))
dates <- HTML(paste(
strong(n_date), "date",
ngettext(n_date, "feature", "features")
))
## Issues ----
.col <- if (n_constant > 0) html_red else strong
constants <- HTML(paste(
.col(n_constant), "constant",
ngettext(n_constant, "feature", "features")
))
.col <- if (n_dups > 0) html_orange else strong
duplicates <- HTML(paste(
.col(n_dups), "duplicate",
ngettext(n_dups, "case", "cases")
))
.col <- if (n_cols_anyna > 0) html_orange else strong
nas <- if (n_cols_anyna > 0) {
HTML(paste(
.col(n_cols_anyna),
ngettext(n_cols_anyna, "feature includes", "features include"),
"'NA' values; ",
.col(n_na), "'NA'",
ngettext(n_na, "value", "values"),
"total",
tags$ul(
lapply(seq_along(classes_na), \(i) {
tags$li(HTML(paste(
.col(classes_na[i]),
tolower(names(classes_na)[i])
# ngettext(classes_na[i], "feature", "features")
)))
})
)
))
} else {
HTML(paste(strong("0"), "missing values"))
}
## Recs ----
rec_char <- NULL
rec_constant <- if (n_constant > 0) {
tags$li(HTML(paste(html_orange(
"Remove the constant",
ngettext(n_constant, "feature", "features")
))))
} else {
NULL
}
rec_dups <- if (n_dups > 0) {
tags$li(HTML(paste(html_orange(
"Consider removing the duplicate",
ngettext(n_dups, "case", "cases")
))))
} else {
NULL
}
# rec_na <- if (n_cols_anyna > 0) {
# tags$li(HTML(paste(html_orange("Consider imputing missing values or use complete cases only"))))
# } else {
# NULL
# }
rec_na <- if (n_cols_anyna > 0) {
list(
if (isTRUE(classes_na["factor"] > 0)) {
tags$li(HTML(paste(html_orange(
"Consider assigning factor 'NA' values to new 'missing' level"
))))
},
tags$li(HTML(paste(html_orange(
"Consider imputing missing values or using algorithms that can handle missing values"
))))
)
} else {
NULL
}
recs <- if (sum(n_constant, n_dups, n_cols_anyna) == 0) {
tags$li(html_success("Everything looks good"))
} else {
list(
rec_constant,
rec_dups,
rec_na
)
}
## out ----
div(
p(
div(
html_highlight(name),
": A", x$class, "with",
html_highlight(n_rows),
ngettext(n_rows, "row", "rows"),
"and", html_highlight(n_cols),
ngettext(n_cols, "feature", "features"),
class = "checkdata-header"
)
),
p(
span(strong("Data types"), class = "sidelined"),
tags$ul(
tags$li(numeric),
tags$li(integer),
tags$li(categorical),
tags$li(characters),
tags$li(dates)
)
), # p Data Types
p(
span(strong("Issues"), class = "sidelined"),
tags$ul(
tags$li(constants),
tags$li(duplicates),
tags$li(nas)
)
), # p Issues
p(
span(strong("Recommendations"), class = "sidelined"),
tags$ul(
recs
)
), # p Recommendations
class = "checkData",
style = paste0(
"font-family:", css$font.family,
"; color:", css$color,
"; background-color:", css$background.color, ";"
)
)
}
#' Print `CheckData` object
#'
#' @method print CheckData
#' @param x `CheckData` object.
#' @param type Character: Output type: "plaintext" or "html".
#' @param name Character: Dataset name.
#' @param check_integers Logical: If TRUE and there are integer features, prints a
#' message to consider converting to factors.
#' @param css List with `font.family`, `color`, and `background.color` elements.
#' @param ... Not used.
#'
#' @author E.D. Gennatas
#' @export
print.CheckData <- function(x,
type = c("plaintext", "html"),
name = NULL,
check_integers = FALSE,
css = list(
font.family = "Helvetica",
color = "#fff",
background.color = "#242424"
),
...) {
if (is.null(name)) {
name <- x$name
if (is.null(name)) name <- deparse(substitute(x))
}
type <- match.arg(type)
n_rows <- x$n_rows
n_cols <- x$n_cols
n_numeric <- x$n_numeric
n_integer <- x$n_integer
n_character <- x$n_character
n_factor <- x$n_factor
n_ordered <- x$n_ordered
n_date <- x$n_date
n_constant <- x$n_constant
n_dups <- x$n_dups
n_cols_anyna <- x$n_cols_anyna
n_na <- x$n_na
na_feature_pct <- x$na_feature_pct
na_case_pct <- x$na_case_pct
if (type == "plaintext") {
# plaintext out ----
out <- paste0(
" ", hilite(name),
paste(
": A", x$class, "with",
hilite(n_rows), ngettext(n_rows, "row", "rows"),
"and", hilite(n_cols),
ngettext(n_cols, "column", "columns")
)
)
## Data Types ----
out <- paste(out,
bold("\n Data types"),
paste(
" *", bold(n_numeric), "numeric",
ngettext(n_numeric, "feature", "features")
),
paste(
" *", bold(n_integer), "integer",
ngettext(n_integer, "feature", "features")
),
sep = "\n"
)
isOrdered <- if (n_factor == 1) {
paste(", which", ngettext(n_ordered, "is", "is not"), "ordered")
} else if (n_factor > 1) {
paste(", of which", bold(n_ordered), ngettext(n_ordered, "is", "are"), "ordered")
} else {
""
}
out <- paste(out,
paste0(
" * ", bold(n_factor),
ngettext(n_factor, " factor", " factors"),
isOrdered
),
sep = "\n"
)
out <- paste(out,
paste(
" *", bold(n_character), "character",
ngettext(n_character, "feature", "features")
),
sep = "\n"
)
out <- paste(out,
paste(
" *", bold(n_date), "date",
ngettext(n_date, "feature", "features")
),
sep = "\n"
)
## Issues ----
out <- paste(out,
bold("\n Issues"),
sep = "\n"
)
fmt <- ifelse(n_constant > 0, red, I)
out <- paste(out,
paste(
" *", bold(fmt(n_constant)), "constant",
ngettext(n_constant, "feature", "features")
),
sep = "\n"
)
fmt <- ifelse(n_dups > 0, orange, I)
out <- paste(out,
paste(
" *", bold(fmt(n_dups)), "duplicate",
ngettext(n_dups, "case", "cases")
),
sep = "\n"
)
nas <- if (n_cols_anyna > 0) {
classes_na <- x$classes_na
.col <- if (n_cols_anyna > 0) orange else I
paste(
bold(.col(n_cols_anyna)),
ngettext(n_cols_anyna, "feature includes", "features include"),
"'NA' values;",
bold(.col(n_na)), "'NA'",
ngettext(n_na, "value", "values"),
"total\n *",
paste0(
sapply(seq_along(classes_na), \(i) {
paste(
bold(.col(classes_na[i])),
tolower(names(classes_na)[i])
)
}),
collapse = "\n * "
)
)
} else {
paste(bold("0"), "missing values")
}
out <- paste0(out, "\n * ", nas)
## Recommendations ----
out <- paste(out,
bold("\n Recommendations"),
sep = "\n"
)
if (sum(n_character, n_constant, n_dups, n_cols_anyna) > 0) {
if (n_constant > 0) {
out <- paste(out,
bold(red(paste(
" * Remove the constant",
ngettext(n_constant, "feature", "features")
))),
sep = "\n"
)
}
if (n_dups > 0) {
out <- paste(out,
bold(orange(paste(
" * Consider removing the duplicate",
ngettext(n_dups, "case", "cases")
))),
sep = "\n"
)
}
if (n_cols_anyna > 0) {
out <- paste(out,
bold(orange(paste(
" * Consider imputing missing values or use complete cases only"
))),
sep = "\n"
)
}
if (check_integers && n_integer > 0) {
out <- paste(out,
paste0(
" * Check the",
ifelse(n_integer > 1, paste("", n_integer, ""), " "),
"integer",
ngettext(n_integer, " feature", " features"),
" and consider if",
ngettext(n_integer, " it", " they"), " should be converted to ",
ngettext(n_integer, "factor", "factors")
),
sep = "\n"
)
}
} else {
out <- paste(out,
green(" * Everything looks good", bold = TRUE),
sep = "\n"
)
}
cat(out, "\n")
} else {
# HTML ----
htmltools::html_print(
tohtml(x),
background = css$background.color
)
}
invisible(x)
} # print.CheckData
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.