#' Run data check pipeline to seek for potential problems with the data
#'
#' @param data A data source, that is one of the major R formats: data.table, data.frame,
#' matrix, and so on.
#' @param y A string that indicates a target column name for regression or classification.
#' Either y, or pair: time, status can be used. By default NULL.
#' @param time A string that indicates a time column name for survival analysis task.
#' Either y, or pair: time, status can be used. By default NULL.
#' @param status A string that indicates a status column name for survival analysis task.
#' Either y, or pair: time, status can be used. By default NULL.
#' @param type A character, one of `binary_clf`/`regression`/`survival`/`auto`/`multiclass` that
#' sets the type of the task. If `auto` (the default option) then
#' the function will figure out `type` based on the number of unique values
#' in the `y` variable, or the presence of time/status columns.
#' @param verbose A logical value, if set to TRUE, provides all information about
#' the process, if FALSE gives none.
#' @param check_correlation A logical value, if set to TRUE, provides information about
#' the correlations between numeric, and categorical pairs of variables.
#' Available only when verbose is set to TRUE. Default value is TRUE.
#'
#' @return A list with two vectors: lines of the report (str) and the outliers (outliers).
#' @export
#'
#' @examples
#' check_data(lisbon, 'Price')
#' @importFrom stats IQR cor median sd
#' @importFrom utils capture.output
check_data <- function(data, y = NULL, time = NULL, status = NULL, type = 'auto', verbose = TRUE, check_correlation = TRUE) {
options(warn = -1)
if (is.null(y)) {
if (is.null(time) | is.null(status)) {
verbose_cat(crayon::red('\u2716'), 'Lack of target variables. Please specify',
'either y (for classification or regression tasks), or time and',
'status (for survival analysis). \n\n', verbose = verbose)
stop('Lack of target variables. Please specify either y (for classification
or regression tasks), or time and status (for survival analysis)')
}
} else {
if (!is.null(time) | !is.null(status)) {
verbose_cat(crayon::red('\u2716'), 'Provided too many targets. Please specify',
'either y (for classification or regression tasks), or time and',
'status (for survival analysis). \n\n', verbose = verbose)
stop('Provided too many targets. Please specify either y (for classification
or regression tasks), or time and status (for survival analysis).')
}
}
if (type == 'auto') {
type <- guess_type(data, y)
if (type == 'regression') {
data[[y]] <- as.numeric(data[[y]])
}
verbose_cat(crayon::green('\u2714'), 'Type guessed as:', type, '\n\n', verbose = verbose)
} else if (!type %in% c('regression', 'binary_clf', 'survival', 'multiclass')) {
verbose_cat(crayon::red('\u2716'), 'Invalid value. Correct task types are: `binary_clf`, `regression`, `survival`, `multiclass`, and `auto` for automatic task identification \n\n', verbose = verbose)
stop('Invalid value. Correct task types are: `binary_clf`, `regression`, `survival`, `multiclass`, and `auto` for automatic task identification')
} else {
verbose_cat(crayon::green('\u2714'), 'Type provided as:', type, '\n\n', verbose = verbose)
}
if (type == 'survial') {
if (!status %in% colnames(data) || !time %in% colnames(data)) {
verbose_cat(crayon::red('\u2716'), 'Provided target column name for time or status parameters',
status, time, 'is not present in the datataset. \n\n', verbose = verbose)
stop('Provided target column name for time or status parameter is not present in the datataset.')
}
} else if (!y %in% colnames(data)) {
verbose_cat(crayon::red('\u2716'), 'Provided target column name for y parameter', y,
'is not present in the datataset. \n\n', verbose = verbose)
stop('Provided target column name for y parameter is not present in the datataset.')
}
df <- as.data.frame(data)
str <- capture.output(cat(' -------------------- **CHECK DATA REPORT** -------------------- \n \n'))
verbose_cat(' -------------------- CHECK DATA REPORT -------------------- \n \n', verbose = verbose)
str <- c(str, basic_info(df, y, time, status, verbose))
str <- c(str, check_static(df, verbose))
str <- c(str, check_duplicate_col(df, verbose))
str <- c(str, check_missing(df, y, time, status, verbose))
df <- manage_missing(df, y)
str <- c(str, check_dim(df, verbose))
if (check_correlation) {
str <- c(str, check_cor(df, y, time, status, verbose)$str)
}
rtr <- check_outliers(df, verbose)
str <- c(str, rtr$str)
str <- c(str, check_y_balance(df, y, time, status, type, verbose))
str <- c(str, detect_id_columns(df, verbose))
verbose_cat(' -------------------- CHECK DATA REPORT END -------------------- \n \n', verbose = verbose)
str <- c(str,
capture.output(cat(' -------------------- **CHECK DATA REPORT END** -------------------- \n \n')))
ret <- list(
str = str,
outliers = rtr$outliers
)
return(ret)
}
#' Print the provided cat-like input if verbose is TRUE
#'
#' @param ... R objects - strings, (see `cat` documentation).
#' @param sep A character vector of strings to append after each element.
#' @param verbose A logical value indicating whether we want to print the string
#' or not.
#'
#' @export
verbose_cat <- function(..., sep = ' ', verbose = TRUE) {
if (verbose) {
cat(..., sep = sep)
}
}
#' Guess task type by the target value from the dataset
#'
#' For now, supported types are binary classification and regression.
#' Multilabel classification is planned to be added later on.
#'
#' @param data A data source, that is one of the major R formats: data.table, data.frame,
#' matrix, and so on.
#' @param y A string that indicates a target column name.
#' @param max_unique An integer describing the maximal number of unique
#' values in `y`.
#'
#' @return A string describing the type of ml task: `binary_clf`, `multiclass`,
#' `regression`, or `survival`.
#' @export
guess_type <- function(data, y, max_unique = 15) {
if (is.null(y)) {
type <- 'survival'
} else {
target <- data[[y]]
if (is.numeric(target)) {
if (length(unique(target)) == 2) {
type <- 'binary_clf'
} else if (length(unique(target)) <= max_unique) {
type <- 'multiclass'
} else {
type <- 'regression'
}
} else if (length(unique(target)) == 2) {
type <- 'binary_clf'
} else if (length(unique(target)) <= max_unique) {
type <- 'multiclass'
} else {
data[[y]] <- as.numeric(data[[y]])
type <- 'regression'
}
}
return(type)
}
#' Provide basic dataset information
#'
#' @param df A data source, that is one of the major R formats: data.table, data.frame,
#' matrix, and so on.
#' @param y A string that indicates a target column name for regression or classification.
#' Either y, or pair: time, status can be used. By default NULL.
#' @param time A string that indicates a time column name for survival analysis task.
#' Either y, or pair: time, status can be used. By default NULL.
#' @param status A string that indicates a status column name for survival analysis task.
#' Either y, or pair: time, status can be used. By default NULL.
#' @param verbose A logical value, if set to TRUE, provides all information about
#' the process, if FALSE gives none.
#'
#' @return A list with every line of the sub-report.
#'
#' @export
basic_info <- function(df, y = NULL, time = NULL, status = NULL, verbose = TRUE) {
# Distinction between survival analysis and other tasks.
if (!is.null(y)) {
target <- paste0('a column ', y, '.')
target2 <- paste0('a column** ', y, '.')
} else {
target <- paste0('columns ', time, ' for time and ', status, ' for status.')
target2 <- paste0('columns** ', time, ' for time and ', status, ' for status.')
}
verbose_cat('The dataset has ', nrow(df), ' observations and ', ncol(df),
' columns, which names are: \n', paste0(colnames(df), sep='; '),
'\n\nWith the target described by ', target, '\n\n', sep = '', verbose = verbose)
str <- capture.output(cat('**The dataset has ', nrow(df), ' observations and ', ncol(df),
' columns which names are: **\n\n', paste0(colnames(df), sep='; '),
'\n\n **With the target described by** ', target, '\n \n', sep = ''))
return(str)
}
#' Search for columns dominated by a single value
#'
#' @param df A data source, that is one of the major R formats: data.table, data.frame,
#' matrix, and so on.
#' @param verbose A logical value, if set to TRUE, provides all information about
#' the process, if FALSE gives none.
#'
#' @return A list with every line of the sub-report.
#'
#' @export
check_static <- function(df, verbose = TRUE) {
dominator <- FALSE
dominator_vals <- c()
dominator_cols <- c()
for (i in 1:ncol(df)) {
if (any(table(df[, i]) / nrow(df) > 0.99)) {
is_dominating <- (table(df[, i]) / nrow(df) > 0.99)
for (j in 1:length(is_dominating)) {
if (is_dominating[j]) {
dominator <- TRUE
dominator_vals <- c(dominator_vals, names(is_dominating)[j], '; ')
dominator_cols <- c(dominator_cols, names(df)[i], '; ')
}
}
}
}
if (!dominator) {
verbose_cat(crayon::green('\u2714'), 'No static columns. \n', verbose = verbose)
str <- capture.output(cat('**No static columns. **\n\n'))
} else {
verbose_cat(crayon::red('\u2716'), ' Static columns are: \n ', dominator_cols, '\n\n', sep = '', verbose = verbose)
verbose_cat(crayon::red('\u2716'), ' With dominating values: \n', ' ', dominator_vals, '\n ', sep = '', verbose = verbose)
str <- capture.output(cat('**Static columns are: **', dominator_cols, '\n\n', sep = ''))
str <- c(str, capture.output(cat('**With dominating values: **', dominator_vals,
'\n\n', sep = '')))
}
verbose_cat('\n', verbose = verbose)
str <- c(str, capture.output(cat('\n')))
return(str)
}
#' Search for duplicates between columns
#'
#' @param df A data source, that is one of the major R formats: data.table, data.frame,
#' matrix, and so on.
#' @param verbose A logical value, if set to TRUE, provides all information about
#' the process, if FALSE gives none.
#'
#' @return A list with every line of the sub-report.
#'
#' @export
check_duplicate_col <- function(df, verbose = TRUE) {
pairs <- c()
for (i in 1:ncol(df)) {
for (j in i:ncol(df)) {
if (i != j && identical(df[, i], df[, j])) {
pairs <- c(pairs, c(colnames(df[i]), ' - ', colnames(df[j]), '; '))
}
}
}
if (length(pairs) == 0) {
verbose_cat(crayon::green('\u2714'), 'No duplicate columns.\n', verbose = verbose)
str <- capture.output(cat('**No duplicate columns.**\n'))
} else {
verbose_cat(crayon::red('\u2716'), ' These column pairs are duplicate:\n ', pairs, '\n', sep = '', verbose = verbose)
str <- capture.output(cat('**These column pairs are duplicate: **\n', pairs, '\n\n',
sep = ''))
}
verbose_cat('\n', verbose = verbose)
str <- c(str, capture.output(cat('\n')))
return(str)
}
#' Search for missing values in the target column and predictors
#'
#' @param df A data source, that is one of the major R formats: data.table, data.frame,
#' matrix, and so on.
#' @param y A string that indicates a target column name for regression or classification.
#' Either y, or pair: time, status can be used. By default NULL.
#' @param time A string that indicates a time column name for survival analysis task.
#' Either y, or pair: time, status can be used. By default NULL.
#' @param status A string that indicates a status column name for survival analysis task.
#' Either y, or pair: time, status can be used. By default NULL.
#' @param verbose A logical value, if set to TRUE, provides all information about
#' the process, if FALSE gives none.
#'
#' @return A list with every line of the sub-report.
#'
#' @export
check_missing <- function(df, y = NULL, time = NULL, status = NULL, verbose = TRUE) {
# Distinction between survival analysis and other tasks.
if (!is.null(y)) {
df_sm <- df[, !names(df) %in% y]
missing_y <- length(df[[y]][df[[y]] == ''])
} else {
df_sm <- df[, !names(df) %in% c(time, status)]
missing_y <- length(df[df[[time]] == '' & df[[status]] == '', ])
}
missing_x <- 0
for (i in 1:nrow(df_sm)) {
if (length(df[i, ][df[i, ] == '']) != 0) {
missing_x <- missing_x + 1
}
}
if (missing_y == 0) {
verbose_cat(crayon::green('\u2714'), 'No target values are missing. \n\n', verbose = verbose)
str <- capture.output(cat('**No target values are missing. **\n\n'))
} else {
verbose_cat(crayon::red('\u2716'), ' ', missing_y,' Target values are missing. \n\n', sep = '', verbose = verbose)
str <- capture.output(cat(missing_y, ' **Target values are missing.**\n\n', sep = ''))
}
if (missing_x == 0) {
verbose_cat(crayon::green('\u2714'), 'No predictor values are missing. \n', verbose = verbose)
str <- c(str, capture.output(cat('**No predictor values are missing. **\n')))
} else {
verbose_cat(crayon::red('\u2716'), ' ', missing_x, ' observations have missing fields.\n', sep = '', verbose = verbose)
str <- c(str, capture.output(cat('** ', missing_x, ' observations have missing fields.**\n', sep = '')))
}
verbose_cat('\n', verbose = verbose)
str <- c(str, capture.output(cat('\n')))
return(str)
}
#' Search for dimensionality problems in the dataset
#'
#' @param df A data source, that is one of the major R formats: data.table, data.frame,
#' matrix, and so on.
#' @param verbose A logical value, if set to TRUE, provides all information about
#' the process, if FALSE gives none.
#'
#' @return A list with every line of the sub-report.
#'
#' @export
check_dim <- function(df, verbose = TRUE) {
rows <- dim(df)[1]
cols <- dim(df)[2]
if (cols > 30) {
verbose_cat(crayon::red('\u2716'), ' Too big dimensionality with ', cols, ' colums. Forest models wont use so many of them. \n', sep = '', verbose = verbose)
str <- capture.output(cat('**Too big dimensionality with ', cols, ' colums. Forest models wont use so many of them. **\n', sep = ''))
}
if (cols >= rows) {
verbose_cat(crayon::red('\u2716'), 'More features than observations, try reducing dimensionality or add new observations. \n', verbose = verbose)
str <- capture.output(cat('**More features than observations, try reducing dimensionality or add new observations. **\n'))
}
if (cols < rows && cols <= 30) {
verbose_cat(crayon::green('\u2714'), 'No issues with dimensionality. \n', verbose = verbose)
str <- capture.output(cat('**No issues with dimensionality. **\n'))
}
verbose_cat('\n', verbose = verbose)
str <- c(str, capture.output(cat('\n')))
return(str)
}
#' Search for strongly correlated values (Spearman's rank for numerical, Crammer's V for
#' categorical)
#'
#' @param df A data source, that is one of the major R formats: data.table, data.frame,
#' matrix, and so on.
#' @param y A string that indicates a target column name for regression or classification.
#' Either y, or pair: time, status can be used. By default NULL.
#' @param time A string that indicates a time column name for survival analysis task.
#' Either y, or pair: time, status can be used. By default NULL.
#' @param status A string that indicates a status column name for survival analysis task.
#' Either y, or pair: time, status can be used. By default NULL.
#' @param verbose A logical value, if set to TRUE, provides all information about
#' the process, if FALSE gives none.
#'
#' @return A list with every line of the sub-report.
#'
#' @export
check_cor <- function(df, y = NULL, time = NULL, status = NULL, verbose = TRUE) {
# Distinction between survival analysis and other tasks.
if (!is.null(y)) {
data <- df[, !names(df) %in% y]
} else {
data <- df[, !names(df) %in% c(time, status)]
}
data <- as.data.frame(unclass(data), stringsAsFactors = TRUE)
num_idx <- c()
fct_idx <- c()
for (i in 1:ncol(data)) {
if (is.factor(data[, i])) {
if (length(levels(data[, i])) > 1) {
# CramerV doesn't work with a singular level, and we check columns
# with a single value before, so no need for an additional warning.
fct_idx <- c(fct_idx, i)
}
} else {
num_idx <- c(num_idx, i)
}
}
num_names <- NULL
cor_num <- NULL
if (length(num_idx) != 0) {
cor_num <- round(cor(data[num_idx]), 2)
no_cor_num <- TRUE
num_names <- colnames(cor_num)
for (i in 1:ncol(cor_num)) {
for (j in i:ncol(cor_num)) {
if (is.na(abs(cor_num[i, j]) >= 0.7)) {
strong_Spearman_cor <- FALSE
} else {
strong_Spearman_cor <- (abs(cor_num[i, j]) >= 0.7)
}
if (i != j && strong_Spearman_cor) {
if (no_cor_num) {
verbose_cat(crayon::red('\u2716'), 'Strongly correlated, by Spearman rank, pairs of numerical values are: \n', verbose = verbose, '\n')
str <- capture.output(cat('**Strongly correlated, by Spearman rank, pairs of numerical values are: **\n\n'))
no_cor_num <- FALSE
}
verbose_cat(' ', num_names[i], ' - ', num_names[j], ': ', cor_num[i, j], ';\n', sep = '', verbose = verbose)
str <- c(str, capture.output(
cat(' ', num_names[i], ' - ', num_names[j], ': ', cor_num[i, j], ';\n', sep = '')))
}
}
}
if (no_cor_num) {
verbose_cat(crayon::green('\u2714'), 'No strongly correlated, by Spearman rank, pairs of numerical values. \n', verbose = verbose)
str <- capture.output(cat('**No strongly correlated, by Spearman rank, pairs of numerical values. **\n\n'))
}
verbose_cat('\n', verbose = verbose)
}
fct_names <- NULL
cor_fct <- NULL
if (length(fct_idx) != 0) {
fct_tbl <- data[fct_idx]
cor_fct <- matrix(0, nrow = length(fct_idx), ncol = length(fct_idx))
colnames(cor_fct) <- names(fct_tbl)
row.names(cor_fct) <- names(fct_tbl)
for (i in 1:length(fct_idx)) {
for (j in 1:length(fct_idx)) {
a = as.numeric(length(unique(fct_tbl[, i])))
b = as.numeric(length(unique(fct_tbl[, j])))
if (as.numeric(a * b) <= 2^29 - 1){ # with 2^31 we have to big vector that can't be allocated (over 7.4 Gb)
cor_fct[i, j] <- round(rcompanion::cramerV(fct_tbl[, i], fct_tbl[, j]), 2)
} else {
cor_fct[i, j] <- NA
verbose_cat(crayon::red('\nWARNING!'), ' Correlation: ', colnames(cor_fct)[i], ' - ', colnames(cor_fct)[j], ' was ommited because of too much unique values. \n', verbose = verbose)
str <- c(str, capture.output(cat('\nWARNING!', ' Correlation: ', colnames(cor_fct)[i], ' - ', colnames(cor_fct)[j], ' was ommited because of too much unique values. **\n')))
}
}
}
no_cor_fct = TRUE
fct_names <- colnames(cor_fct)
for (i in 1:ncol(cor_fct)) {
for (j in i:ncol(cor_fct)) {
if (is.na(abs(cor_fct[i, j]) >= 0.7)) {
strong_V_cor <- FALSE
} else {
strong_V_cor <- (abs(cor_fct[i, j]) >= 0.7)
}
if (i != j && strong_V_cor) {
if (no_cor_fct) {
verbose_cat(crayon::red('\u2716'), 'Strongly correlated, by Crammer\'s V rank, pairs of categorical values are: \n', verbose = verbose)
str <- c(str, capture.output(cat('\n', '**Strongly correlated, by Crammer\'s V rank, pairs of categorical values are: **\n\n')))
no_cor_fct = FALSE
}
verbose_cat(' ', fct_names[i], ' - ', fct_names[j], ': ', cor_fct[i, j], ';\n', sep = '', verbose = verbose)
str <- c(str, capture.output(cat(' ', fct_names[i], ' - ', fct_names[j], ': ', cor_fct[i, j], ';\n', sep = '')))
}
}
}
if (no_cor_fct) {
verbose_cat(crayon::green('\u2714'), 'No strongly correlated, by Crammer\'s V rank, pairs of categorical values. \n', verbose = verbose)
str <- c(str, capture.output(
cat('**No strongly correlated, by Crammer\'s V rank, pairs of categorical values. **\n')))
}
verbose_cat('\n', verbose = verbose)
}
str <- c(str, capture.output(cat('\n')))
return(
list(
str = str,
num_names = num_names,
cor_num = cor_num,
fct_names = fct_names,
cor_fct = cor_fct
)
)
}
#' Search for outliers via mean standard deviation, median absolute deviation
#' and inter quantile range
#'
#' @param df A data source, that is one of the major R formats: data.table, data.frame,
#' matrix, and so on.
#' @param verbose A logical value, if set to TRUE, provides all information about
#' the process, if FALSE gives none.
#'
#' @return A list with every line of the sub-report.
#'
#' @export
check_outliers <- function(df, verbose = TRUE) {
# Methods from: https://www.reneshbedre.com/blog/find-outliers.html
mean_standard_deviation <- function(x) {
mean <- mean(x)
std <- sd(x)
Tmin <- mean - (3 * std)
Tmax <- mean + (3 * std)
return (which(x < Tmin | x > Tmax))
}
median_absolute_deviation <- function(x) {
med <- median(x)
abs_dev <- abs(x - med)
mad <- 1.4826 * median(abs_dev) # for normally distributed data
Tmin <- med - (3 * mad)
Tmax <- med + (3 * mad)
return (which(x < Tmin | x > Tmax))
}
inter_quantile_range <- function(x) {
Tmin <- summary(x)[2] - (1.5 * IQR(x))
Tmax <- summary(x)[4] + (1.5 * IQR(x))
return (which(x < Tmin | x > Tmax))
}
data <- as.data.frame(unclass(df), stringsAsFactors = TRUE)
num_idx <- c()
for (i in 1:ncol(data)) {
if (!is.factor(data[, i])) {
num_idx <- c(num_idx, i)
}
}
data_num <- data[num_idx]
outliers <- c()
if (ncol(data_num) != 0) {
for (i in 1:ncol(data_num)) {
outliers_tmp <- c()
outliers_tmp <- c(outliers_tmp, mean_standard_deviation(data_num[, i]))
outliers_tmp <- c(outliers_tmp, median_absolute_deviation(data_num[, i]))
outliers_tmp <- c(outliers_tmp, inter_quantile_range(data_num[, i]))
bool_outlier <- table(outliers_tmp)
outliers <- c(outliers, names(bool_outlier[bool_outlier == 3]))
}
outliers <- unique(outliers)
outliers <- sort(outliers)
if (length(outliers) == 0) {
verbose_cat(crayon::green('\u2714'), 'No outliers in the dataset. \n', verbose = verbose)
str <- capture.output(cat('**No outliers in the dataset. **\n'))
} else if (length(outliers) < 50) {
verbose_cat(crayon::red('\u2716'), 'These observations migth be outliers due to their numerical columns values: \n', outliers, ';\n', verbose = verbose)
str <- capture.output(cat('**These observations migth be outliers due to their numerical columns values: **\n\n', outliers, ';\n'))
} else {
verbose_cat(crayon::red('\u2716'), 'There are more than 50 possible outliers in the data set, so we are not printing them. They are returned in the output as a vector. \n', verbose = verbose)
str <- capture.output(cat('**There are more than 50 possible outliers in the data set, so we are not printing them. They are returned in the output as a vector. **\n'))
}
verbose_cat('\n', verbose = verbose)
str <- c(str, capture.output(cat('\n')))
outliers <- as.numeric(outliers)
} else {
verbose_cat(crayon::green('\u2714'), 'No numeric data in the dataset. \n', verbose = verbose)
str <- capture.output(cat('**No numeric data in the dataset. **\n'))
verbose_cat('\n', verbose = verbose)
}
rtr <- list(
str = str,
outliers = outliers
)
return(rtr)
}
#' Check whether the target column is unbalanced (for regression it bins values
#' via quantiles)
#'
#' @param df A data source, that is one of the major R formats: data.table, data.frame,
#' matrix, and so on.
#' @param y A string that indicates a target column name for regression or classification.
#' Either y, or pair: time, status can be used. By default NULL.
#' @param time A string that indicates a time column name for survival analysis task.
#' Either y, or pair: time, status can be used. By default NULL.
#' @param status A string that indicates a status column name for survival analysis task.
#' Either y, or pair: time, status can be used. By default NULL.
#' @param type A character, one of `binary_clf`/`regression`/`survival`/`auto`/`multiclass` that
#' sets the type of the task. If `auto` (the default option) then
#' the function will figure out `type` based on the number of unique values
#' in the `y` variable, or the presence of time/status columns.
#' @param verbose A logical value, if set to TRUE, provides all information about
#' the process, if FALSE gives none.
#'
#' @return A list with every line of the sub-report.
#'
#' @export
check_y_balance <- function(df, y = NULL, time = NULL, status = NULL, type = 'auto', verbose = TRUE) {
# This part should not be provided in string for the report, as it only enhances
# the quality of standalone check_data function.
if (type == 'auto') {
type <- guess_type(df, y)
verbose_cat(crayon::green('\u2714'), 'Type guessed as:', type, '\n\n', verbose = verbose)
} else if (!type %in% c('regression', 'binary_clf', 'survival', 'multiclass')) {
verbose_cat(crayon::red('\u2716'), 'Invalid value. Correct task types are: `binary_clf`, `regression`, `survival`, `multiclass`, and `auto` for automatic task identification \n\n', verbose = verbose)
} else {
verbose_cat(crayon::green('\u2714'), 'Type provided as:', type, '\n\n', verbose = verbose)
}
# Distinction between survival analysis and other tasks.
if (!is.null(y)) {
target <- df[[y]]
} else {
target <- df[[status]]
}
if (type %in% c('binary_clf', 'survival')) {
if (table(target)[1] / table(target)[2] > 1.5 || table(target)[1] / table(target)[2] < 0.75) {
if (table(target)[1] > table(target)[2]) {
dominating <- rownames(table(target))[1]
} else {
dominating <- rownames(table(target))[2]
}
verbose_cat(crayon::red('\u2716'), 'Dataset is unbalanced with:', table(target)[1] / table(target)[2], 'proportion with', dominating, 'being a dominating class.\n', verbose = verbose)
str <- capture.output(
cat('**Dataset is unbalanced with:**', table(target)[1] / table(target)[2], 'proportion with', dominating, 'being a dominating class.\n'))
} else {
verbose_cat(crayon::green('\u2714'), 'Dataset is balanced. \n', verbose = verbose)
str <- capture.output(cat('**Dataset is balanced. **\n'))
}
} else if (type == 'regression') {
Q1 <- summary(target)[2]
Mean <- summary(target)[4]
Q2 <- summary(target)[5]
bins <- c(0, 0, 0, 0)
for (i in 1:length(target)) {
if (target[i] < Q1) {
bins[1] <- bins[1] + 1
} else if (target[i] < Mean) {
bins[2] <- bins[2] + 1
} else if (target[i] < Q2) {
bins[3] <- bins[3] + 1
} else {
bins[4] <- bins[4] + 1
}
}
balanced <- TRUE
for (i in 1:4) {
for (j in i:4) {
if (bins[j] == 0) {
if (bins[i] > 0) {
balanced <- FALSE
}
} else {
if (bins[i] / bins[j] > 1.5) {
balanced <- FALSE
}
}
}
}
perc_bins <- round(bins / sum(bins), 2)
if (balanced) {
verbose_cat(crayon::green('\u2714'), 'Target data is evenly distributed. \n', verbose = verbose)
str <- capture.output(cat('**Target data is evenly distributed. **\n'))
} else {
verbose_cat(crayon::red('\u2716'), 'Target data is not evenly distributed with quantile bins:', perc_bins, '\n', verbose = verbose)
str <- capture.output(cat('**Target data is not evenly distributed with quantile bins:**', perc_bins, '\n'))
}
} else if (type == 'multiclass') {
distribution <- table(target) / length(target)
num_class <- length(unique(target))
equal <- 1 / num_class
dominating <- c()
dominating_name <- c()
underrep <- c()
underrep_name <- c()
for (i in 1:length(distribution)) {
if (distribution[i] > 1.5 * equal) {
dominating_name <- c(dominating_name, names(distribution)[i])
dominating <- c(dominating, distribution[i])
} else if (distribution[i] < 0.75 * equal) {
underrep_name <- c(underrep_name, names(distribution)[i])
underrep <- c(underrep, distribution[i])
}
}
if (length(dominating) > 0 || length(underrep) > 0) {
verbose_cat(crayon::red('\u2716'), 'Target data is not evenly distributed. \n', verbose = verbose)
str <- capture.output(cat('**Target data is not evenly distributed. **\n\n'))
if (length(dominating) > 0) {
verbose_cat(crayon::red('\u2716'), 'The dominating classes are:', dominating_name, 'with shares equal to:', dominating,
'where the targeted ratio is', equal, '\n', verbose = verbose)
str <- c(str, capture.output(cat('**The dominating classes are:**', dominating_name, '\n\n**With shares equal to:**', dominating,
'\n\n**Where the targeted ratio is**', equal, '\n\n')))
}
if (length(underrep) > 0) {
verbose_cat(crayon::red('\u2716'), 'The underrepresented classes are:', underrep_name, 'with shares equal to:', underrep,
'where the targeted ratio is', equal, '\n', verbose = verbose)
str <- c(str, capture.output(cat('**The underrepresented classes are:**', underrep_name, '\n\n**With shares equal to:**', underrep,
'\n\n**Where the targeted ratio is**', equal, '\n')))
}
} else {
verbose_cat(crayon::green('\u2714'), 'Target data is evenly distributed. \n', verbose = verbose)
str <- capture.output(cat('**Target data is evenly distributed. **\n'))
}
}
verbose_cat('\n', verbose = verbose)
str <- c(str, capture.output(cat('\n')))
return(str)
}
#' Detect columns that are ID-like columns
#'
#' @param data A data source before preprocessing, that is one of the major R formats:
#' data.table, data.frame, matrix, and so on.
#' @param verbose A logical value, if set to TRUE, provides all information about
#' the process, if FALSE gives none.
#'
#' @return A list with every line of the sub-report.
#' @export
detect_id_columns <- function(data, verbose = TRUE) {
names <- colnames(data)
id_names <- c('id', 'nr', 'number', 'idx', 'identification', 'index')
sus_names <- c()
sus_data <- c()
for (i in 1:ncol(data)) {
if (tolower(names[i]) %in% id_names) {
sus_names <- c(sus_names, names[i])
}
if (isTRUE(all.equal(data[, i], as.integer(data[, i]))) &&
length(unique(data[, i])) == nrow(data)) {
sus_data <- c(sus_data, names[i])
}
}
if (length(sus_names) > 0) {
verbose_cat(crayon::red('\u2716'), 'Columns names suggest that some of them are IDs, removing them can improve the model.\n Suspicious columns are:',
sus_names, '.\n\n', verbose = verbose)
str <- capture.output(
cat('**Columns names suggest that some of them are IDs, removing them can improve the model. Suspicious columns are: **\n\n',
sus_names, '\n\n'))
} else {
verbose_cat(crayon::green('\u2714'), 'Columns names suggest that none of them are IDs. \n\n', verbose = verbose)
str <- capture.output(
cat('**Columns names suggest that none of them are IDs. **\n\n'))
}
if (length(sus_data) > 0) {
verbose_cat(crayon::red('\u2716'), 'Columns data suggest that some of them are IDs, removing them can improve the model.\n Suspicious columns are:',
sus_data, '.\n', verbose = verbose)
str <- c(str, capture.output(
cat('**Columns data suggest that some of them are IDs, removing them can improve the model. Suspicious columns are: **\n\n',
sus_data, '\n\n')))
} else {
verbose_cat(crayon::green('\u2714'), 'Columns data suggest that none of them are IDs. \n', verbose = verbose)
str <- c(str, capture.output(
cat('**Columns data suggest that none of them are IDs. **\n\n')))
}
verbose_cat('\n', verbose = verbose)
str <- c(str, capture.output(cat('\n')))
return(str)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.