#' Perform standard data checking and tidying actions before fitting a machine learning model
#'
#' @importFrom flipData GetData EstimationData EstimationDataTemplate DataFormula ErrorIfInfinity
#' @importFrom flipFormat Labels
#' @importFrom flipU OutcomeName AllVariablesNames
#' @importFrom flipTransformations AdjustDataToReflectWeights
#' @importFrom stats reformulate var
#' @noRd
prepareMachineLearningData <- function(formula, data, subset, subset.description,
                                       weights, weights.description, missing, seed,
                                       bootstrap.weights = TRUE, dummy = FALSE,
                                       strict.var.names = FALSE,
                                       allow.single.categories = TRUE)
{
    # Copy the formula to work past scoping issues in the car package:
    # https://cran.r-project.org/web/packages/car/vignettes/embedding.pdf.
    input.formula <- formula
    # Fall back to Labels() when the supplied description is missing, failed to
    # be computed (try-error), is empty, or when the subset already carries a
    # "name" attribute. (The last test was previously `length(...) > 0`, which
    # discarded every valid, non-empty user-supplied description.)
    if (!is.null(subset))
    {
        if (is.null(subset.description) || inherits(subset.description, "try-error") ||
            !is.null(attr(subset, "name")) || length(subset.description) == 0)
            subset.description <- Labels(subset)
        if (is.null(attr(subset, "name")))
            attr(subset, "name") <- subset.description
    }
    # Same fallback logic for the weights description.
    if (!is.null(weights))
    {
        if (is.null(weights.description) || inherits(weights.description, "try-error") ||
            !is.null(attr(weights, "name")) || length(weights.description) == 0)
            weights.description <- Labels(weights)
        if (is.null(attr(weights, "name")))
            attr(weights, "name") <- weights.description
    }
    data <- GetData(input.formula, data, auxiliary.data = NULL)
    # Character variables cannot be modelled directly; coerce them to factors
    # and warn the user which variables were converted.
    if (!is.null(data) && any(character.vars <- vapply(data, is.character, logical(1L)))) {
        data[character.vars] <- lapply(data[character.vars], as.factor)
        labels <- Labels(data, names(which(character.vars)))
        pasted.labels <- paste0(dQuote(labels), collapse = ", ")
        warning(sQuote("data"), " argument contains character variables (", pasted.labels, "). ",
                "These have been changed to categorical factor variables for analysis.")
    }
    # randomForest fails with data when variable names contain "$" even if
    # surrounded by backticks, so optionally sanitise names.
    if (strict.var.names)
    {
        vars <- AllVariablesNames(input.formula, data = data)
        ## strip off data set name so names look prettier in
        ## htmlwidget output if no label attribute is available for some variables
        vars <- sub("^[[:print:]]*[$](Variables|Questions)[$]", "", vars)
        vars <- sub("^`", "", vars)
        vars <- sub("`$", "", vars)
        vars <- make.names(vars)
        cnames <- colnames(data)
        cnames <- sub("^[[:print:]]*[$](Variables|Questions)[$]", "", cnames)
        cnames <- sub("^`", "", cnames)
        cnames <- sub("`$", "", cnames)
        cnames <- make.names(cnames)
        colnames(data) <- cnames
        input.formula <- formula(paste0(vars[1], "~", paste0(vars[-1], collapse = "+")))
    }
    row.names <- rownames(data)
    outcome.name <- OutcomeName(input.formula, data)
    outcome.i <- match(outcome.name, names(data))
    # Create EstimationDataTemplate as the data has not been broken into dummy variable (0/1)
    # coding and the outcome name has been standardized for randomForest
    estimation.data.template <- EstimationDataTemplate(data, outcome.name = outcome.name)
    if (dummy)
    {
        factor.levels <- lapply(data[, -outcome.i], levels)
        # convert factors with N levels to N-1 binary variables, used by LDA
        data <- cbind(data[outcome.i], AsNumeric(data[, -outcome.i, drop = FALSE], binary = TRUE, remove.first = TRUE))
        # remove constant variables caused by unpopulated levels
        ErrorIfInfinity(data)
        data <- data[, c(TRUE, sapply(data[, -1, drop = FALSE], var, na.rm = TRUE) != 0)]
        attr(data, "factor.levels") <- factor.levels
    }
    # A strictly 0/1 outcome is treated as binary classification.
    if (setequal(data[!is.na(data[, outcome.i]), outcome.i], c(0, 1)))
        data[, outcome.i] <- as.factor(data[, outcome.i])
    outcome.variable <- data[, outcome.i]
    numeric.outcome <- !is.factor(outcome.variable)
    variable.labels <- Labels(data)
    outcome.label <- variable.labels[outcome.i]
    if (outcome.label == "data[, outcome.name]")
        outcome.label <- outcome.name
    if (!is.null(weights) && length(weights) != nrow(data))
        stop("'weights' and 'data' are required to have the same number of observations. They do not.")
    if (!is.null(subset) && length(subset) > 1 && length(subset) != nrow(data))
        stop("'subset' and 'data' are required to have the same number of observations. They do not.")
    # Treatment of missing values.
    # rebuild formula because there are new variables for each level of factorial predictors
    if (dummy) {
        predictor.names <- colnames(data)[colnames(data) != outcome.name]
        input.formula <- reformulate(predictor.names, response = outcome.name,
                                     env = environment(input.formula))
    }
    processed.data <- EstimationData(input.formula, data, subset, weights, missing, seed = seed)
    unweighted.training.data <- processed.data$estimation.data
    ErrorIfInfinity(unweighted.training.data)
    n <- nrow(unweighted.training.data)
    if (n <= ncol(unweighted.training.data))
        stop("The sample size is too small. There should be more samples than predictor variables, but there are ",
             n, " samples and ", ncol(unweighted.training.data), " predictors.")
    cleaned.weights <- processed.data$weights
    # Resampling to generate a weighted sample, if necessary.
    weighted.training.data <- if (is.null(weights))
        unweighted.training.data
    else if (!bootstrap.weights)
        NULL
    else
        AdjustDataToReflectWeights(unweighted.training.data, cleaned.weights)
    level.counts <- vapply(weighted.training.data, nlevels, integer(1L))
    if (!allow.single.categories && any(level.counts == 1))
    {
        stop("Categorical predictors must have more than one category, after applying any ",
             "filter, weights and missing data treatment. This is not the case for: ",
             paste(names(level.counts)[level.counts == 1], collapse = ", "), ". ",
             "Please remove those variables to proceed.")
    }
    list(
        unweighted.training.data = unweighted.training.data,
        weighted.training.data = weighted.training.data,
        estimation.data.template = estimation.data.template,
        required.data = data,
        imputed.data = processed.data$data,
        cleaned.weights = cleaned.weights,
        data.formula = DataFormula(input.formula, data),
        row.names = row.names,
        unfiltered.weights = processed.data$unfiltered.weights,
        outcome.name = outcome.name,
        sample.description = processed.data$description,
        n = n,
        numeric.outcome = numeric.outcome,
        outcome.label = outcome.label,
        variable.labels = variable.labels,
        outcome.i = outcome.i,
        input.formula = input.formula,
        subset.missing.removed = processed.data$post.missing.data.estimation.sample
    )
}
#' Save standard data after fitting a machine learning model
#'
#' @importFrom flipU IsCount
#' @importFrom flipRegression ConfusionMatrix
#' @noRd
saveMachineLearningResults <- function(result, prepared.data, show.labels)
{
    # Attach the data, subset and weights used during fitting.
    result$model <- prepared.data$required.data
    result$subset <- prepared.data$subset.missing.removed # original subset with missing data removed
    result$weights <- prepared.data$unfiltered.weights
    result$formula <- prepared.data$input.formula

    # Descriptive information used by downstream print/summary methods.
    class(result) <- c("MachineLearning", class(result))
    result$outcome.name <- prepared.data$outcome.name
    result$sample.description <- prepared.data$sample.description
    result$n.observations <- prepared.data$n
    result$n.predictors <- ncol(prepared.data$unweighted.training.data) - 1
    result$estimation.data <- prepared.data$unweighted.training.data
    result$numeric.outcome <- prepared.data$numeric.outcome
    result$outcome.levels <- levels(result$estimation.data[[result$outcome.name]])

    # Replace variable names with labels when requested.
    result$show.labels <- show.labels
    if (show.labels)
    {
        result$outcome.label <- prepared.data$outcome.label
        result$variable.labels <- prepared.data$variable.labels[-prepared.data$outcome.i]
    }
    else
    {
        result$outcome.label <- result$outcome.name
        result$variable.labels <- names(prepared.data$required.data[-prepared.data$outcome.i])
    }

    # Attach the confusion matrix computed on the (filtered, weighted) sample.
    result$confusion <- ConfusionMatrix(result, result$subset, prepared.data$unfiltered.weights)
    result
}
#' @importFrom flipFormat FormatAsPercent
correctPredictionsText <- function(overall.accuracy, group.labels, group.accuracies, digits = 4, out.of.bag = FALSE)
{
    # Heading differs when the accuracy was computed on the out-of-bag sample.
    prefix <- "Correct predictions: "
    if (out.of.bag)
        prefix <- "Correct predictions (based on out-of-bag sample): "
    # Per-group accuracies rendered as "label: x%; label: y%; ...".
    by.group <- paste0(group.labels, ": ", FormatAsPercent(group.accuracies, digits), collapse = "; ")
    paste0(prefix, FormatAsPercent(overall.accuracy, digits), " (", by.group, ")")
}
# Compute performance metrics for a model with numeric outcome.
# Calculation uses cases when both pred and obs are not NA.
# The alternative definition of r^2 as cor(obs, pred)^2 is
# only valid when optimizing sum of squared errors.
# https://stats.stackexchange.com/questions/83826/is-a-weighted-r2-in-robust-linear-model-meaningful-for-goodness-of-fit-analys
#' @importFrom verbs Sum SumEmptyHandling
# Compute performance metrics (RMSE, R-squared) for a model with a numeric
# outcome, using only cases where both pred and obs are not NA.
# The alternative definition of r^2 as cor(obs, pred)^2 is only valid when
# optimizing sum of squared errors; see
# https://stats.stackexchange.com/questions/83826/is-a-weighted-r2-in-robust-linear-model-meaningful-for-goodness-of-fit-analys
# Returns list(rmse = ..., r.squared = ...); both NA when no complete cases
# exist, r.squared NA when it is negative or undefined.
numericOutcomeMetrics <- function(obs, pred, weights) {
    obs.and.pred <- complete.cases(obs, pred)
    if (SumEmptyHandling(obs.and.pred, remove.missing = FALSE) == 0)
        return(list(rmse = NA, r.squared = NA))
    # Unweighted input is treated as unit weights.
    if (is.null(weights))
        weights <- rep(1, length(obs))
    obs <- obs[obs.and.pred]
    pred <- pred[obs.and.pred]
    weights <- weights[obs.and.pred]
    # Weighted sum of squared errors and root-mean-square error.
    sse <- Sum(weights * (obs - pred)^2, remove.missing = FALSE)
    rmse <- sqrt(sse / Sum(weights, remove.missing = FALSE))
    weighted.mean <- Sum(obs * weights, remove.missing = FALSE) / Sum(weights, remove.missing = FALSE)
    sst <- Sum(weights * (obs - weighted.mean)^2, remove.missing = FALSE)
    r.sq <- 1 - sse / sst
    # Report NA when r^2 is negative (worse than predicting the mean) or NaN.
    # The NaN guard matters: a constant outcome gives sst == 0, and a bare
    # `if (r.sq < 0)` on NaN errors with "missing value where TRUE/FALSE needed".
    if (is.nan(r.sq) || r.sq < 0)
        r.sq <- NA
    return(list(rmse = rmse, r.squared = r.sq))
}
# Returns table for printing accuracy summary
#' @importFrom verbs SumEachRow
# Returns table for printing accuracy summary: per-class accuracy for a
# categorical outcome, or RMSE/R-squared for a numeric one.
calcAccuracy <- function(x)
{
    if (x$numeric.outcome)
    {
        # Numeric outcome: goodness-of-fit metrics on the filtered sample.
        metrics <- numericOutcomeMetrics(Observed(x)[x$subset],
                                         predict(x)[x$subset],
                                         x$weights[x$subset])
        return(c("Root Mean Squared Error" = metrics$rmse,
                 "R-squared" = metrics$r.squared))
    }
    # Categorical outcome: percentage correct within each class, i.e. the
    # confusion-matrix diagonal over the row totals.
    confusion <- x$confusion
    accuracy.by.class <- diag(confusion) / SumEachRow(confusion, remove.missing = FALSE)
    accuracy.by.class <- as.matrix(accuracy.by.class * 100, row.names = colnames(confusion))
    colnames(accuracy.by.class) <- "Accuracy by class (%)"
    accuracy.by.class
}
#' @importFrom verbs Sum
formatAccuracy <- function(x, algorithm)
{
    output.data <- calcAccuracy(x)
    title <- paste0(algorithm, ": ", x$outcome.label)
    # Shorten predictor labels by stripping any shared prefix before listing them.
    predictors <- x$variable.labels
    extracted <- ExtractCommonPrefix(predictors)
    if (!is.na(extracted$common.prefix))
        predictors <- extracted$shortened.labels
    predictors <- paste(predictors, collapse = ", ")
    # Only the column labels and subtitle differ between outcome types.
    if (x$numeric.outcome)
    {
        column.labels <- " "
        subtitle <- "Measure of fit"
    }
    else
    {
        overall <- Sum(diag(x$confusion), remove.missing = FALSE) / Sum(x$confusion, remove.missing = FALSE)
        column.labels <- "Accuracy by class (%)"
        subtitle <- sprintf("Overall Accuracy: %.2f%%", overall * 100)
    }
    DeepLearningTable(output.data,
                      column.labels = column.labels,
                      order.values = FALSE,
                      title = title,
                      subtitle = paste0(subtitle, " (Predictors: ", predictors, ")"),
                      footer = x$sample.description)
}