#
# Copyright (C) 2013-2021 University of Amsterdam
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
mlClassificationLogisticMultinomial <- function(jaspResults, dataset, options, ...) {
# Preparatory work
dataset <- .mlClassificationReadData(dataset, options)
.mlClassificationErrorHandling(dataset, options, type = "logistic")
# Check if analysis is ready to run
ready <- .mlClassificationReady(options, type = "logistic")
# Compute results and create the model summary table
.mlClassificationTableSummary(dataset, options, jaspResults, ready, position = 1, type = "logistic")
# If the user wants to add the classes to the data set
.mlClassificationAddPredictionsToData(dataset, options, jaspResults, ready)
# Add test set indicator to data
.mlAddTestIndicatorToData(options, jaspResults, ready, purpose = "classification")
# Create the data split plot
.mlPlotDataSplit(dataset, options, jaspResults, ready, position = 2, purpose = "classification", type = "logistic")
# Create the confusion table
.mlClassificationTableConfusion(dataset, options, jaspResults, ready, position = 3)
# Create the class proportions table
.mlClassificationTableProportions(dataset, options, jaspResults, ready, position = 4)
# Create the validation measures table
.mlClassificationTableMetrics(dataset, options, jaspResults, ready, position = 5)
# Create the variable importance table
.mlTableFeatureImportance(options, jaspResults, ready, position = 6, purpose = "classification")
# Create the shap table
.mlTableShap(dataset, options, jaspResults, ready, position = 7, purpose = "classification")
.mlClassificationLogisticTableCoef(options, jaspResults, ready, position = 8)
# Create the ROC curve
.mlClassificationPlotRoc(dataset, options, jaspResults, ready, position = 10, type = "logistic") # position + 1 for regression equation
# Create the Andrews curves
.mlClassificationPlotAndrews(dataset, options, jaspResults, ready, position = 11)
# Decision boundaries
.mlClassificationPlotBoundaries(dataset, options, jaspResults, ready, position = 12, type = "logistic")
}
.logisticMultinomialClassification <- function(dataset, options, jaspResults, ready) {
# Import model formula from jaspResults
formula <- jaspResults[["formula"]]$object
# Split the data into training and test sets
if (options[["holdoutData"]] == "testSetIndicator" && options[["testSetIndicatorVariable"]] != "") {
# Select observations according to a user-specified indicator (included when indicator = 1)
trainingIndex <- which(dataset[, options[["testSetIndicatorVariable"]]] == 0)
} else {
# Sample a percentage of the total data set
trainingIndex <- sample.int(nrow(dataset), size = ceiling((1 - options[["testDataManual"]]) * nrow(dataset)))
}
trainingSet <- dataset[trainingIndex, ]
# Create the generated test set indicator
testIndicatorColumn <- rep(1, nrow(dataset))
testIndicatorColumn[trainingIndex] <- 0
# Just create a train and a test set (no optimization)
testSet <- dataset[-trainingIndex, ]
# Check for factor levels in the test set that are not in the training set
.checkForNewFactorLevelsInPredictionSet(trainingSet, testSet, "test")
# Create the formula
if (options[["intercept"]]) {
formula <- formula(paste(options[["target"]], "~ 1 + ", paste(options[["predictors"]], collapse = " + ")))
} else {
formula <- formula(paste(options[["target"]], "~ 0 + ", paste(options[["predictors"]], collapse = " + ")))
}
if (nlevels(trainingSet[[options[["target"]]]]) == 2) {
family = "binomial"
linkFunction <- options[["link"]]
trainingFit <- stats::glm(formula, data = trainingSet, family = stats::binomial(link = linkFunction))
# Use the specified model to make predictions for dataset
testPredictions <- .mlClassificationLogisticPredictions(trainingSet, options, stats::predict(trainingFit, newdata = testSet, type = "response"))
dataPredictions <- .mlClassificationLogisticPredictions(trainingSet, options, stats::predict(trainingFit, newdata = dataset, type = "response"))
} else {
family <- "multinomial"
linkFunction <- "logit"
trainingFit <- VGAM::vglm(formula, data = trainingSet, family = VGAM::multinomial())
# Use the specified model to make predictions for dataset
testPredictions <- .mlClassificationMultinomialPredictions(trainingSet, options, VGAM::predict(trainingFit, newdata = testSet))
dataPredictions <- .mlClassificationMultinomialPredictions(trainingSet, options, VGAM::predict(trainingFit, newdata = dataset))
}
# Create results object
result <- list()
result[["formula"]] <- formula
result[["family"]] <- family
result[["link"]] <- linkFunction
if (family == "binomial") {
result[["model"]] <- trainingFit
result[["model"]]$link <- result[["link"]]
} else {
model <- lapply(slotNames(trainingFit), function(x) slot(trainingFit, x))
names(model) <- slotNames(trainingFit)
model[["original"]] <- trainingFit
model[["target"]] <- trainingSet[[options[["target"]]]]
class(model) <- "vglm"
result[["model"]] <- model
}
result[["confTable"]] <- table("Pred" = testPredictions, "Real" = testSet[, options[["target"]]])
result[["testAcc"]] <- sum(diag(prop.table(result[["confTable"]])))
result[["auc"]] <- .classificationCalcAUC(testSet, trainingSet, options, "logisticClassification")
result[["ntrain"]] <- nrow(trainingSet)
result[["ntest"]] <- nrow(testSet)
result[["testReal"]] <- testSet[, options[["target"]]]
result[["testPred"]] <- testPredictions
result[["train"]] <- trainingSet
result[["test"]] <- testSet
result[["testIndicatorColumn"]] <- testIndicatorColumn
result[["classes"]] <- dataPredictions
if (family == "binomial") {
result[["explainer"]] <- DALEX::explain(result[["model"]], type = "classification", data = result[["train"]], y = result[["train"]][, options[["target"]]], predict_function = function(model, data) data.frame(1 - predict(model, newdata = data, type = "response"), predict(model, newdata = data, type = "response")))
result[["explainer_fi"]] <- DALEX::explain(result[["model"]], type = "classification", data = result[["train"]], y = as.numeric(result[["train"]][, options[["target"]]]) - 1, predict_function = function(model, data) round(predict(model, newdata = data, type = "response"), 0) + 1)
} else {
result[["explainer"]] <- DALEX::explain(result[["model"]][["original"]], type = "multiclass", data = result[["train"]], y = result[["train"]][, options[["target"]]], predict_function = function(model, data) VGAM::predict(model, data, type = "response"))
result[["explainer_fi"]] <- result[["explainer"]]
}
return(result)
}
.mlClassificationLogisticTableCoef <- function(options, jaspResults, ready, position) {
if (!is.null(jaspResults[["coefTable"]]) || !options[["coefTable"]]) {
return()
}
table <- createJaspTable(gettext("Regression Coefficients"))
table$position <- position
table$dependOn(options = c("coefTable", "coefTableConfInt", "coefTableConfIntLevel", "formula", .mlClassificationDependencies()))
table$addColumnInfo(name = "var", title = "", type = "string")
table$addColumnInfo(name = "coefs", title = gettextf("Coefficient (%s)", "\u03B2"), type = "number")
table$addColumnInfo(name = "se", title = gettext("Standard Error"), type = "number")
table$addColumnInfo(name = "z", title = gettext("z"), type = "number")
table$addColumnInfo(name = "p", title = gettext("p"), type = "pvalue")
if (options[["coefTableConfInt"]]) {
overtitle <- gettextf("%1$s%% Confidence interval", round(options[["coefTableConfIntLevel"]] * 100, 3))
table$addColumnInfo(name = "lower", title = gettext("Lower"), type = "number", overtitle = overtitle)
table$addColumnInfo(name = "upper", title = gettext("Upper"), type = "number", overtitle = overtitle)
}
if (options[["scaleVariables"]]) {
table$addFootnote(gettext("The regression coefficients for numeric features are standardized."))
} else {
table$addFootnote(gettext("The regression coefficients are unstandardized."))
}
jaspResults[["coefTable"]] <- table
if (!ready) {
if (options[["target"]] == "" && length(unlist(options[["predictors"]])) > 0) {
table[["var"]] <- c(if (options[["intercept"]]) "(Intercept)" else NULL, options[["predictors"]])
}
return()
}
classificationResult <- jaspResults[["classificationResult"]]$object
model <- classificationResult[["model"]]
if (classificationResult[["family"]] == "binomial") {
estimates <- coef(summary(model))
conf_int <- confint(model, level = options[["coefTableConfIntLevel"]])
if (!options[["intercept"]] && length(options[["predictors"]] == 1)) {
coefs <- cbind(estimates, conf_int[1], conf_int[2])
} else {
coefs <- cbind(estimates, conf_int)
}
colnames(coefs) <- c("est", "se", "z", "p", "lower", "upper")
vars <- rownames(coefs)
for (i in seq_along(vars)) {
if (!(vars[i] %in% options[["predictors"]]) && vars[i] != "(Intercept)") {
for (j in options[["predictors"]]) {
vars[i] <- gsub(pattern = j, replacement = paste0(j, " ("), x = vars[i])
}
vars[i] <- paste0(vars[i], ")")
}
}
rownames(coefs) <- vars
} else {
estimates <- VGAM::coef(VGAM::summaryvglm(model[["original"]]))
conf_int <- confint(model[["original"]], level = options[["coefTableConfIntLevel"]])
coefs <- cbind(estimates, conf_int)
colnames(coefs) <- c("est", "se", "z", "p", "lower", "upper")
vars <- rownames(coefs)
for (i in seq_along(vars)) {
for (j in c("(Intercept)", options[["predictors"]])) {
if (!grepl(j, vars[i])) {
next
}
splitvar <- strsplit(vars[i], split = ":")[[1]]
if (grepl(paste0(j, "[A-Za-z]+:"), vars[i])) {
repl_part1 <- paste0(gsub(pattern = j, replacement = paste0(j, " ("), x = splitvar[1]), ")")
} else {
repl_part1 <- j
}
repl_part2 <- levels(factor(classificationResult[["train"]][[options[["target"]]]]))[as.numeric(splitvar[2])]
vars[i] <- paste0(repl_part1, " : ", repl_part2)
}
}
rownames(coefs) <- vars
}
table[["var"]] <- rownames(coefs)
table[["coefs"]] <- as.numeric(coefs[, "est"])
table[["se"]] <- as.numeric(coefs[, "se"])
table[["z"]] <- as.numeric(coefs[, "z"])
table[["p"]] <- as.numeric(coefs[, "p"])
if (options[["coefTableConfInt"]]) {
table[["lower"]] <- coefs[, "lower"]
table[["upper"]] <- coefs[, "upper"]
}
if (options[["formula"]]) {
if (classificationResult[["family"]] == "binomial") {
one_cat <- levels(factor(classificationResult[["train"]][[options[["target"]]]]))[2]
if (options[["intercept"]]) {
regform <- paste0(options[["link"]], "(p<sub>", options[["target"]], " = ", one_cat, "</sub>) = ", round(as.numeric(coefs[, 1])[1], 3))
start <- 2
} else {
regform <- paste0(options[["link"]], "(p<sub>", options[["target"]], " = ", one_cat, "</sub>) = ")
start <- 1
}
for (i in start:nrow(coefs)) {
regform <- paste0(regform, if (round(as.numeric(coefs[, 1])[i], 3) < 0) " - " else (if (!options[["intercept"]] && i == 1) "" else " + "), abs(round(as.numeric(coefs[, 1])[i], 3)), " x ", rownames(coefs)[i])
}
} else {
regform <- NULL
nlevs <- nlevels(classificationResult[["train"]][[options[["target"]]]])
baseline_cat <- levels(classificationResult[["train"]][[options[["target"]]]])[nlevs]
for (i in seq_len(nlevs - 1)) {
current_cat <- levels(classificationResult[["train"]][[options[["target"]]]])[i]
if (options[["intercept"]]) {
part <- paste0("log(p<sub>", options[["target"]], " = ", current_cat, "</sub> / p<sub>", options[["target"]], " = ", baseline_cat, "</sub>) = ", round(as.numeric(coefs[, 1])[i], 3))
start <- nlevs - 1 + i
} else {
part <- paste0("log(p<sub>", options[["target"]], " = ", current_cat, "</sub> / p<sub>", options[["target"]], " = ", baseline_cat, "</sub>) = ")
start <- i
}
for (j in seq(start, nrow(coefs), by = nlevs - 1)) {
part <- paste0(part, if (round(as.numeric(coefs[, 1])[j], 3) < 0) " - " else (if (!options[["intercept"]] && j == i) "" else " + "), abs(round(as.numeric(coefs[, 1])[j], 3)), " x ", strsplit(rownames(coefs)[j], " : ")[[1]][1])
}
if (i == 1) {
regform <- paste0(regform, part, "\n\n")
} else {
regform <- paste0(regform, part)
}
}
}
formula <- createJaspHtml(gettextf("<b>Regression equation:</b>\n%1$s", regform), "p")
formula$position <- position + 1
formula$dependOn(options = c("coefTable", "formula"), optionsFromObject = jaspResults[["classificationResult"]])
jaspResults[["regressionFormula"]] <- formula
}
}
.mlClassificationLogisticPredictions <- function(trainingSet, options, probabilities) {
categories <- levels(trainingSet[[options[["target"]]]])
predicted_categories <- factor(categories[round(probabilities, 0) + 1], levels = levels(trainingSet[[options[["target"]]]]))
return(predicted_categories)
}
.mlClassificationMultinomialPredictions <- function(trainingSet, options, logodds) {
ncategories <- ncol(logodds) + 1
probabilities <- matrix(0, nrow = nrow(logodds), ncol = ncategories)
for (i in seq_len(ncategories - 1)) {
probabilities[, i] <- exp(logodds[, i])
}
probabilities[, ncategories] <- 1
row_sums <- rowSums(probabilities)
probabilities <- probabilities / row_sums
predicted_columns <- apply(probabilities, 1, which.max)
categories <- levels(trainingSet[[options[["target"]]]])
predicted_categories <- factor(categories[predicted_columns], levels = levels(trainingSet[[options[["target"]]]]))
return(predicted_categories)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.