Diabetes in Pima Indian Women

This example uses characteristics of Pima Indian women who were tested for diabetes to predict their disease status.

## Attach mlbench and load its PimaIndiansDiabetes2 dataset
library(mlbench)
data(PimaIndiansDiabetes2)

## Summary specification handed to summary_kbl(): an unnamed header entry
## followed by one named entry per variable, each holding labelled one-sided
## formulas written in terms of the dataset's column names
summary_spec <- list(
  list("Number of women" = ~ length(diabetes)),
  diabetes = list(
    pos = ~ n_perc(diabetes == "pos"),
    neg = ~ n_perc(diabetes == "neg")
  ),
  pregnant = list("Median (Range)" = ~ median_range(pregnant)),
  glucose = list("Median (Range)" = ~ median_range(glucose)),
  pressure = list("Median (Range)" = ~ median_range(pressure)),
  triceps = list("Median (Range)" = ~ median_range(triceps)),
  insulin = list("Median (Range)" = ~ median_range(insulin)),
  mass = list("Median (Range)" = ~ median_range(mass)),
  pedigree = list("Median (Range)" = ~ median_range(pedigree)),
  age = list("Median (Range)" = ~ median_range(age))
)

## Summarize only the rows that have no missing values
summary_kbl(summary_spec, na.omit(PimaIndiansDiabetes2))

Training Set Analysis

## Analysis libraries
library(MachineShop)
library(ggplot2)

## Parallel processing
library(doParallel)

## Request up to 6 workers, but never more than the machine reports, so the
## example does not oversubscribe smaller hosts. detectCores() can return NA
## when the count is unknown; na.rm = TRUE then leaves the cap of 6 in effect.
n_cores <- min(6L, parallel::detectCores(), na.rm = TRUE)
registerDoParallel(cores = n_cores)

## Analysis data: PimaIndiansDiabetes2 from mlbench with incomplete rows removed
data(PimaIndiansDiabetes2, package = "mlbench")
Pima <- na.omit(PimaIndiansDiabetes2)

## Formula: diabetes as the response, every other column as a predictor
fo <- diabetes ~ .

## Three candidate models, each wrapped in TunedModel() with grid = 5, are
## passed to SelectedModel() to form a single model specification
selected_model <- SelectedModel(
  TunedModel(KNNModel, grid = 5),
  TunedModel(NNetModel, grid = 5),
  TunedModel(RandomForestModel, grid = 5)
)

## Fit that specification to the complete-case data
model_fit <- fit(fo, data = Pima, model = selected_model)

## Variable importance computed from the fitted model, then plotted
var_importance <- varimp(model_fit)
plot(var_importance)

Generalization Performance

## Cross-validation resampling of the selected model on the Pima data
res <- resample(fo, data = Pima, model = selected_model, control = CVControl)

## Performance summary at the default probability cutoff
perf_default <- performance(res)
summary(perf_default)

## Performance summary with the probability cutoff set to 0.25
perf_cutoff <- performance(res, cutoff = 0.25)
summary(perf_cutoff)

ROC Curve

## Performance curve from the resample results: true positive and false
## positive rates across all probability cutoffs
roc_curve <- performance_curve(res)

## Plot the ROC curve with diagonal = TRUE and fixed coordinate scaling
plot(roc_curve, diagonal = TRUE) + coord_fixed()

## Area under the ROC curve
auc(roc_curve)

Confusion Matrices

## Confusion matrix from the resample results, printed after assignment
conf_mat <- confusion(res)
conf_mat

## Summary of the confusion matrix
summary(conf_mat)

## Plot of the confusion matrix
plot(conf_mat)

## Confusion-matrix summary with the probability cutoff set to 0.25
conf_cutoff <- confusion(res, cutoff = 0.25)
summary(conf_cutoff)

Calibration Curve

## Calibration of the resampled predictions.
## NOTE(review): breaks = NULL presumably requests a smoothed rather than a
## binned curve, and se = TRUE presumably adds standard-error bands; confirm
## both against the calibration() documentation.
calib_curve <- calibration(res, breaks = NULL)
plot(calib_curve, se = TRUE)

Partial Dependence Plots

## Partial dependence of the fitted model on glucose, age, and insulin
partial_dep <- dependence(model_fit, select = c(glucose, age, insulin))
plot(partial_dep)


brian-j-smith/MachineShop documentation built on Sept. 22, 2023, 10:01 p.m.