Introduction

The goal of this article is to train a machine learning model that can classify the distribution family of a sample of data.
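
A toy sketch of the general idea (an illustration only, not the pipeline developed below; the two-family setup, the object names and the crude moment-based features are simplifications): simulate labelled samples, summarise each sample by a few features, train a random forest on those features, and ask it to label a new set of features.

library(randomForest)

set.seed(42)

# Simulate 200 labelled samples and summarise each by crude skewness and kurtosis
toy <- do.call(rbind, lapply(1:200, function(i) {
  fam <- sample(c("normal", "exponential"), 1)
  x <- if (fam == "normal") rnorm(300) else rexp(300)
  data.frame(
    Skew = mean((x - mean(x))^3) / sd(x)^3,
    Kurt = mean((x - mean(x))^4) / sd(x)^4,
    Family = fam
  )
}))

# Learn the mapping from features to family label
toy_model <- randomForest(as.factor(Family) ~ ., data = toy, ntree = 50)

# Moments typical of an exponential sample should be labelled "exponential"
predict(toy_model, data.frame(Skew = 2, Kurt = 9))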

Methods

Parameters and Packages

# Plotting
library(ggplot2)

# Modelling
library(caret)
library(strip)
# library(parsnip)
# library(rsample)

# Optimizing
library(parallel)
library(doParallel)

# Analyzing
library(parameters)
library(bayestestR)
library(report)

set.seed(333)

Visualisation of distributions

# Simulate each family over a 5 x 5 grid of location and scale values and
# store the normalised density curves for plotting
distributions <- data.frame()
size <- 1000
for (location in round(seq(0.01, 10, length.out = 5), digits = 2)) {
  for (scale in round(seq(0.02, 10, length.out = 5), digits = 2)) {
    x <- parameters::normalize(as.data.frame(density(runif(size, location, location * 2), n = 100)))
    x$distribution <- "uniform"
    x$location <- location
    x$scale <- scale
    distributions <- rbind(distributions, x)

    x <- parameters::normalize(as.data.frame(density(rnorm(size, location, scale), n = 100)))
    x$distribution <- "normal"
    x$location <- location
    x$scale <- scale
    distributions <- rbind(distributions, x)

    x <- parameters::normalize(as.data.frame(density(rbeta(size, location, scale), n = 100)))
    x$distribution <- "beta"
    x$location <- location
    x$scale <- scale
    distributions <- rbind(distributions, x)

    x <- parameters::normalize(as.data.frame(density(rbinom(size, round(location) + 1, scale / 10^(nchar(round(scale)))), n = 100)))
    x$distribution <- "binomial"
    x$location <- location
    x$scale <- scale
    distributions <- rbind(distributions, x)

    x <- parameters::normalize(as.data.frame(density(rchisq(size, location, scale), n = 100)))
    x$distribution <- "chi"
    x$location <- location
    x$scale <- scale
    distributions <- rbind(distributions, x)

    x <- parameters::normalize(as.data.frame(density(rexp(size, scale), n = 100)))
    x$distribution <- "exponential"
    x$location <- location
    x$scale <- scale
    distributions <- rbind(distributions, x)

    x <- parameters::normalize(as.data.frame(density(rf(size, location, scale), n = 100)))
    x$distribution <- "F"
    x$location <- location
    x$scale <- scale
    distributions <- rbind(distributions, x)

    x <- parameters::normalize(as.data.frame(density(rgamma(size, location, scale), n = 100)))
    x$distribution <- "gamma"
    x$location <- location
    x$scale <- scale
    distributions <- rbind(distributions, x)

    x <- parameters::normalize(as.data.frame(density(rlnorm(size, location, scale), n = 100)))
    x$distribution <- "lognormal"
    x$location <- location
    x$scale <- scale
    distributions <- rbind(distributions, x)

    x <- parameters::normalize(as.data.frame(density(rpois(size, location), n = 100)))
    x$distribution <- "poisson"
    x$location <- location
    x$scale <- scale
    distributions <- rbind(distributions, x)

    x <- parameters::normalize(as.data.frame(density(rt(size, location, scale), n = 100)))
    x$distribution <- "t"
    x$location <- location
    x$scale <- scale
    distributions <- rbind(distributions, x)

    x <- parameters::normalize(as.data.frame(density(rweibull(size, location, scale), n = 100)))
    x$distribution <- "weibull"
    x$location <- location
    x$scale <- scale
    distributions <- rbind(distributions, x)
  }
}
ggplot(distributions, aes(x = x, y = y, colour = distribution)) +
  geom_line(size = 1) +
  facet_grid(location ~ scale) +
  theme_classic()

Convenience function

# Draw `size` observations from the requested family, with a generic location
# and scale mapped onto each family's native parameters
generate_distribution <- function(family = "normal", size = 1000, noise = 0, location = 0, scale = 1) {
  if (family == "normal") {
    x <- rnorm(size, location, scale)
  } else if (family == "uniform") {
    x <- runif(size, location, location * 2)
  } else if (family == "beta") {
    x <- rbeta(size, location, scale)
  } else if (family == "binomial") {
    x <- rbinom(size, round(location) + 1, scale / 10^(nchar(round(scale))))
  } else if (family == "chi") {
    x <- rchisq(size, location, scale)
  } else if (family == "exponential") {
    x <- rexp(size, scale)
  } else if (family == "F") {
    x <- rf(size, location, scale + 0.1)
  } else if (family == "gamma") {
    x <- rgamma(size, location, scale)
  } else if (family == "lognormal") {
    x <- rlnorm(size, location, scale)
  } else if (family == "poisson") {
    x <- rpois(size, location)
  } else if (family == "t") {
    x <- rt(size, location, scale)
  } else if (family == "weibull") {
    x <- rweibull(size, location, scale)
  }
  return(x)
}
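
As a quick illustrative check (not part of the original pipeline), the helper can be called directly; for the gamma family, for example, location and scale are passed on as the shape and rate parameters:

# Five draws from a Gamma distribution with shape 2 and rate 3
generate_distribution("gamma", size = 5, location = 2, scale = 3)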

Results

Data generation

# Simulate 2000 datasets per family with random size, location and scale,
# and extract summary features from each
df <- data.frame()
for (distribution in c("normal", "beta", "binomial", "chi", "exponential", "F", "gamma", "lognormal", "poisson", "t", "uniform", "weibull")) {
  for (i in 1:2000) {
    size <- round(runif(1, 10, 2000))
    location <- runif(1, 0.01, 10)
    scale <- runif(1, 0.02, 10)

    x <- generate_distribution(distribution, size = size, location = location, scale = scale)

    x[is.infinite(x)] <- 5.565423e+156 # cap infinite draws at a very large finite value
    x_scaled <- parameters::normalize(x, verbose = FALSE)

    density_Z <- density(x_scaled, n = 20)$y # coarse 20-point density estimate, also used as features below

    # Extract features
    data <- data.frame(
      "Mean" = mean(x_scaled),
      "SD" = sd(x_scaled),
      "Median" = median(x_scaled),
      "MAD" = mad(x_scaled, constant = 1),
      "Mean_Median_Distance" = mean(x_scaled) - median(x_scaled),
      "Mean_Mode_Distance" = mean(x_scaled) - as.numeric(bayestestR::map_estimate(x_scaled, bw = "nrd0")),
      "SD_MAD_Distance" = sd(x_scaled) - mad(x_scaled, constant = 1),
      "Mode" = as.numeric(bayestestR::map_estimate(x_scaled, bw = "nrd0")),
      # "Range" = range(x),
      "Range_SD" = diff(range(x)) / sd(x),
      "Range_MAD" = diff(range(x)) / mad(x, constant = 1),
      "IQR" = stats::IQR(x_scaled),
      "Skewness" = skewness(x_scaled),
      "Kurtosis" = kurtosis(x_scaled),
      "Uniques" = length(unique(x)) / length(x),
      "CI_50_high" = ci(x_scaled, ci = 0.5)$CI_high,
      "CI_50_low" = ci(x_scaled, ci = 0.5)$CI_low,
      "CI_70_high" = ci(x_scaled, ci = 0.7)$CI_high,
      "CI_70_low" = ci(x_scaled, ci = 0.7)$CI_low,
      "CI_90_high" = ci(x_scaled, ci = 0.9)$CI_high,
      "CI_90_low" = ci(x_scaled, ci = 0.9)$CI_low,
      "Smoothness_Cor" = parameters::smoothness(density(x_scaled)$y, method = "cor"),
      "Smoothness_Diff" = parameters::smoothness(density(x_scaled)$y, method = "diff"),
      "Smoothness_Z_Cor_1" = parameters::smoothness(density_Z, method = "cor", lag = 1),
      "Smoothness_Z_Diff_1" = parameters::smoothness(density_Z, method = "diff", lag = 1),
      "Smoothness_Z_Cor_3" = parameters::smoothness(density_Z, method = "cor", lag = 3),
      "Smoothness_Z_Diff_3" = parameters::smoothness(density_Z, method = "diff", lag = 3)
    )


    density_df <- as.data.frame(t(density_Z))
    names(density_df) <- paste0("Density_", 1:ncol(density_df))
    data <- cbind(data, density_df)

    if (length(unique(x)) == 1) {
      data$Distribution <- "uniform"
    } else {
      data$Distribution <- distribution
    }

    df <- rbind(df, data)
  }
  # write.csv(df, "classify_distribution.csv", row.names = FALSE)
}
# df <- read.csv("https://raw.github.com/easystats/circus/main/data/classify_distribution.csv")

Data Preparation

# Data cleaning
df <- na.omit(df)
infinite <- is.infinite(rowSums(df[sapply(df, is.numeric)]))
df <- df[!infinite, ]

# Data partitioning
trainIndex <- caret::createDataPartition(as.factor(df$Distribution), p = 0.1, list = FALSE)
train <- df[trainIndex, ]
test <- df[-trainIndex, ]


# Parameters
fitControl <- caret::trainControl( ## 5-fold CV
  method = "repeatedcv",
  number = 5,
  ## repeated ten times
  repeats = 10,
  classProbs = TRUE,
  returnData = FALSE,
  trim = TRUE,
  allowParallel = TRUE
)
# Set up parallel
cluster <- makeCluster(detectCores() - 1) # convention to leave 1 core for OS
registerDoParallel(cluster)

Model Training

# Training
model_tree <- caret::train(Distribution ~ .,
  data = train,
  method = "rpart",
  trControl = fitControl
)
model_rf <- caret::train(Distribution ~ .,
  data = train,
  method = "rf",
  trControl = fitControl
)
model_nb <- caret::train(Distribution ~ .,
  data = train,
  method = "naive_bayes",
  trControl = fitControl
)

stopCluster(cluster) # explicitly shut down the cluster

Model Comparison

# collect resamples
results <- resamples(list(
  "DecisionTree" = model_tree,
  "RandomForest" = model_rf,
  "NaiveBayes" = model_nb
))

# summarize the distributions
summary(results)

# dot plots of results
dotplot(results)

# Model object sizes (in KB)
data.frame(
  "DecisionTree" = as.numeric(object.size(model_tree)) / 1000,
  "RandomForest" = as.numeric(object.size(model_rf)) / 1000,
  "NaiveBayes" = as.numeric(object.size(model_nb)) / 1000
)

Best Model Inspection

model <- model_rf

# Performance
test$pred <- predict(model, test)
confusion <- confusionMatrix(data = test$pred, reference = as.factor(test$Distribution), mode = "prec_recall")
knitr::kable(data.frame("Performance" = confusion$overall))

# Prediction Table
knitr::kable(prop.table(confusion$table, margin = 2), digits = 2) # proportions within each reference class

# Prediction Figure
perf <- as.data.frame(confusion$byClass)[c("Sensitivity", "Specificity")]
perf$Distribution <- gsub("Class: ", "", row.names(perf))
perf <- reshape(perf, varying = list(c("Sensitivity", "Specificity")), timevar = "Type", idvar = "Distribution", direction = "long", v.names = "Metric")
perf$Type <- ifelse(perf$Type == 1, "Sensitivity", "Specificity")
ggplot(perf, aes(x = Distribution, y = Metric, fill = Type)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.9)) +
  geom_hline(aes(yintercept = 0.5), linetype = "dotted") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Features
features <- caret::varImp(model, scale = TRUE)
plot(features)

# N trees
plot(model$finalModel, log = "x", main = "Error rate by number of trees (black = OOB, colours = per class)")

Random Forest Training with Feature Selection

Data Generation

# Simulate 3000 datasets per family, this time keeping only the summary
# features (no density values)
df <- data.frame()
for (distribution in c("normal", "beta", "binomial", "chi", "exponential", "F", "gamma", "lognormal", "poisson", "t", "uniform", "weibull")) {
  for (i in 1:3000) {
    size <- round(runif(1, 100, 1000))
    location <- runif(1, 0.01, 10)
    scale <- runif(1, 0.02, 10)

    x <- generate_distribution(distribution, size = size, location = location, scale = scale)

    x[is.infinite(x)] <- 5.565423e+156 # cap infinite draws at a very large finite value
    x_scaled <- parameters::normalize(x, verbose = FALSE)

    # Extract features
    data <- data.frame(
      "Mean" = mean(x_scaled),
      "SD" = sd(x_scaled),
      "Median" = median(x_scaled),
      "MAD" = mad(x_scaled, constant = 1),
      "Mean_Median_Distance" = mean(x_scaled) - median(x_scaled),
      "Mean_Mode_Distance" = mean(x_scaled) - as.numeric(bayestestR::map_estimate(x_scaled, bw = "nrd0")),
      "SD_MAD_Distance" = sd(x_scaled) - mad(x_scaled, constant = 1),
      "Mode" = as.numeric(bayestestR::map_estimate(x_scaled, bw = "nrd0")),
      "Range_SD" = diff(range(x)) / sd(x),
      "Range_MAD" = diff(range(x)) / mad(x, constant = 1),
      "IQR" = stats::IQR(x_scaled),
      "Skewness" = skewness(x_scaled),
      "Kurtosis" = kurtosis(x_scaled),
      "Uniques" = length(unique(x)) / length(x),
      "CI_50_high" = ci(x_scaled, ci = 0.5)$CI_high,
      "CI_50_low" = ci(x_scaled, ci = 0.5)$CI_low,
      "CI_70_high" = ci(x_scaled, ci = 0.7)$CI_high,
      "CI_70_low" = ci(x_scaled, ci = 0.7)$CI_low,
      "CI_90_high" = ci(x_scaled, ci = 0.9)$CI_high,
      "CI_90_low" = ci(x_scaled, ci = 0.9)$CI_low
    )


    if (length(unique(x)) == 1) {
      data$Distribution <- "uniform"
    } else {
      data$Distribution <- distribution
    }
    df <- rbind(df, data)
  }
}

Preparation

# Data cleaning
df <- na.omit(df)
infinite <- is.infinite(rowSums(df[sapply(df, is.numeric)]))
df <- df[!infinite, ]

# Data partitioning
trainIndex <- caret::createDataPartition(as.factor(df$Distribution), p = 0.1, list = FALSE)
train <- df[trainIndex, ]
test <- df[-trainIndex, ]

# Set up parallel
cluster <- makeCluster(detectCores() - 1) # convention to leave 1 core for OS
registerDoParallel(cluster)

Training

# Training
model <- randomForest::randomForest(as.factor(Distribution) ~ .,
  data = train,
  localImp = FALSE,
  importance = FALSE,
  keep.forest = TRUE, # the forest itself is needed for prediction
  keep.inbag = FALSE,
  proximity = FALSE,
  ntree = 10 # a small forest keeps the final object lightweight
)
stopCluster(cluster) # explicitly shut down the cluster

Model Inspection

# Performance
test$pred <- predict(model, test)
confusion <- confusionMatrix(data = test$pred, reference = as.factor(test$Distribution), mode = "prec_recall")
knitr::kable(data.frame("Performance" = confusion$overall))

# Prediction Table
knitr::kable(prop.table(confusion$table, margin = 2), digits = 2) # proportions within each reference class

# Prediction Figure
perf <- as.data.frame(confusion$byClass)[c("Sensitivity", "Specificity")]
perf$Distribution <- gsub("Class: ", "", row.names(perf))
perf <- reshape(perf, varying = list(c("Sensitivity", "Specificity")), timevar = "Type", idvar = "Distribution", direction = "long", v.names = "Metric")
perf$Type <- ifelse(perf$Type == 1, "Sensitivity", "Specificity")
ggplot(perf, aes(x = Distribution, y = Metric, fill = Type)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.9)) +
  geom_hline(aes(yintercept = 0.5), linetype = "dotted") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Features
caret::varImp(model, scale = TRUE)

Reduce Size and Save

# Initial size of the model object (in KB)
as.numeric(object.size(model)) / 1000


# Reduce size
model <- strip(model, keep = "predict", use_trim = TRUE)

model$predicted <- NULL
model$y <- NULL
model$err.rate <- NULL
model$test <- NULL
model$proximity <- NULL
model$confusion <- NULL
model$localImportance <- NULL
model$importanceSD <- NULL
model$inbag <- NULL
model$votes <- NULL
model$oob.times <- NULL



as.numeric(object.size(model)) / 1000 # size after stripping (in KB)

# Test that the stripped model still returns class labels and probability matrices
is.factor(predict(model, df))
is.matrix(predict(model, data, type = "prob"))
# Save
# save(model, file="classify_distribution.Rda")

# Open this Rmd within the parameters project:
# use_data() below stores the model as internal package data (R/sysdata.rda)
classify_distribution <- model
usethis::use_data(classify_distribution, overwrite = TRUE, internal = TRUE)

References

knitr::kable(report::show_packages(sessionInfo()))

