Introduction

The goal of this article is to train a machine learning model that can classify the distribution family of a sample of data.
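
A toy sketch of the general idea (an illustration only, not the pipeline developed below; the two-family setup, the object names and the crude moment-based features are simplifications): simulate labelled samples, summarise each sample by a few features, train a random forest on those features, and ask it to label a new set of features.

library(randomForest)

set.seed(42)

# Simulate 200 labelled samples and summarise each by crude skewness and kurtosis
toy <- do.call(rbind, lapply(1:200, function(i) {
  fam <- sample(c("normal", "exponential"), 1)
  x <- if (fam == "normal") rnorm(300) else rexp(300)
  data.frame(
    Skew = mean((x - mean(x))^3) / sd(x)^3,
    Kurt = mean((x - mean(x))^4) / sd(x)^4,
    Family = fam
  )
}))

# Learn the mapping from features to family label
toy_model <- randomForest(as.factor(Family) ~ ., data = toy, ntree = 50)

# Moments typical of an exponential sample should be labelled "exponential"
predict(toy_model, data.frame(Skew = 2, Kurt = 9))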

Methods

Parameters and Packages

# Plotting
library(ggplot2)

# Modelling
library(caret)
library(strip)
# library(parsnip)
# library(rsample)

# Optimizing
library(parallel)
library(doParallel)

# Analyzing
library(parameters)
library(bayestestR)
library(report)

set.seed(333)

Visualisation of distributions

# Simulate each family over a 5 x 5 grid of location and scale values and
# store the normalised density curves for plotting
distributions <- data.frame()
size <- 1000
for (location in round(seq(0.01, 10, length.out = 5), digits = 2)) {
  for (scale in round(seq(0.02, 10, length.out = 5), digits = 2)) {
    x <- parameters::normalize(as.data.frame(density(runif(size, location, location * 2), n = 100)))
    x$distribution <- "uniform"
    x$location <- location
    x$scale <- scale
    distributions <- rbind(distributions, x)

    x <- parameters::normalize(as.data.frame(density(rnorm(size, location, scale), n = 100)))
    x$distribution <- "normal"
    x$location <- location
    x$scale <- scale
    distributions <- rbind(distributions, x)

    x <- parameters::normalize(as.data.frame(density(rbeta(size, location, scale), n = 100)))
    x$distribution <- "beta"
    x$location <- location
    x$scale <- scale
    distributions <- rbind(distributions, x)

    x <- parameters::normalize(as.data.frame(density(rbinom(size, round(location) + 1, scale / 10^(nchar(round(scale)))), n = 100)))
    x$distribution <- "binomial"
    x$location <- location
    x$scale <- scale
    distributions <- rbind(distributions, x)

    x <- parameters::normalize(as.data.frame(density(rchisq(size, location, scale), n = 100)))
    x$distribution <- "chi"
    x$location <- location
    x$scale <- scale
    distributions <- rbind(distributions, x)

    x <- parameters::normalize(as.data.frame(density(rexp(size, scale), n = 100)))
    x$distribution <- "exponential"
    x$location <- location
    x$scale <- scale
    distributions <- rbind(distributions, x)

    x <- parameters::normalize(as.data.frame(density(rf(size, location, scale), n = 100)))
    x$distribution <- "F"
    x$location <- location
    x$scale <- scale
    distributions <- rbind(distributions, x)

    x <- parameters::normalize(as.data.frame(density(rgamma(size, location, scale), n = 100)))
    x$distribution <- "gamma"
    x$location <- location
    x$scale <- scale
    distributions <- rbind(distributions, x)

    x <- parameters::normalize(as.data.frame(density(rlnorm(size, location, scale), n = 100)))
    x$distribution <- "lognormal"
    x$location <- location
    x$scale <- scale
    distributions <- rbind(distributions, x)

    x <- parameters::normalize(as.data.frame(density(rpois(size, location), n = 100)))
    x$distribution <- "poisson"
    x$location <- location
    x$scale <- scale
    distributions <- rbind(distributions, x)

    x <- parameters::normalize(as.data.frame(density(rt(size, location, scale), n = 100)))
    x$distribution <- "t"
    x$location <- location
    x$scale <- scale
    distributions <- rbind(distributions, x)

    x <- parameters::normalize(as.data.frame(density(rweibull(size, location, scale), n = 100)))
    x$distribution <- "weibull"
    x$location <- location
    x$scale <- scale
    distributions <- rbind(distributions, x)
  }
}
ggplot(distributions, aes(x = x, y = y, colour = distribution)) +
  geom_line(size = 1) +
  facet_grid(location ~ scale) +
  theme_classic()

Convenience function

# Draw `size` observations from the requested family, with a generic location
# and scale mapped onto each family's native parameters
generate_distribution <- function(family = "normal", size = 1000, noise = 0, location = 0, scale = 1) {
  if (family == "normal") {
    x <- rnorm(size, location, scale)
  } else if (family == "uniform") {
    x <- runif(size, location, location * 2)
  } else if (family == "beta") {
    x <- rbeta(size, location, scale)
  } else if (family == "binomial") {
    x <- rbinom(size, round(location) + 1, scale / 10^(nchar(round(scale))))
  } else if (family == "chi") {
    x <- rchisq(size, location, scale)
  } else if (family == "exponential") {
    x <- rexp(size, scale)
  } else if (family == "F") {
    x <- rf(size, location, scale + 0.1)
  } else if (family == "gamma") {
    x <- rgamma(size, location, scale)
  } else if (family == "lognormal") {
    x <- rlnorm(size, location, scale)
  } else if (family == "poisson") {
    x <- rpois(size, location)
  } else if (family == "t") {
    x <- rt(size, location, scale)
  } else if (family == "weibull") {
    x <- rweibull(size, location, scale)
  }
  return(x)
}
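
As a quick illustrative check (not part of the original pipeline), the helper can be called directly; for the gamma family, for example, location and scale are passed on as the shape and rate parameters:

# Five draws from a Gamma distribution with shape 2 and rate 3
generate_distribution("gamma", size = 5, location = 2, scale = 3)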

Results

Data generation

# Simulate 2000 datasets per family with random size, location and scale,
# and extract summary features from each
df <- data.frame()
for (distribution in c("normal", "beta", "binomial", "chi", "exponential", "F", "gamma", "lognormal", "poisson", "t", "uniform", "weibull")) {
  for (i in 1:2000) {
    size <- round(runif(1, 10, 2000))
    location <- runif(1, 0.01, 10)
    scale <- runif(1, 0.02, 10)

    x <- generate_distribution(distribution, size = size, location = location, scale = scale)

    x[is.infinite(x)] <- 5.565423e+156 # cap infinite draws at a very large finite value
    x_scaled <- parameters::normalize(x, verbose = FALSE)

    density_Z <- density(x_scaled, n = 20)$y # coarse 20-point density estimate, also used as features below

    # Extract features
    data <- data.frame(
      "Mean" = mean(x_scaled),
      "SD" = sd(x_scaled),
      "Median" = median(x_scaled),
      "MAD" = mad(x_scaled, constant = 1),
      "Mean_Median_Distance" = mean(x_scaled) - median(x_scaled),
      "Mean_Mode_Distance" = mean(x_scaled) - as.numeric(bayestestR::map_estimate(x_scaled, bw = "nrd0")),
      "SD_MAD_Distance" = sd(x_scaled) - mad(x_scaled, constant = 1),
      "Mode" = as.numeric(bayestestR::map_estimate(x_scaled, bw = "nrd0")),
      # "Range" = range(x),
      "Range_SD" = diff(range(x)) / sd(x),
      "Range_MAD" = diff(range(x)) / mad(x, constant = 1),
      "IQR" = stats::IQR(x_scaled),
      "Skewness" = skewness(x_scaled),
      "Kurtosis" = kurtosis(x_scaled),
      "Uniques" = length(unique(x)) / length(x),
      "CI_50_high" = ci(x_scaled, ci = 0.5)$CI_high,
      "CI_50_low" = ci(x_scaled, ci = 0.5)$CI_low,
      "CI_70_high" = ci(x_scaled, ci = 0.7)$CI_high,
      "CI_70_low" = ci(x_scaled, ci = 0.7)$CI_low,
      "CI_90_high" = ci(x_scaled, ci = 0.9)$CI_high,
      "CI_90_low" = ci(x_scaled, ci = 0.9)$CI_low,
      "Smoothness_Cor" = parameters::smoothness(density(x_scaled)$y, method = "cor"),
      "Smoothness_Diff" = parameters::smoothness(density(x_scaled)$y, method = "diff"),
      "Smoothness_Z_Cor_1" = parameters::smoothness(density_Z, method = "cor", lag = 1),
      "Smoothness_Z_Diff_1" = parameters::smoothness(density_Z, method = "diff", lag = 1),
      "Smoothness_Z_Cor_3" = parameters::smoothness(density_Z, method = "cor", lag = 3),
      "Smoothness_Z_Diff_3" = parameters::smoothness(density_Z, method = "diff", lag = 3)
    )


    density_df <- as.data.frame(t(density_Z))
    names(density_df) <- paste0("Density_", 1:ncol(density_df))
    data <- cbind(data, density_df)

    if (length(unique(x)) == 1) {
      data$Distribution <- "uniform"
    } else {
      data$Distribution <- distribution
    }

    df <- rbind(df, data)
  }
  # write.csv(df, "classify_distribution.csv", row.names = FALSE)
}
# df <- read.csv("https://raw.github.com/easystats/circus/main/data/classify_distribution.csv")

Data Preparation

# Data cleaning
df <- na.omit(df)
infinite <- is.infinite(rowSums(df[sapply(df, is.numeric)]))
df <- df[!infinite, ]

# Data partitioning
trainIndex <- caret::createDataPartition(as.factor(df$Distribution), p = 0.1, list = FALSE)
train <- df[trainIndex, ]
test <- df[-trainIndex, ]


# Parameters
fitControl <- caret::trainControl( ## 5-fold CV
  method = "repeatedcv",
  number = 5,
  ## repeated ten times
  repeats = 10,
  classProbs = TRUE,
  returnData = FALSE,
  trim = TRUE,
  allowParallel = TRUE
)
# Set up parallel
cluster <- makeCluster(detectCores() - 1) # convention to leave 1 core for OS
registerDoParallel(cluster)

Model Training

# Training
model_tree <- caret::train(Distribution ~ .,
  data = train,
  method = "rpart",
  trControl = fitControl
)
model_rf <- caret::train(Distribution ~ .,
  data = train,
  method = "rf",
  trControl = fitControl
)
model_nb <- caret::train(Distribution ~ .,
  data = train,
  method = "naive_bayes",
  trControl = fitControl
)

stopCluster(cluster) # explicitly shut down the cluster

Model Comparison

# collect resamples
results <- resamples(list(
  "DecisionTree" = model_tree,
  "RandomForest" = model_rf,
  "NaiveBayes" = model_nb
))

# summarize the distributions
summary(results)

# dot plots of results
dotplot(results)

# Model object sizes (in KB)
data.frame(
  "DecisionTree" = as.numeric(object.size(model_tree)) / 1000,
  "RandomForest" = as.numeric(object.size(model_rf)) / 1000,
  "NaiveBayes" = as.numeric(object.size(model_nb)) / 1000
)

Best Model Inspection

model <- model_rf

# Performance
test$pred <- predict(model, test)
confusion <- confusionMatrix(data = test$pred, reference = as.factor(test$Distribution), mode = "prec_recall")
knitr::kable(data.frame("Performance" = confusion$overall))

# Prediction Table
knitr::kable(prop.table(confusion$table, margin = 2), digits = 2) # proportions within each reference class

# Prediction Figure
perf <- as.data.frame(confusion$byClass)[c("Sensitivity", "Specificity")]
perf$Distribution <- gsub("Class: ", "", row.names(perf))
perf <- reshape(perf, varying = list(c("Sensitivity", "Specificity")), timevar = "Type", idvar = "Distribution", direction = "long", v.names = "Metric")
perf$Type <- ifelse(perf$Type == 1, "Sensitivity", "Specificity")
ggplot(perf, aes(x = Distribution, y = Metric, fill = Type)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.9)) +
  geom_hline(aes(yintercept = 0.5), linetype = "dotted") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Features
features <- caret::varImp(model, scale = TRUE)
plot(features)

# N trees
plot(model$finalModel, log = "x", main = "Error rate by number of trees (black = OOB, colours = per class)")

Random Forest Training with Feature Selection

Data Generation

# Simulate 3000 datasets per family, this time keeping only the summary
# features (no density values)
df <- data.frame()
for (distribution in c("normal", "beta", "binomial", "chi", "exponential", "F", "gamma", "lognormal", "poisson", "t", "uniform", "weibull")) {
  for (i in 1:3000) {
    size <- round(runif(1, 100, 1000))
    location <- runif(1, 0.01, 10)
    scale <- runif(1, 0.02, 10)

    x <- generate_distribution(distribution, size = size, location = location, scale = scale)

    x[is.infinite(x)] <- 5.565423e+156 # cap infinite draws at a very large finite value
    x_scaled <- parameters::normalize(x, verbose = FALSE)

    # Extract features
    data <- data.frame(
      "Mean" = mean(x_scaled),
      "SD" = sd(x_scaled),
      "Median" = median(x_scaled),
      "MAD" = mad(x_scaled, constant = 1),
      "Mean_Median_Distance" = mean(x_scaled) - median(x_scaled),
      "Mean_Mode_Distance" = mean(x_scaled) - as.numeric(bayestestR::map_estimate(x_scaled, bw = "nrd0")),
      "SD_MAD_Distance" = sd(x_scaled) - mad(x_scaled, constant = 1),
      "Mode" = as.numeric(bayestestR::map_estimate(x_scaled, bw = "nrd0")),
      "Range_SD" = diff(range(x)) / sd(x),
      "Range_MAD" = diff(range(x)) / mad(x, constant = 1),
      "IQR" = stats::IQR(x_scaled),
      "Skewness" = skewness(x_scaled),
      "Kurtosis" = kurtosis(x_scaled),
      "Uniques" = length(unique(x)) / length(x),
      "CI_50_high" = ci(x_scaled, ci = 0.5)$CI_high,
      "CI_50_low" = ci(x_scaled, ci = 0.5)$CI_low,
      "CI_70_high" = ci(x_scaled, ci = 0.7)$CI_high,
      "CI_70_low" = ci(x_scaled, ci = 0.7)$CI_low,
      "CI_90_high" = ci(x_scaled, ci = 0.9)$CI_high,
      "CI_90_low" = ci(x_scaled, ci = 0.9)$CI_low
    )


    if (length(unique(x)) == 1) {
      data$Distribution <- "uniform"
    } else {
      data$Distribution <- distribution
    }
    df <- rbind(df, data)
  }
}

Preparation

# Data cleaning
df <- na.omit(df)
infinite <- is.infinite(rowSums(df[sapply(df, is.numeric)]))
df <- df[!infinite, ]

# Data partitioning
trainIndex <- caret::createDataPartition(as.factor(df$Distribution), p = 0.1, list = FALSE)
train <- df[trainIndex, ]
test <- df[-trainIndex, ]

# Set up parallel
cluster <- makeCluster(detectCores() - 1) # convention to leave 1 core for OS
registerDoParallel(cluster)

Training

# Training
model <- randomForest::randomForest(as.factor(Distribution) ~ .,
  data = train,
  localImp = FALSE,
  importance = FALSE,
  keep.forest = TRUE, # the forest itself is needed for prediction
  keep.inbag = FALSE,
  proximity = FALSE,
  ntree = 10 # a small forest keeps the final object lightweight
)
stopCluster(cluster) # explicitly shut down the cluster

Model Inspection

# Performance
test$pred <- predict(model, test)
confusion <- confusionMatrix(data = test$pred, reference = as.factor(test$Distribution), mode = "prec_recall")
knitr::kable(data.frame("Performance" = confusion$overall))

# Prediction Table
knitr::kable(prop.table(confusion$table, margin = 2), digits = 2) # proportions within each reference class

# Prediction Figure
perf <- as.data.frame(confusion$byClass)[c("Sensitivity", "Specificity")]
perf$Distribution <- gsub("Class: ", "", row.names(perf))
perf <- reshape(perf, varying = list(c("Sensitivity", "Specificity")), timevar = "Type", idvar = "Distribution", direction = "long", v.names = "Metric")
perf$Type <- ifelse(perf$Type == 1, "Sensitivity", "Specificity")
ggplot(perf, aes(x = Distribution, y = Metric, fill = Type)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.9)) +
  geom_hline(aes(yintercept = 0.5), linetype = "dotted") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Features
caret::varImp(model, scale = TRUE)

Reduce Size and Save

# Initial size of the model object (in KB)
as.numeric(object.size(model)) / 1000


# Reduce size
model <- strip(model, keep = "predict", use_trim = TRUE)

model$predicted <- NULL
model$y <- NULL
model$err.rate <- NULL
model$test <- NULL
model$proximity <- NULL
model$confusion <- NULL
model$localImportance <- NULL
model$importanceSD <- NULL
model$inbag <- NULL
model$votes <- NULL
model$oob.times <- NULL



as.numeric(object.size(model)) / 1000 # size after stripping (in KB)

# Test that the stripped model still returns class labels and probability matrices
is.factor(predict(model, df))
is.matrix(predict(model, data, type = "prob"))
# Save
# save(model, file="classify_distribution.Rda")

# Open this Rmd within the parameters project:
# use_data() below stores the model as internal package data (R/sysdata.rda)
classify_distribution <- model
usethis::use_data(classify_distribution, overwrite = TRUE, internal = TRUE)

References

knitr::kable(report::show_packages(sessionInfo()))

