The goal of this article is to create a machine learning model able to classify the distribution family of the data.
# Plotting library(ggplot2) # Modelling library(caret) library(strip) # library(parsnip) # library(rsample) # Optimizing library(parallel) library(doParallel) # Analyzing library(parameters) library(bayestestR) library(report) set.seed(333)
distributions <- data.frame() size <- 1000 for (location in round(seq(0.01, 10, length.out = 5), digits = 2)) { for (scale in round(seq(0.02, 10, length.out = 5), digits = 2)) { x <- parameters::normalize(as.data.frame(density(runif(size, location, location * 2), n = 100))) x$distribution <- "uniform" x$location <- location x$scale <- scale distributions <- rbind(distributions, x) x <- parameters::normalize(as.data.frame(density(rnorm(size, location, scale), n = 100))) x$distribution <- "normal" x$location <- location x$scale <- scale distributions <- rbind(distributions, x) x <- parameters::normalize(as.data.frame(density(rbeta(size, location, scale), n = 100))) x$distribution <- "beta" x$location <- location x$scale <- scale distributions <- rbind(distributions, x) x <- parameters::normalize(as.data.frame(density(rbinom(size, round(location) + 1, scale / 10^(nchar(round(scale)))), n = 100))) x$distribution <- "binomial" x$location <- location x$scale <- scale distributions <- rbind(distributions, x) x <- parameters::normalize(as.data.frame(density(rchisq(size, location, scale), n = 100))) x$distribution <- "chi" x$location <- location x$scale <- scale distributions <- rbind(distributions, x) x <- parameters::normalize(as.data.frame(density(rexp(size, scale), n = 100))) x$distribution <- "exponential" x$location <- location x$scale <- scale distributions <- rbind(distributions, x) x <- parameters::normalize(as.data.frame(density(rf(size, location, scale), n = 100))) x$distribution <- "F" x$location <- location x$scale <- scale distributions <- rbind(distributions, x) x <- parameters::normalize(as.data.frame(density(rgamma(size, location, scale), n = 100))) x$distribution <- "gamma" x$location <- location x$scale <- scale distributions <- rbind(distributions, x) x <- parameters::normalize(as.data.frame(density(rlnorm(size, location, scale), n = 100))) x$distribution <- "lognormal" x$location <- location x$scale <- scale distributions <- rbind(distributions, x) x <- parameters::normalize(as.data.frame(density(rpois(size, location), n = 100))) x$distribution <- "poisson" x$location <- location x$scale <- scale distributions <- rbind(distributions, x) x <- parameters::normalize(as.data.frame(density(rt(size, location, scale), n = 100))) x$distribution <- "t" x$location <- location x$scale <- scale distributions <- rbind(distributions, x) x <- parameters::normalize(as.data.frame(density(rweibull(size, location, scale), n = 100))) x$distribution <- "weibull" x$location <- location x$scale <- scale distributions <- rbind(distributions, x) } } ggplot(distributions, aes(x = x, y = y, colour = distribution)) + geom_line(size = 1) + facet_grid(location ~ scale) + theme_classic()
generate_distribution <- function(family = "normal", size = 1000, noise = 0, location = 0, scale = 1) { if (family == "normal") { x <- rnorm(size, location, scale) } else if (family == "uniform") { x <- runif(size, location, location * 2) } else if (family == "beta") { x <- rbeta(size, location, scale) } else if (family == "binomial") { x <- rbinom(size, round(location) + 1, scale / 10^(nchar(round(scale)))) } else if (family == "chi") { x <- rchisq(size, location, scale) } else if (family == "exponential") { x <- rexp(size, scale) } else if (family == "F") { x <- rf(size, location, scale + 0.1) } else if (family == "gamma") { x <- rgamma(size, location, scale) } else if (family == "lognormal") { x <- rlnorm(size, location, scale) } else if (family == "poisson") { x <- rpois(size, location) } else if (family == "t") { x <- rt(size, location, scale) } else if (family == "weibull") { x <- rweibull(size, location, scale) } return(x) }
df <- data.frame() for (distribution in c("normal", "beta", "binomial", "chi", "exponential", "F", "gamma", "lognormal", "poisson", "t", "uniform", "weibull")) { for (i in 1:2000) { size <- round(runif(1, 10, 2000)) location <- runif(1, 0.01, 10) scale <- runif(1, 0.02, 10) x <- generate_distribution(distribution, size = size, location = location, scale = scale) x[is.infinite(x)] <- 5.565423e+156 x_scaled <- parameters::normalize(x, verbose = FALSE) density_Z <- density(x_scaled, n = 20)$y # Extract features data <- data.frame( "Mean" = mean(x_scaled), "SD" = sd(x_scaled), "Median" = median(x_scaled), "MAD" = mad(x_scaled, constant = 1), "Mean_Median_Distance" = mean(x_scaled) - median(x_scaled), "Mean_Mode_Distance" = mean(x_scaled) - as.numeric(bayestestR::map_estimate(x_scaled, bw = "nrd0")), "SD_MAD_Distance" = sd(x_scaled) - mad(x_scaled, constant = 1), "Mode" = as.numeric(bayestestR::map_estimate(x_scaled, bw = "nrd0")), # "Range" = range(x), "Range_SD" = diff(range(x)) / sd(x), "Range_MAD" = diff(range(x)) / mad(x, constant = 1), "IQR" = stats::IQR(x_scaled), "Skewness" = skewness(x_scaled), "Kurtosis" = kurtosis(x_scaled), "Uniques" = length(unique(x)) / length(x), "CI_50_high" = ci(x_scaled, ci = 0.5)$CI_high, "CI_50_low" = ci(x_scaled, ci = 0.5)$CI_low, "CI_70_high" = ci(x_scaled, ci = 0.7)$CI_high, "CI_70_low" = ci(x_scaled, ci = 0.7)$CI_low, "CI_90_high" = ci(x_scaled, ci = 0.9)$CI_high, "CI_90_low" = ci(x_scaled, ci = 0.9)$CI_low, "Smoothness_Cor" = parameters::smoothness(density(x_scaled)$y, method = "cor"), "Smoothness_Diff" = parameters::smoothness(density(x_scaled)$y, method = "diff"), "Smoothness_Z_Cor_1" = parameters::smoothness(density_Z, method = "cor", lag = 1), "Smoothness_Z_Diff_1" = parameters::smoothness(density_Z, method = "diff", lag = 1), "Smoothness_Z_Cor_3" = parameters::smoothness(density_Z, method = "cor", lag = 3), "Smoothness_Z_Diff_3" = parameters::smoothness(density_Z, method = "diff", lag = 3) ) density_df <- as.data.frame(t(density_Z)) names(density_df) <- paste0("Density_", 1:ncol(density_df)) data <- cbind(data, density_df) if (length(unique(x)) == 1) { data$Distribution <- "uniform" } else { data$Distribution <- distribution } df <- rbind(df, data) } # write.csv(df, "classify_distribution.csv", row.names = FALSE) }
# df <- read.csv("https://raw.github.com/easystats/circus/main/data/classify_distribution.csv")
# Data clearning df <- na.omit(df) infinite <- is.infinite(rowSums(df[sapply(df, is.numeric)])) df <- df[!infinite, ] # Data partitioning trainIndex <- caret::createDataPartition(as.factor(df$Distribution), p = 0.1, list = FALSE) train <- df[trainIndex, ] test <- df[-trainIndex, ] # Parameters fitControl <- caret::trainControl( ## 5-fold CV method = "repeatedcv", number = 5, ## repeated ten times repeats = 10, classProbs = TRUE, returnData = FALSE, trim = TRUE, allowParallel = TRUE ) # Set up parallel cluster <- makeCluster(detectCores() - 1) # convention to leave 1 core for OS registerDoParallel(cluster)
# Training model_tree <- caret::train(Distribution ~ ., data = train, method = "rpart", trControl = fitControl ) model_rf <- caret::train(Distribution ~ ., data = train, method = "rf", trControl = fitControl ) model_nb <- caret::train(Distribution ~ ., data = train, method = "naive_bayes", trControl = fitControl ) stopCluster(cluster) # explicitly shut down the cluster
# collect resamples results <- resamples(list( "DecisionTree" = model_tree, "RandomForest" = model_rf, "NaiveBayes" = model_nb )) # summarize the distributions summary(results) # dot plots of results dotplot(results) # Sizes data.frame( "DecisionTree" = as.numeric(object.size(model_tree)) / 1000, "RandomForest" = as.numeric(object.size(model_rf)) / 1000, "NaiveBayes" = as.numeric(object.size(model_nb)) / 1000 )
model <- model_rf # Performance test$pred <- predict(model, test) confusion <- confusionMatrix(data = test$pred, reference = as.factor(test$Distribution), mode = "prec_recall") knitr::kable(data.frame("Performance" = confusion$overall)) # Prediction Table knitr::kable(confusion$table / colSums(confusion$table), digits = 2) # Prediction Figure perf <- as.data.frame(confusion$byClass)[c("Sensitivity", "Specificity")] perf$Distribution <- gsub("Class: ", "", row.names(perf)) perf <- reshape(perf, varying = list(c("Sensitivity", "Specificity")), timevar = "Type", idvar = "Distribution", direction = "long", v.names = "Metric") perf$Type <- ifelse(perf$Type == 1, "Sensitivity", "Specificity") ggplot(perf, aes(x = Distribution, y = Metric, fill = Type)) + geom_bar(stat = "identity", position = position_dodge(width = 0.9)) + geom_hline(aes(yintercept = 0.5), linetype = "dotted") + theme_classic() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Features features <- caret::varImp(model, scale = TRUE) plot(features) # N trees plot(model$finalModel, log = "x", main = "black default, red samplesize, green tree depth")
df <- data.frame() for (distribution in c("normal", "beta", "binomial", "chi", "exponential", "F", "gamma", "lognormal", "poisson", "t", "uniform", "weibull")) { for (i in 1:3000) { size <- round(runif(1, 100, 1000)) location <- runif(1, 0.01, 10) scale <- runif(1, 0.02, 10) x <- generate_distribution(distribution, size = size, location = location, scale = scale) x[is.infinite(x)] <- 5.565423e+156 x_scaled <- parameters::normalize(x, verbose = FALSE) # Extract features data <- data.frame( "Mean" = mean(x_scaled), "SD" = sd(x_scaled), "Median" = median(x_scaled), "MAD" = mad(x_scaled, constant = 1), "Mean_Median_Distance" = mean(x_scaled) - median(x_scaled), "Mean_Mode_Distance" = mean(x_scaled) - as.numeric(bayestestR::map_estimate(x_scaled, bw = "nrd0")), "SD_MAD_Distance" = sd(x_scaled) - mad(x_scaled, constant = 1), "Mode" = as.numeric(bayestestR::map_estimate(x_scaled, bw = "nrd0")), "Range_SD" = diff(range(x)) / sd(x), "Range_MAD" = diff(range(x)) / mad(x, constant = 1), "IQR" = stats::IQR(x_scaled), "Skewness" = skewness(x_scaled), "Kurtosis" = kurtosis(x_scaled), "Uniques" = length(unique(x)) / length(x), "CI_50_high" = ci(x_scaled, ci = 0.5)$CI_high, "CI_50_low" = ci(x_scaled, ci = 0.5)$CI_low, "CI_70_high" = ci(x_scaled, ci = 0.7)$CI_high, "CI_70_low" = ci(x_scaled, ci = 0.7)$CI_low, "CI_90_high" = ci(x_scaled, ci = 0.9)$CI_high, "CI_90_low" = ci(x_scaled, ci = 0.9)$CI_low ) if (length(unique(x)) == 1) { data$Distribution <- "uniform" } else { data$Distribution <- distribution } df <- rbind(df, data) } }
# Data clearning df <- na.omit(df) infinite <- is.infinite(rowSums(df[sapply(df, is.numeric)])) df <- df[!infinite, ] # Data partitioning trainIndex <- caret::createDataPartition(as.factor(df$Distribution), p = 0.1, list = FALSE) train <- df[trainIndex, ] test <- df[-trainIndex, ] # Set up parallel cluster <- makeCluster(detectCores() - 1) # convention to leave 1 core for OS registerDoParallel(cluster)
# Training model <- randomForest::randomForest(as.factor(Distribution) ~ ., data = train, localImp = FALSE, importance = FALSE, keep.forest = TRUE, keep.inbag = FALSE, proximity = FALSE, maxDepth = 5, maxBins = 32, minInstancesPerNode = 1, minInfoGain = 0.0, maxMemoryInMB = 128, ntree = 10 ) stopCluster(cluster) # explicitly shut down the cluster
# Performance test$pred <- predict(model, test) confusion <- confusionMatrix(data = test$pred, reference = as.factor(test$Distribution), mode = "prec_recall") knitr::kable(data.frame("Performance" = confusion$overall)) # Prediction Table knitr::kable(confusion$table / colSums(confusion$table), digits = 2) # Prediction Figure perf <- as.data.frame(confusion$byClass)[c("Sensitivity", "Specificity")] perf$Distribution <- gsub("Class: ", "", row.names(perf)) perf <- reshape(perf, varying = list(c("Sensitivity", "Specificity")), timevar = "Type", idvar = "Distribution", direction = "long", v.names = "Metric") perf$Type <- ifelse(perf$Type == 1, "Sensitivity", "Specificity") ggplot(perf, aes(x = Distribution, y = Metric, fill = Type)) + geom_bar(stat = "identity", position = position_dodge(width = 0.9)) + geom_hline(aes(yintercept = 0.5), linetype = "dotted") + theme_classic() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Features caret::varImp(model, scale = TRUE)
# Initial size as.numeric(object.size(model)) / 1000 # Reduce size model <- strip(model, keep = "predict", use_trim = TRUE) model$predicted <- NULL model$y <- NULL model$err.rate <- NULL model$test <- NULL model$proximity <- NULL model$confusion <- NULL model$localImportance <- NULL model$importanceSD <- NULL model$inbag <- NULL model$votes <- NULL model$oob.times <- NULL as.numeric(object.size(model)) / 1000 # Test is.factor(predict(model, df)) is.matrix(predict(model, data, type = "prob"))
# Save # save(model, file="classify_distribution.Rda") # Open this RMD within parameters project # This stores this file as sysdata.Rda classify_distribution <- model usethis::use_data(classify_distribution, overwrite = TRUE, internal = TRUE)
knitr::kable(report::show_packages(sessionInfo()))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.