knitr::opts_chunk$set(echo = TRUE) library(SML) library(ggplot2)
We simulate 100 customers and set $\alpha = 4$.
n <- 100 K <- 4 CRP(n, K)
Generate data and define base function.
G_0 <- function(){ rnorm(1, mean = 0, sd = 1) } N <- 100 alpha <- 2 theta <- PolyaUrn(G_0, N, alpha)
Make plots.
par(mfrow = c(1, 2)) hist(theta, breaks = 20, xlim = c(-1, 1), col = "black") plot(theta, seq(1, N), ylab = "Sample index", xlab = "Cluster by order of appearance", ylim = c(0,max(10, N)), xlim = c(-1, 1), pch = 19)
alpha <- c(2, 5) nn <- 20; par(mfrow = c(length((alpha)), 2)) for(ii in 1:length(alpha)){ for (trial in 1:2){ beta <- rbeta(nn, 1, alpha[ii]); neg <- cumprod(1 - beta); pi <- beta * c(1, neg[1:(length(neg) - 1)]); barplot(pi, main = bquote(~alpha ~ "=" ~.(alpha[ii])), col = "black") } }
Marginal Distributions: * By construction, the marginal distribution over T is a gamma distribution. * The conditional distribution over X given T is a Gaussian distribution. * The marginal distribution over X is a three-parameter non-standardized Student's t-distribution.
Let's generate and plot some synthetic data and plot it.
set.seed(1) x <- c(rnorm(100, 100, 8), rnorm(50, 200, 25), rnorm(50, 25, 1)) labels <- c(rep("A", 100), rep("B", 50), rep("C", 50)) df <- data.frame(X = x, Label = labels) ggplot(df, aes(x = X)) + geom_histogram(binwidth = 3) ggplot(df, aes(x = X, fill = Label)) + geom_histogram(binwidth = 3) + labs(legend.position = "none") + labs(title = "Ground Truth Mixture Model") #ggsave("dpmm_ground_truth.pdf", height = 7, width = 7)
Run DP by Gibbs.
nIter <- 100 results <- DpmmGibbs(x, 0.5, 0.1, 0.1, 0, 0.1, nIter) results[, nIter] ggplot(df, aes(x = X, fill = factor(results[, nIter]))) + geom_histogram(binwidth = 3) + labs(legend.position = "none") + labs(title = "dp-MM with alpha = 0.5") #ggsave("dpmm_0.5.pdf", height = 7, width = 7)
See how the parameters interact with the clustering results.
nIter <- 100 results <- DpmmGibbs(x, 100.0, 0.1, 0.1, 0, 0.1, nIter) results[, nIter] ggplot(df, aes(x = X, fill = factor(results[, nIter]))) + geom_histogram(binwidth = 3) + labs(legend.position = "none") + labs(title = "dp-MM with alpha = 100.0")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.