knitr::opts_chunk$set(echo = TRUE)
library(SML)
library(ggplot2)

1. Example: Chinese restaurant process

We simulate 100 customers and set $\alpha = 4$.

n <- 100
K <- 4
CRP(n, K)

2. Example: Polya urn process.

Generate data and define base function.

G_0 <- function(){
  rnorm(1, mean = 0, sd = 1)
}
N <- 100
alpha <- 2

theta <- PolyaUrn(G_0, N, alpha)

Make plots.

par(mfrow = c(1, 2))
hist(theta, breaks = 20, xlim = c(-1, 1), col = "black")

plot(theta, seq(1, N),
     ylab = "Sample index", xlab = "Cluster by order of appearance",
     ylim = c(0,max(10, N)), xlim = c(-1, 1), pch = 19)

3. Example: stick breaking procss

alpha <- c(2, 5)
nn <- 20;

par(mfrow = c(length((alpha)), 2)) 

for(ii in 1:length(alpha)){
  for (trial in 1:2){
    beta <- rbeta(nn, 1, alpha[ii]);
    neg <- cumprod(1 - beta);
    pi <- beta * c(1, neg[1:(length(neg) - 1)]);
    barplot(pi, main = bquote(~alpha ~ "=" ~.(alpha[ii])), col = "black")
  }
}

4. Example: Gibbs sampling for the Gaussian Dirichlet process mixture model.

Marginal Distributions: * By construction, the marginal distribution over T is a gamma distribution. * The conditional distribution over X given T is a Gaussian distribution. * The marginal distribution over X is a three-parameter non-standardized Student's t-distribution.

Let's generate and plot some synthetic data and plot it.

  set.seed(1)
  x <- c(rnorm(100, 100, 8), rnorm(50, 200, 25), rnorm(50, 25, 1))
  labels <- c(rep("A", 100), rep("B", 50), rep("C", 50))

  df <- data.frame(X = x, Label = labels)

  ggplot(df, aes(x = X)) +
    geom_histogram(binwidth = 3)

  ggplot(df, aes(x = X, fill = Label)) +
    geom_histogram(binwidth = 3) +
    labs(legend.position = "none") +
    labs(title = "Ground Truth Mixture Model")
  #ggsave("dpmm_ground_truth.pdf", height = 7, width = 7)

Run DP by Gibbs.

nIter <- 100
results <- DpmmGibbs(x, 0.5, 0.1, 0.1, 0, 0.1, nIter)
results[, nIter]

ggplot(df, aes(x = X, fill = factor(results[, nIter]))) +
  geom_histogram(binwidth = 3) +
  labs(legend.position = "none") +
  labs(title = "dp-MM with alpha = 0.5")
#ggsave("dpmm_0.5.pdf", height = 7, width = 7)

See how the parameters interact with the clustering results.

nIter <- 100
results <- DpmmGibbs(x, 100.0, 0.1, 0.1, 0, 0.1, nIter)
results[, nIter]

ggplot(df, aes(x = X, fill = factor(results[, nIter]))) +
  geom_histogram(binwidth = 3) +
  labs(legend.position = "none") +
  labs(title = "dp-MM with alpha = 100.0")

Reference



jlin-vt/SML documentation built on Dec. 5, 2019, 2:05 a.m.