knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 5, 
  fig.height = 4
)

Introduction

tagmEM is a function which employs generalised t-augmented Gaussian mixture modelling to identify sets of observations which cluster (i.e. have common mean and variance components) using an expectation maximisation algorithm. Currently the software can accommodate univariate data only. In a future release this will be upgraded to allow for multi-variate data to be analysed. Below we give a quick guide to installation and an example of how to use the function - including a comparison with the very popular Gaussian mixture modelling package 'Mclust'.

Installation

options(warn = -1)
install.packages("devtools", repos='http://cran.us.r-project.org')
library(devtools)
install_github("cnfoley/tagmEM", build_vignettes = TRUE)

Using tagmEM

library(tagmEM);
library(mclust);

Example with 10 "junk" outlier observations and 3 clusters

set.seed(314);
cluster_sizes = c(rep(1,10),25,35,30);
N = sum(cluster_sizes);
K = length(cluster_sizes);
centres = c(runif(10,-1,1), runif(K-10,-10,10));
std_devs = c(rep(5,10),sqrt(runif(K-1,0.1,1)));
data = NULL;
truth = NULL;
count = 1;
  for(i in cluster_sizes){
      data = c(data, rnorm(i, mean = centres[count], sd = std_devs[count]));
      truth = c(truth, rep(count,i));
      count = count + 1;
  }

# cluster centres
table(cluster_size = cluster_sizes, cluster_centre = round(centres,3));

tagmEM results

rng = range(data);
sig = sd(data) + sqrt(rng[2]-rng[1]);
res_tagm = tagmEM(data, junk_mixture = TRUE, df = 4, junk_mean = mean(data), junk_sd = sig);
#
# Summarise number of observations in each cluster
table(res_tagm$results$best$cluster_class);
#
# Summarise cluster centres:
table(res_tagm$results$best$cluster_mean);
#
# Assess quality of identified clusters against ground truth using adjusted Rand index
## All obs
adjustedRandIndex(x = res_tagm$results$best$cluster, y = truth);
## Remove junk
jnk_obs = which(res_tagm$results$best$cluster_class=="Junk");
adjustedRandIndex(x = res_tagm$results$best$cluster[-jnk_obs], y = truth[-jnk_obs]);

Mclust results

res_mclust = mclust::Mclust(data);
#
# Summarise number of observations in each cluster
table(res_mclust$classification);
#
# Summarise cluster centres:
mn  = round(res_mclust$parameters$mean,3);
table(mn[res_mclust$classification]);
# Assess quality of identified clusters against ground truth using adjusted Rand index
adjustedRandIndex(x = res_mclust$classification, y = truth);


cnfoley/tagmEM documentation built on Feb. 1, 2021, 12:23 a.m.