knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.path = "man/figures/README-", out.width = "100%" )
The goal of clust431 is to implement EM clustering on a dataset
You can install the released version of clust431 from CRAN with:
You access the github repository through this link: https://github.com/Cal-Poly-Advanced-R/lab-8-expectation-maximization-shanealman
install.packages("clust431")
library(clust431) library(tidyverse) library(mvtnorm) library(tictoc)
em_clust = function(x, k){ priors = rep(1/k, k) x = as.matrix(x) if(ncol(x) > 1){ x = data.frame(x) means = sample_n(x, k) }else { means = sample(x, k)} x = as.matrix(x) cov.var = vector('list', length = k) for(i in 1:k){ if( ncol(x) > 1){cov.var[[i]] = cov(x) }else { cov.var[[i]] = sd(x)} } probs2 = 0 probs1 = 1 while(sum((probs2 - probs1)^2) > 0.00001){ probs1 = probs2 probs2 = NULL if(ncol(x) > 1){ for(i in 1:k){ initprob = dmvnorm(x, as.numeric(means[i,]), cov.var[[i]]) probs2 = cbind(probs2, initprob)} } else { for(i in 1:k){ initprob = dnorm(x, means[i], cov.var[[i]]) probs2 = cbind(probs2, initprob) } } totalpost = 0 for(i in 1:k){ tempP = priors[i]*probs2[,i] totalpost = totalpost + tempP} posts = NULL for(i in 1:k){ initpost = (priors[i]*probs2[,i])/totalpost posts = cbind(posts, initpost)} priors = apply(posts, 2, mean) means = NULL if(ncol(x) > 1){ for(i in 1:k){ tempM = colSums(posts[,i]*x)/sum(posts[,i]) means = rbind(means, tempM)} } else { for(i in 1:k){ tempM = sum(posts[,i]*x)/sum(posts[,i]) means = c(means, tempM) } } lambda = NULL for(i in 1:k){ templambda = posts[,i]/(length(posts[,i])*priors[i]) lambda = cbind(lambda, templambda)} cov.var = vector('list', length = k) if(ncol(x) > 1){ for(i in 1:k){ tempcov.var = 0 for(l in 1:nrow(posts)){ initcov.var = lambda[l,i] * ((x[l,] - means[i,]) %*% t(x[l,] - means[i,])) tempcov.var = tempcov.var + initcov.var} cov.var[[i]] = tempcov.var} } else { cov.var = vector('list', length = k) for(i in 1:k){ tempcov.var = 0 for(l in 1:nrow(posts)){ initcov.var = lambda[l,i] * ((x[l,] - means[i]) %*% t(x[l,] - means[i])) tempcov.var =tempcov.var + initcov.var} cov.var[[i]] = sqrt(tempcov.var) } } } clust = rep(NA, nrow(posts)) for(i in 1:nrow(posts)){clust[i] = which(posts[i,] == max(posts[i,]), arr.ind = T)} rownames(means) = NULL return(list('Clusters' = clust, 'Means' = means, 'Covariance' = cov.var)) }
dataframe = select(iris, Sepal.Length, Sepal.Width) em_clust(dataframe, 3) kmeans(dataframe, 3)
These results show that both EM and K-means are producing similar clustering means and clustering vectors; but, EM is also able to provide the covariance matrix for each cluster.
mtcars.cluster = em_clust(mtcars, 4) mtcars.cluster$Means mtcars.cluster$Clusters
dataframe2 = data.frame(x = c(rnorm(500, 20, 5), rnorm(500,1, 10)), y = c(rnorm(500, 40, 2),rnorm(500, 0, 9))) em_clust(dataframe2, 2) ggplot(dataframe2) + geom_point(aes(x, y))
em_clust(iris$Sepal.Length, 3) plot(iris$Sepal.Length, pch = 20, ylab = '')
tic() datacluster = em_clust(dataframe2, 5) datacluster$Means toc()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.