AdaptGauss: Gaussian Mixture Models (GMM)

Documented in EMGauss

EMGauss = function(Data, K=NULL, Means=NULL,SDs=NULL,Weights=NULL,MaxNumberofIterations=10, fast=F){
#em=EMGauss(Data,K,Means,SDs,Weights)
# EM- Algorithm to calculate optimal Gaussian Mixture Model for given data in one Dimension
# In particular, no adding or removing of Gaussian kernels
# NOTE: this is probably not numerical stable, log(gauss) is not used.
#
# DESCRIPTION
# 
# INPUT
# Data(1:AnzCases)          the data points as a vector
# K                         estimated number of Gaussian kernels
# Means(1:L)                estimated Gaussian Kernels = means(clustercenters),
#                           L ==  Number of Clustercenters; default=meanrobust(Data)
# SDs(1:L)                estimated Gaussian Kernels = standard deviations; default=stdrobust(Data)
# Weights(1:L)              relative number of points in Gaussians (prior probabilities): sum(Weights) ==1; default=1
# MaxNumberofIterations     Number of Iterations; default=10
# fast                      Default: FALSE: Using mclust EM, TRUE: Own faster but maybe numerical unstable implementation
# OUTPUT
# Means                    means of GMM generated by EM algorithm
# SDs                    standard deviations of GMM generated by EM algorithm
# Weights                 prior probabilities of Gaussians
#
# Author Hansen-Goos 2014
# 1.Editor: MT 08/2015 bugfixes, new EM
# 2.Editor: FL 10/2016 Parameter k hinzugefuegt
  
#MT: Funktionen nach vorne verschoben
# meanrobust
#################################################
`meanrobust` <- function(x, p=0.1){
  
  if(is.matrix(x)){
    mhat<-c()
    for(i in 1:dim(x)[2]){
      mhat[i]<-mean(x[,i],trim=p,na.rm=TRUE)
    }
  } else  mhat<-mean(x,trim=p,na.rm=TRUE) 
  
  return (mhat) 
  
}
#################################################

# stdrobust
#################################################
`stdrobust` <- function(x,lowInnerPercentile=25){
  #MT: nach vorne verschoben, damit es keinen Bug gibt
    prctile<-function(x,p){
    #   matlab:
    #   Y = prctile(X,p) returns percentiles of the values in X. 
    #   p is a scalar or a vector of percent values. When X is a 
    #   vector, Y is the same size as p and Y(i) contains the p(i)th 
    #   percentile. When X is a matrix, the ith row of Y contains the 
    #   p(i)th percentiles of each column of X. For N-dimensional arrays,
    #   prctile operates along the first nonsingleton dimension of X.  
    if(length(p)==1){  
      if(p>1){p=p/100}
      
    }
    
    if(is.matrix(x) && ncol(x)>1){
      cols<-ncol(x)
      quants<-matrix(0,nrow=length(p),ncol=cols)
      for(i in 1:cols){
        quants[,i]<-quantile(x[,i],probs=p,type=5,na.rm=TRUE)
      }
    }else{
      quants<-quantile(x,p,type=5,na.rm=TRUE)
    }
    return(quants)
  }
  
  if(is.vector(x) || (is.matrix(x) && dim(x)[1]==1)) dim(x)<-c(length(x),1)
  
  lowInnerPercentile<-max(1,min(lowInnerPercentile,49))
  hiInnerPercentile<- 100 - lowInnerPercentile
  faktor<-sum(abs(qnorm(t(c(lowInnerPercentile,hiInnerPercentile)/100),0,1)))
  std<-sd(x,na.rm=TRUE)
  p<-c(lowInnerPercentile,hiInnerPercentile)/100
  quartile<-prctile(x,p)  
  if (ncol(x)>1)
    iqr<-quartile[2,]-quartile[1,]
  else
    iqr<-quartile[2]-quartile[1]
  
  shat<-c()
  for(i in 1:ncol(x)){
    shat[i]<-min(std[i],iqr[i]/faktor,na.rm=TRUE)
  }
  dim(shat)<-c(1,ncol(x))
  colnames(shat)<-colnames(x)
  return (shat) 
  
}
#################################################

# normpdf
#################################################
`normpdf` <- function(X,Mean,Sdev){
  # computes the normal pdf at each of the values in X 
  # R funktion fuer PDF = normpdf(X,Mean,Sdev)
  return(dnorm(X,Mean,Sdev))
} # end function normpdf
#################################################

#MT/FL: Fehlerabfang
if(!is.null(K)){
  if(!is.null(Means)){
    if(K != length(Means)) stop("K should be equal to length(Means)")
  }else{
    if(K == 1) Means = meanrobust(Data)
    else Means = sample(Data, K)
  }
  if(!is.null(SDs)){
    if(K != length(SDs)) stop("k should be equal to length(SDs)")
  }else{
    if(K == 1) SDs = stdrobust(Data)
    else SDs = runif(K,0,sd(Data))
  }
  if(is.null(Weights)) Weights = rep(1/K, K)
}else{
  if(!is.null(Means)) K = length(Means)
  else if(!is.null(SDs)) K = length(SDs)
  else if(!is.null(Weights)) K = length(Weights)
  else K = 1
  
  if(is.null(Means)) Means=meanrobust(Data)
  if(is.null(SDs)) SDs=stdrobust(Data)
  if(is.null(Weights)) Weights=rep(1/K,K)
}





# how many Clustercenters
L <- length(Means) 
Ls <- length(SDs) 
La <- length(Weights)

if (!((L == Ls ) && (L== La))){
  warning('EMgauss: Number of means not equal to number of sdevs or Weights')
}
if(fast){

  if (!(abs(sum(Weights)-1) <0.001)){
  sumWeight <- sum(Weights)
  for (i in 1:length(Weights)){
    Weights[i] <- Weights[i]/sumWeight
  }
  
  }
  
  AnzCases <- length(Data)
  EMmean <- Means 
  EMsdev <- SDs 
  EMweights <- Weights
  
  
  for (t in 1:MaxNumberofIterations){ # update EM Parameters
    #1.) compute actual alpha weighted probability for each point
    Na <- matrix(0,AnzCases,L)
    for (i in 1:L) {# compute actual Distributions
      Na[1:AnzCases,i] <- normpdf(Data,EMmean[i],EMsdev[i])* EMweights[i]
    }   # for (i in 1:L) # compute actual Distributions
    #2.) compute total probability for each point
    TotalEMGauss <- seq(0,0,length.out=AnzCases)
    for (j in 1:AnzCases){
      TotalEMGauss[j] <- sum(Na[j,1:L])
    }   
    #3.) compute point's weight as 1.) divided by 2.)
    w <- matrix(0,AnzCases,L)
    for (j in 1:L){
      w[1:AnzCases,j] <- Na[1:AnzCases,j]/TotalEMGauss
    }
    #4.) update Weightstrich AND weights
    Weightstrich  <-  seq(0,0,length.out=L)
    for (j in 1:L){
      Weightstrich[j]  <-  sum(w[1:AnzCases,j]) # = alpha * AnzCases
      if (is.nan(Weightstrich[j]) || is.na(Weightstrich[j])){
        Weightstrich[j]  <-  0
      }
    }
    for (j in 1:L){
      if (Weightstrich[j]<0.01){
        Weightstrich[j] <- 0.009
      }
    }
    EMweights  <-  Weightstrich/AnzCases
   
    ZeroInd <- seq(0,0,length.out=L)
    for (j in 1:L){
      if (EMweights[j]<0.01){
        ZeroInd[j] <- 1
        EMweights[j] <- 0
      }
    }
    if (sum(EMweights) <0.01){ # no right solution found: give up search & return robust values
      EMweights  <-  seq(0,0,length.out=L)
      EMmean <- EMweights
      EMsdev <- EMweights
      EMweights[1]  <- 1
      EMmean[1]  <-  meanrobust(Data)
      EMsdev[1]  <-  stdrobust(Data)
      # output <- list(mean=EMmean,sdev=EMsdev,weight=EMweights)
      output <- list(Means=EMmean,SDs=EMsdev,Weights=EMweights)         # edited: OHG 05.2016
      warning("EM: no right solution found --> give up search & return robust values")
      return(output)
      break
    }
    #5.) update means
    for (j in 1:L){
      EMmean[j] <- sum(w[1:AnzCases,j]*Data)/Weightstrich[j]
    }
    # account for weight ==0
    for (j in 1:L){
      if (ZeroInd[j]){
        EMmean[j] <- 0
      }
    }
    #6. update std devs
    for (j in 1:L){
      EMsdev[j]  <-  sqrt(sum(w[1:AnzCases,j]*(Data-seq(1,1,length.out=AnzCases)*EMmean[j])^2)/Weightstrich[j])
    }
    # account for weight ==0
    for (j in 1:L){
      if (ZeroInd[j]){
        EMsdev[j] <- 0
      }
    } 
  }# for t=1:MaxNumberofIterations # update EM Parameters
  
  #output <- list(mean=EMmean,sdev=EMsdev,weight=EMweights)
  output <- list(Means=EMmean,SDs=EMsdev,Weights=EMweights)

}else{ #fast=f
#requireRpackage('mclust')
#iterations???
requireNamespace('mclust')
em=mclust::densityMclust(Data,G=L, modelName='V', parameters=list(pro=Weights,mean=Means,variance=sqrt(SDs),control=list(itmax=c(MaxNumberofIterations,MaxNumberofIterations))))
res=em$parameters

# sigmasq entspricht hier sigma^2 = sd^2 => fuer SD nochmal Wurzel ziehen
output <- list(Means=unname(res$mean),SDs=sqrt(unname(res$variance$sigmasq)),Weights=unname(res$pro))
#output <- list(mean=unname(res$mean),sdev=unname(res$variance$sigmasq),weight=unname(res$pro))
return(output)
} # end if fast
} # eof