R/pkbc.R

Defines functions pkbc norm_vec

Documented in pkbc

#################################################
# R code for initial experiments for mixture of poisson kernel based distributions
# for clustering on a hyper-sphere
# Author: Arti Shivram, Alex Foss
# Modified: 6/16/2016 AS
# Subsequent changes by AF documented in github
#################################################

## Not all are necessary
#library("slam")
#library("lattice")
#library("RColorBrewer")
#library("NLP")
#library("latticeExtra")
#library("movMF")
#library("reshape")
#library("tm")
#library("clue")
#library("skmeans")

#######################################################
# A few functions
#########################################################
#
#----------------------------------------------------------#
# faster to use t(x) %*% x notation?
norm_vec <- function(x) sqrt(sum(x^2))
#----------------------------------------------------------#

# inputs to the function are the (i) data (ii) numClust - number of clusters

#' Poisson kernel-based clustering of spherical data.
#'
#' Poisson kernel-based clustering of spherical data.
#'
#' The parameter \code{stoppingRule} determines how iterations are terminated
#' within each run. Value 'max' simply runs to completion until \code{maxIter}
#' iterations have completed; the value 'membership' runs until
#' \code{maxIter} iterations have completed or group membership remains
#' unchanged from one iteration to the next, whichever comes first; and the
#' value 'loglik' runs until the log-likelihood remains unchanged from one
#' iteration to the next or \code{maxIter} iterations have completed, whichever
#' comes first.
#'
#' The parameter \code{initMethod} determines how initialization of the means
#' in each EM run is accomplished. Option 'sampleData' samples unique points
#' from the input data set without replacement to initalize means (the only
#' option currently).
#' 
#' @export
#' @param dat The matrix of data to be clustered.
#' @param numClust The number of clusters.
#' @param maxIter The maximum number of iterations before a run is terminated.
#' @param stoppingRule A string describing the stopping rule to be
#'   used within each run. Currently must be either \code{'max'},
#'   \code{'membership'}, or \code{'loglik'}.
#' @param initMethod A string describing the initialization method to be used.
#'   Currently must be \code{'sampleData'}.
#' @param numInit The number of initializations.
pkbc <- function(
  dat,
  numClust,
  maxIter = 300,
  stoppingRule = 'loglik',
  initMethod = 'sampleData',
  numInit = 10
  ) {

  # Constant defining threshold by which log likelihood must change to continue
  # iterations, if applicable.
  LL_TOL <- 1e-7

  # validate input
  if (numClust < 1) {
    stop('Input parameter numClust must be greater than 0')
  }
  if (maxIter < 1) {
    stop('Input parameter maxIter must be greater than 0')
  }
  if ( !(stoppingRule %in% c('max', 'membership', 'loglik')) ) {
    stop(paste('Unrecognized value "', stoppingRule, '" in input parameter
      stoppingRule.', sep=''))
  }
  if ( !(initMethod %in% c('sampleData')) ) {
    stop(paste('Unrecognized value "', initMethod, '" in input parameter
      initMethod.', sep=''))
  }
  if (numInit < 1) {
    stop('Input parameter numInit must be greater than 0')
  }

  # set options for stopping rule
  checkMembership <- stoppingRule == 'membership'
  checkLoglik <- stoppingRule == 'loglik'

  # Normalize the data
  dat <- dat/sqrt(rowSums(dat^2))
  
  # Initialize variables of interest
  numVar <- ncol(dat)
  numData <- nrow(dat)
  logLikVec <- rep(-Inf, numInit)
  numIterPerRun <- rep(-99, numInit)
  alpha_best <- rep(-99, numClust)
  rho_best <- rep(-99, numClust)
  mu_best <- matrix(nrow = numClust, ncol = numVar)
  normprobMat_best <- matrix(-99,nrow=numData, ncol=numClust)
  if (initMethod == 'sampleData') {
    uniqueData <- unique(dat)
    numUniqueObs <- nrow(uniqueData)
    if (numUniqueObs < numClust) {
      stop(paste('Only', numUniqueObs, 'unique observations.',
        'When initMethod = "sampleData", must have more than numClust unique observations.'
      ))
    }
  }
    
  log_w_d <- (numVar / 2) * (log(2) + log(pi)) - lgamma(numVar / 2)

  # Begin initializations
  for (init in 1:numInit) {

    # initialize parameters
    alpha_current <- rep(1/numClust, numClust)
    rho_current <- rep(0.5,numClust)
    
    # Initializing mu
    if (initMethod == 'sampleData') {
      # sampling w/o replacement from unique data points
      mu_current <- uniqueData[sample(numUniqueObs, size=numClust, replace=FALSE) ,]
    }
    
    currentIter <- 1
    membCurrent <- rep.int(0, times = numData)
    loglikCurrent <- -Inf
 
    # Begin EM iterations
    while (currentIter <= maxIter) {
      v_mat <- dat %*% t(mu_current)
      alpha_mat_current <- matrix(
        alpha_current,
	nrow = numData,
	ncol = numClust,
	byrow = TRUE
      )
      rho_mat_current <- matrix(
        rho_current,
	nrow = numData,
	ncol = numClust,
	byrow = TRUE
      )
      log_probMat_denom <- log(1 + rho_mat_current^2 - 2*rho_mat_current*v_mat)
      # eq (11) in ICMLD16 paper
      log_probMat <- log(1 - (rho_mat_current^2)) - log_w_d - (numVar / 2) * log_probMat_denom
      ######### E step done#############################################
      ########## M step now#############################################
      # Denominator of eq (18) in ICMLD16 paper
      probSum <- matrix(
        exp(log_probMat) %*% alpha_current,
	nrow = numData,
	ncol = numClust
      )
      # eq (18) in ICMLD16 paper
      log_normProbMat_current <- log(alpha_mat_current) + log_probMat - log(probSum)
      # denominator of eq (20) in ICMLD16 paper
      log_weightMat <- log_normProbMat_current - log_probMat_denom

      # eq (19) in ICMLD16 paper
      alpha_current <- colSums(exp(log_normProbMat_current)) / numData
      # numerator of fraction in eq (21) of ICMLD16 paper
      mu_num_sum_MAT <- t(exp(log_weightMat)) %*% dat
      mu_denom <- apply(mu_num_sum_MAT, 1, norm_vec)
      # eq (21) of ICMLD16 paper without sign function
      mu_current <- mu_num_sum_MAT / mu_denom
      for(h in 1:numClust){
        # update rho params
        sum_h_weightMat <- sum(exp(log_weightMat[,h]))
	alpha_current_h <- alpha_current[h]
	mu_denom_h <- mu_denom[h]
        root_func <- function(x) {
          (-2*numData*x*alpha_current_h)/(1 - x^2) + numVar*mu_denom_h -numVar*x*sum_h_weightMat
        }
        rho_current[h] <- (uniroot(
	  f = root_func,
	  interval = c(0,1),
	  f.lower = root_func(0),
	  f.upper = root_func(1),
	  tol = 0.001
	))$root
      } # for h
 
      # Terminate iterations if maxIter is reached. Redundant with while
      # condition, but this ensures that the stored numIter below is accurate.
      if (currentIter >= maxIter) {
        break
      }
 
      # Terminate iterations if membership is unchanged, if applicable
      if (checkMembership) {
        # calculate and store group membership
        membPrevious <- membCurrent
        membCurrent <- apply(exp(log_normProbMat_current), 1, which.max)
        if (all(membPrevious == membCurrent)) {
          break
        }
      }
 

      # Terminate iterations if log-likelihood is unchanged, if applicable.
      if (checkLoglik) {
        loglikPrevious <- loglikCurrent
        # Calculate log likelihood for the current iteration
        loglikCurrent <- sum(log( exp(log_probMat) %*% alpha_current ))
        if (abs(loglikPrevious - loglikCurrent) < LL_TOL) {
          break
        }
      }
 
      # Update counter to NEXT iteration number.
      currentIter <- currentIter + 1
    } # for currentIter

    # Compare the current log likelihood to the best stored log likelihood;
    # if it exceeds the stored, then store it and all associated estimates. 
    # alpha, rho, mu, normprobMat
    loglikCurrent <- sum(log( exp(log_probMat) %*% alpha_current ))
    if (loglikCurrent > max(logLikVec)) {
      alpha_best <- alpha_current
      rho_best <- rho_current
      mu_best <- mu_current
      normprobMat_best <- exp(log_normProbMat_current)
    }
    logLikVec[init] <- loglikCurrent
    numIterPerRun[init] <- currentIter - 1

    # Store run info

  } # for init

  # Calculate final membership
  memb_best <- apply(normprobMat_best, 1, which.max)

  # Store and return results object.
  results <- list(
    postProbs = normprobMat_best,
    logLik = max(logLikVec),
    params = list(mu = mu_best, rho = rho_best, alpha = alpha_best),
    finalMemb = memb_best,
    runInfo = list(
      logLikVec = logLikVec,
      numIterPerRun = numIterPerRun
    ),
    input = list(
      numClust = numClust,
      maxIter = maxIter,
      stoppingRule = stoppingRule,
      initMethod = initMethod,
      numInit = numInit
    )
  )
  return(results)
} # pkbd
ahfoss/pkbd documentation built on June 4, 2017, 3:20 a.m.