R/prob_clust_prior.R

Defines functions prob_clust_estep prob_clust_mstep prob_clust_prior

#' Probabilistic clustering helper function
#'
#' Alternating function for probabilistic clustering. Constraints on cluster
#' sizes are given as a prior distribution.
#'
#' @param data A matrix or data.frame containing the data.
#' @param weights A vector of weights for each data point.
#' @param init_mu Parameters (locations) that define the k distributions.
#' @param k The number of clusters.
#' @param prior_cl_sizes All the possible values for cluster sizes.
#' @param prior_prob The corresponding probabilities (sum to 1).
#' @param lambda Outgroup-parameter.
#' @param divide_objects If TRUE, objects can be divided to multiple clusters.
#' @return A list containting the new cluster allocations for each object in data, the new cluster center locations and maximum of the objective function.
#' @keywords internal
prob_clust_prior <- function(data,
                             weights,
                             k,
                             init_mu,
                             prior_cl_sizes,
                             prior_prob,
                             lambda = 0,
                             divide_objects = FALSE) {

  # Number of objects in data
  n <- nrow(data)

  # Weights must be integers
  weights <- round(weights)

  # Equally weighted data
  data_ew <- apply(data, MARGIN = 2, FUN = rep, times = weights)

  # Original id:s for each data point in equally weighted data
  id_ew <- rep(1:n, times = weights)

  # Initial clusters
  clusters <- rep(0,n)

  #Cluster centers
  mu <- init_mu

  max_sim <- 50
  for (iter in 1:max_sim) {
    # Old mu is saved to check for convergence
    old_mu <- mu

    # Clusters in equally weighted data (Allocation-step)
    temp_allocation <- prob_clust_estep(data_ew, mu, k, prior_cl_sizes, prior_prob, lambda)

    # Clusters in equally weighted data
    clusters_ew <- temp_allocation[[1]]

    # Outgroup-clusters (cluster 99)
    clusters_ew <- ifelse(clusters_ew == (k+1), 99, clusters_ew)

    # Maximum value of the objective function
    obj_max <- temp_allocation[[2]]

    # Updating cluster centers (Parameter-step)
    mu <- prob_clust_mstep(data_ew, clusters_ew, k)

    # Printing the iteration lap
    print(paste("Iteration:",iter))

    # If nothing is changing, stop
    if(all(old_mu == mu)) break
  }

  if(!divide_objects) {
    # Converting clusters_ew to original data
    for (i in 1:n) {
      clusters[i] <- getmode(clusters_ew[id_ew == i])
    }
  } else {
    # Objects can be divided to multiple clusters
    clusters <- clusters_ew
  }
  return(list(clusters = clusters, centers = mu, obj = obj_max))
}

#' Update the parameters (centers) for each cluster.
#'
#' @param data_ew A matrix or data.frame containing the data.
#' @param clusters_ew A vector of cluster assignments for each data point.
#' @param k The number of clusters.
#' @return New cluster centers.
#' @keywords internal
prob_clust_mstep <- function(data_ew, clusters_ew, k){

  # Matrix for cluster centers
  mu <- matrix(0, nrow = k, ncol = length(data_ew[1,]))

  # Update mu given
  for (i in 1:k) {
    mu[i,] <- c(mean(data_ew[clusters_ew == i, 1]), mean(data_ew[clusters_ew == i, 2]))
  }

  return(mu)
}

#' Update cluster allocations by maximizing the joint log-likelihood (Allocation-step).
#'
#' @param data_ew A matrix or data.frame containing the data, where each object is considered to be equally weighted.
#' @param mu Parameters (locations) that define the k distributions.
#' @param k The number of clusters.
#' @param prior_cl_sizes All the possible values for cluster sizes.
#' @param prior_prob Corresponding probabilities (sum to 1).
#' @param lambda Outgroup-parameter.
#' @return New cluster allocations for each object in data_ew
#' @keywords internal
prob_clust_estep <- function(data_ew, mu, k, prior_cl_sizes, prior_prob, lambda) {

  # Number of objects in data_ew
  n <- length(data_ew[,1])

  # Log priors for possible cluster sizes
  prior_log_prob <- log(prior_prob)

  # Matrix contains the log-likelihoods of the individual data points
  C <- apply(mu, MARGIN = 1, FUN = mvtnorm::dmvnorm, x = data_ew, sigma = diag(2), log = TRUE)

  # Number of possible cluster sizes
  A <- length(prior_cl_sizes)

  # New linear program model object containing, where the number of decision variables is n * k + k * A
  lp1 <- lpSolveAPI::make.lp(nrow = 0, ncol = n * k + k * A)

  # Maximizing the joint log-likelihood
  lpSolveAPI::lp.control(lp1, sense = "max")

  # Binary integer problem
  lpSolveAPI::set.type(lp1, 1:(n*k + k*A), "binary")

  # Objective function in the optimization problem
  lpSolveAPI::set.objfn(lp1, c(C, t(matrix(1, ncol = k, nrow = A) * prior_log_prob)))

  # Constraint 1: each point is assigned to exactly one cluster
  for (i in 1:n) {
    lpSolveAPI::add.constraint(lp1, c(as.numeric(rep(1:n == i, k)), rep(0, k * A)), "=", 1)
  }

  # Constraint 2: each cluster has exactly one size
  for (i in 1:k) {
    lpSolveAPI::add.constraint(lp1, c(rep(0, k * n), as.numeric(rep(1:k == i, A))), "=", 1)
  }

  # Constraint 3: each cluster has S_a points
  temp <- c(t(matrix(1, ncol = n, nrow = k) * 1:k))
  temp2 <- c(t(-matrix(1, ncol = k, nrow = A) * prior_cl_sizes))
  for (i in 1:k) {
    lpSolveAPI::add.constraint(lp1, c(as.numeric(temp == i, n), temp2 * as.numeric(rep(1:k == i, A))), "=", 0)
  }

  # Solving the optimization problem
  solve(lp1)

  # Maximum of the objective function
  obj_max <- round(lpSolveAPI::get.objective(lp1), digits = 2)

  # Print the value of the objective function
  print(paste("Value of the objective function:", obj_max))

  return(list(apply(matrix(lpSolveAPI::get.variables(lp1)[1:(n * k)], ncol = k), 1, which.max), obj_max))
}
terolahderanta/Probabilistic_clustering documentation built on April 22, 2021, 8:44 p.m.