R/utils.R

Defines functions log_sum_exp Tr getall evaluation_metrics generate_patterns hide_values mean_impute initialize_clusters cluster_pars clusters_to_matrix

Documented in evaluation_metrics generate_patterns hide_values initialize_clusters mean_impute

###########################################################################
###                                                                     ###
###                    Cluster Memberships to Matrix                    ###
###                                                                     ###
###########################################################################

clusters_to_matrix <- function(clusters, G = max(clusters)) {

  if (G %% 1 != 0) {
    stop('Number of clusters G must be an integer')
  }

  if (G < max(clusters)) {
    stop('G must be at least max(clusters)')
  }

  n <- length(clusters)

  if (G == 1) {

    matr <- matrix(1, nrow = n, ncol = 1)

  } else {

    matr <- matrix(0, nrow = n, ncol = G)

    for (g in 1:G) {
      matr[clusters == g, g] <- 1
    }

  }

  return(matr)

}

###########################################################################
###                                                                     ###
###                    Sample Statistics of Clusters                    ###
###                                                                     ###
###########################################################################

cluster_pars <- function(
    X,
    clusters
) {

  #---------------------#
  #    Input Checking   #
  #---------------------#

  if (is.data.frame(X)) {
    X <- as.matrix(X)
  }

  if (!is.matrix(X)) {
    X <- matrix(X, nrow = length(X), ncol = 1)
  }

  if (!is.numeric(X)) {
    stop('X must be a numeric matrix, data frame or vector')
  }

  if (length(clusters) != nrow(X)) {
    stop('clusters must match the number of observations')
  }

  if ( !is.vector(clusters) | !is.numeric(clusters) ) {
    stop('clusters must be a numeric vector')
  }

  # if (!all(clusters %in% 1:G)) {
  #   stop('All cluster memberships 1:G must be present')
  # }

  #--------------------------#
  #    Extract Parameters    #
  #--------------------------#

  n <- nrow(X)
  d <- ncol(X)
  G <- max(clusters)

  N <- table( factor(clusters, levels = 1:G) )

  if (any(N == 0)) {
    stop('At least one cluster has no observations')
  }

  py <- N / n

  incomplete <- any(is.na(X))

  if (incomplete) {
    cc       <- complete.cases(X)
    X        <- X[cc, ]
    clusters <- clusters[cc]
  }

  mu    <- matrix(NA_real_, nrow = G, ncol = d)
  Sigma <- array(NA_real_, dim = c(d, d, G))

  for (g in 1:G) {
    mu[g, ] <- colMeans(X[clusters == g, , drop = FALSE])

    if (N[g] == 1) {
      Sigma[, , g] <- matrix(0, nrow = d, ncol = d)
    } else {
      Sigma[, , g] <- var(X[clusters == g, , drop = FALSE])
    }

  }

  #---------------------------------#
  #    Prepare and return output    #
  #---------------------------------#

  output <- list(
    size  = N,
    pi    = py,
    mu    = mu,
    Sigma = Sigma
  )

  return(output)

}

###########################################################################
###                                                                     ###
###           Cluster Initialization using a Heuristic Method           ###
###                                                                     ###
###########################################################################

#' Cluster Initialization using a Heuristic Method
#'
#' Initialize cluster memberships and component parameters to start the EM algorithm
#' using a heuristic clustering method or user-defined labels.
#'
#' @param X An \eqn{n} x \eqn{d} matrix or data frame where \eqn{n} is the number of
#'   observations and \eqn{d} is the number of columns or variables. Alternately,
#'   \code{X} can be a vector of \eqn{n} observations.
#' @param G The number of clusters, which must be at least 1. If \code{G = 1}, then
#'   user-defined \code{clusters} is ignored.
#' @param init_method (optional) A string specifying the method to initialize
#'   the EM algorithm. "kmedoids" clustering is used by default. Alternative
#'   methods include "kmeans", "hierarchical", "manual". When
#'   "manual" is chosen, a vector \code{clusters} of length \eqn{n} must
#'   be specified. When \code{G = 1} and "kmedoids" clustering is used, the medoid
#'   will be returned, not the sample mean.
#' @param clusters A numeric vector of length \eqn{n} that specifies the initial
#'   cluster memberships of the user when \code{init_method} is set to "manual".
#'   This argument is NULL by default, so that it is ignored whenever other given
#'   initialization methods are chosen.
#'
#' @details Available heuristic methods include k-medoids clustering, k-means clustering,
#'   and hierarchical clustering. Alternately, the user can also enter pre-specified
#'   cluster memberships, making other initialization methods possible. If the given
#'   data set contains missing values, only observations with complete records will
#'   be used to initialize clusters. However, in this case, except when \code{G = 1}, the resulting cluster
#'   memberships will be set to \code{NULL} since they represent those complete records
#'   rather than the original data set as a whole.
#'
#' @return A list with the following slots:
#'   \item{pi}{Component mixing proportions.}
#'   \item{mu}{A \eqn{G} by \eqn{d} matrix where each row is the component mean vector.}
#'   \item{Sigma}{A \eqn{G}-dimensional array where each \eqn{d} by \eqn{d} matrix
#'     is the component covariance matrix.}
#'   \item{clusters}{An numeric vector with values from 1 to \eqn{G} indicating
#'     initial cluster memberships if \code{X} is a complete data set; NULL otherwise.}
#'
#' @references
#' Everitt, B., Landau, S., Leese, M., and Stahl, D. (2011). \emph{Cluster Analysis}. John Wiley & Sons. \cr \cr
#' Kaufman, L. and Rousseeuw, P. J. (2009). \emph{Finding  groups  in  data:  an
#'   introduction  to  cluster analysis}, volume 344. John Wiley & Sons. \cr \cr
#' Hartigan, J. A. and Wong, M. A. (1979). Algorithm AS 136: A K-means clustering
#'  algorithm. \emph{Applied Statistics}, \strong{28}, 100-108. doi: 10.2307/2346830.
#'
#' @examples
#'
#' #++++ Initialization using a heuristic method ++++#
#'
#' set.seed(1234)
#'
#' init <- initialize_clusters(iris[1:4], G = 3)
#' init <- initialize_clusters(iris[1:4], G = 3, init_method = 'kmeans')
#' init <- initialize_clusters(iris[1:4], G = 3, init_method = 'hierarchical')
#'
#' #++++ Initialization using user-defined labels ++++#
#'
#' init <- initialize_clusters(iris[1:4], G = 3, init_method = 'manual',
#'                             clusters = as.numeric(iris$Species))
#'
#' #++++ Initial parameters and pairwise scatterplot showing the mapping ++++#
#'
#' init$pi
#' init$mu
#' init$Sigma
#' init$clusters
#'
#' pairs(iris[1:4], col = init$clusters, pch = 16)
#'
#' @import cluster
#' @export
initialize_clusters <- function(
    X,
    G,
    init_method = c("kmedoids", "kmeans", "hierarchical", "mclust", "manual"),
    clusters    = NULL
) {

  #----------------------#
  #    Input checking    #
  #----------------------#

  if (G < 1) {
    stop('Number of clusters G must be at least 1')
  }

  if (G %% 1 != 0) {
    stop('Number of clusters G must be an integer')
  }

  if (is.data.frame(X)) {
    X <- as.matrix(X)
  }

  if (!is.matrix(X)) {
    X <- matrix(X, nrow = length(X), ncol = 1)
  }

  if (!is.numeric(X)) {
    stop('X must be a numeric matrix')
  }

  init_method <- match.arg(init_method)

  #------------------------------#
  #    Cluster Initialization    #
  #------------------------------#

  n <- nrow(X)
  d <- ncol(X)

  incomplete <- any(is.na(X))

  if (incomplete) {
    cc <- complete.cases(X)
    X  <- X[cc, ]

    if (init_method == 'manual') {
      clusters <- clusters[cc]
    }
  }

  if (G == 1) {

    clusters <- rep(1, n)

    if (init_method == 'kmedoids') {
      kmed <- pam(X, G)
    }

  } else {

    #-----------------#
    #    K-Medoids    #
    #-----------------#

    if (init_method == 'kmedoids') {
      kmed     <- pam(X, G)
      clusters <- kmed$clustering
    }

    #---------------#
    #    K-Means    #
    #---------------#

    if (init_method == 'kmeans') {
      km       <- kmeans(X, G, nstart = 10)
      clusters <- km$cluster
    }

    #-------------------------------#
    #    Hierarchical Clustering    #
    #-------------------------------#

    if (init_method == 'hierarchical') {
      hc       <- hclust(dist(X), method = "ward.D")
      clusters <- cutree(hc, k = G)
    }

    #---------------------------------#
    #    Gaussian Mixture (mclust)    #
    #---------------------------------#

    if (init_method == 'mclust') {

      if (d == 1) {
        mc <- mclust::Mclust(X, G = G, modelNames = 'V', verbose = FALSE)

        pi       <- mc$parameters$pro
        mu       <- matrix(mc$parameters$mean, nrow = G, ncol = 1)
        Sigma    <- array(mc$parameters$variance$sigma, dim = c(1, 1, G))
        clusters <- mc$classification
      }

      if (d > 1) {
        mc <- mclust::Mclust(X, G = G, modelNames = 'VVV', verbose = FALSE)

        pi       <- mc$parameters$pro
        mu       <- t(mc$parameters$mean)
        Sigma    <- mc$parameters$variance$sigma
        clusters <- mc$classification
      }

    }

    #-------------------------------#
    #    User-Defined Clustering    #
    #-------------------------------#

    if (init_method == 'manual') {

      if (is.null(clusters)) {
        stop('clusters must be specified if manual initialization is chosen')
      }

      if ( !is.vector(clusters) | !is.numeric(clusters) ) {
        stop('clusters must be a numeric vector')
      }

      if (length(clusters) != nrow(X)) {
        stop('clusters must match the number of observations')
      }

      if (!all(clusters %in% 1:G)) {
        stop('All cluster memberships 1:G must be present')
      }

    }

  }

  if (init_method != 'mclust') {
    pars <- cluster_pars(X, clusters)

    pi <- pars$pi

    if (init_method == 'kmedoids') {
      mu <- kmed$medoids
    } else {
      mu <- pars$mu
    }

    Sigma <- pars$Sigma
  }

  #---------------------#
  #    Prepare Output   #
  #---------------------#

  if (incomplete & G > 1) {
    clusters <- NULL
  }

  output <- list(
    pi       = pi,
    mu       = mu,
    Sigma    = Sigma,
    clusters = clusters
  )

  return(output)

}

###########################################################################
###                                                                     ###
###                           Mean Imputation                           ###
###                                                                     ###
###########################################################################

#' Mean Imputation
#'
#' Replace missing values of data set by the mean of other observed values.
#'
#' @param X An \eqn{n} x \eqn{d} matrix or data frame where \eqn{n} is the number of
#'   observations and \eqn{d} is the number of columns or variables. Alternately,
#'   \code{X} can be a vector of \eqn{n} observations.
#'
#' @return A complete data matrix with missing values imputed accordingly.
#'
#' @references
#' Schafer, J. L. and Graham, J. W. (2002). Missing data: our view of the state of the art.
#'   \emph{Psychological Methods}, 7(2):147–177.  \cr \cr
#' Little, R. J. A. and Rubin, D. B. (2020). \emph{Statistical analysis with missing data}.
#'   Wiley Series in Probability and Statistics. Wiley, Hoboken, NJ, 3rd edition
#'
#' @examples
#'
#' X <- matrix(nrow = 6, ncol = 3, byrow = TRUE, c(
#'   NA,  2,  2,
#'    3, NA,  5,
#'    4,  3,  2,
#'   NA, NA,  3,
#'    7,  2, NA,
#'   NA,  4,  2
#' ))
#'
#'
#' mean_impute(X)
#'
#' @export
mean_impute <- function(X) {

  #---------------------#
  #    Input Checking   #
  #---------------------#

  if (is.data.frame(X)) {
    X <- as.matrix(X)
  }

  if (!is.matrix(X)) {
    X <- matrix(X, nrow = length(X), ncol = 1)
  }

  if (!is.numeric(X)) {
    stop('X must be a numeric matrix, data frame or vector')
  }

  #-----------------------#
  #    Mean Imputation    #
  #-----------------------#

  n <- nrow(X)
  d <- ncol(X)

  centroid <- colMeans(X, na.rm = TRUE)

  for (h in 1:d) {
    X[!complete.cases(X[, h]), h] <- centroid[h]
  }

  return(X)

}


###################################################################
###                                                             ###
###       Randomly introduce missing values to a data set       ###
###                                                             ###
###################################################################

#' Missing Values Generation
#'
#' A convenient function that randomly introduces missing values to an at-least-bivariate
#' data set. The user can specify either the proportion of observations that contain some
#' missing values or the exact number of observations that contain some missing values.
#' Note that the function does not guarantee that underlying missing-data
#' mechanism to be missing at random (MAR).
#'
#' @param X An \eqn{n} by \eqn{d} matrix or data frame where \eqn{n} is the number of
#'   observations and \eqn{d} is the number of columns or variables. \code{X} must
#'   have at least 2 rows and 2 columns.
#' @param prop_cases (optional) Proportion of observations that contain some missing values.
#'   \code{prop_cases} must be a number in \eqn{(0, 1)}. \code{prop_cases = 0.1}
#'   by default, but will be ignored if \code{n_cases} is specified.
#' @param n_cases (optional) Number of observations that contain some missing values.
#'   \code{n_cases} must be an integer ranging from 1 to \code{nrow(X) - 1}.
#'
#' @return The orginal \eqn{n} by \eqn{d} matrix or data frame with missing values.
#'
#' @details If subject to missingness, an observation can have at least 1 and at
#'   most \code{ncol(X) - 1} missing values. Depending on the data
#'   set, it is not guaranteed that the resulting matrix will have the number of
#'   rows with missing values matches the specified proportion.
#'
#' @examples
#' set.seed(1234)
#'
#' hide_values(iris[1:4])
#' hide_values(iris[1:4], prop_cases = 0.5)
#' hide_values(iris[1:4], n_cases = 80)
#'
#' @export
hide_values <- function(
    X,                # numeric data matrix or data frame
    prop_cases = 0.1, # proportion of observations subject to missingness
    n_cases = NULL    # number of observations with missing values; if specified, prop_cases is ignored
) {

  if (!is.matrix(X) & !is.data.frame(X)) {
    stop('X must be a matrix or data frame')
  }

  n <- nrow(X)
  d <- ncol(X)

  if (n < 2) {
    stop('X must have at least 2 rows')
  }

  if (d < 2) {
    stop('X must have at least 2 columns')
  }

  if (is.null(n_cases)) {
    if (prop_cases <= 0 | prop_cases >= 1) {
      stop('prop_cases must be in (0, 1)')
    }

    n_cases <- floor(prop_cases * n)

    if (n_cases < 1) {
      stop('prop_cases is too small')
    }
  } else {
    if (n_cases < 1 | n_cases > n - 1 | n_cases %% 1 != 0) {
      stop('n_cases must be an integer in (1, n - 1)')
    }
  }

  indices_to_hide <- sample(1:n, n_cases)

  for (i in indices_to_hide) {
    n_cols_to_hide     <- sample(1:(d - 1), 1)
    cols_to_hide       <- sample(1:d, n_cols_to_hide)
    X[i, cols_to_hide] <- NA
  }

  return(X)
}

##########################################################
###                                                    ###
###       Generate general missing-data patterns       ###
###                                                    ###
##########################################################

#' Missing-Data Pattern Generation
#'
#' Generate all possible missing patterns in a multivariate data set. The function
#' can be used to complement the function \code{ampute()} from package \code{mice}
#' in which a matrix of patterns is needed to allow for general missing-data
#' patterns with missing-data mechanism missing at random (MAR). Using this
#' function, each observation can have more than one missing value.
#'
#' @param d The number of variables or columns of the data set. \code{d} must be
#'   an integer greater than 1.
#'
#' @return A matrix where 0 indicates that a variable should have missing values
#'   and 1 indicates that a variable should remain complete. This matrix has \code{d}
#'   columns and \eqn{2^d - 2} rows.
#'
#' @details An observation cannot have all values missing values. A complete observation
#'   is not qualified for missing-data pattern. Note that a large value of \code{d} may
#'   result in memory allocation error.
#'
#' @examples
#' generate_patterns(4)
#'
#' #++++ To use with the function ampute() from package mice ++++#
#' library(mice)
#'
#' patterns_matr <- generate_patterns(4)
#' data_missing <- ampute(iris[1:4], prop = 0.5, patterns = patterns_matr)$amp
#'
#' @importFrom mice ampute
#' @export
generate_patterns <- function(d) {

  if (d < 2) {
    stop('Number of variables must be at least 2')
  }

  if (d %% 1 != 0) {
    stop('Number of variables d must be an integer')
  }

  matr <- expand.grid(replicate(d, 0:1, simplify = FALSE))[c(-1, -2^d), ]
  matr <- as.matrix(matr)
  matr <- matr[order(rowSums(matr), decreasing = F), ]
  matr <- matr[nrow(matr):1, ]

  matr           <- as.matrix(matr)
  rownames(matr) <- NULL
  colnames(matr) <- NULL

  return(matr)

}

############################################################################
###                                                                      ###
###                   Binary Classification Evaluation                   ###
###                                                                      ###
############################################################################

#' Binary Classification Evaluation
#'
#' Evaluate the performance of a classification model by comparing its predicted
#' labels to the true labels. Various metrics are returned to give an insight on
#' how well the model classifies the observations. This function is added to aid
#' outlier detection evaluation of MCNM and MtM in case that true outliers are
#' known in advance.
#'
#' @param true_labels An 0-1 or logical vector denoting the true labels. The
#'   meaning of 0 and 1 (or TRUE and FALSE) is up to the user.
#' @param pred_labels An 0-1 or logical vector denoting the true labels. The
#'   meaning of 0 and 1 (or TRUE and FALSE) is up to the user.
#'
#' @return A list with the following slots:
#'   \item{matr}{The confusion matrix built upon true labels and predicted labels.}
#'   \item{TN}{True negative.}
#'   \item{FP}{False positive (type I error).}
#'   \item{FN}{False negative (type II error).}
#'   \item{TP}{True positive.}
#'   \item{TPR}{True positive rate (sensitivy).}
#'   \item{FPR}{False positive rate.}
#'   \item{TNR}{True negative rate (specificity).}
#'   \item{FNR}{False negative rate.}
#'   \item{precision}{Precision or positive predictive value (PPV).}
#'   \item{accuracy}{Accuracy.}
#'   \item{error_rate}{Error rate.}
#'   \item{FDR}{False discovery rate.}
#'
#' @examples
#'
#' #++++ Inputs are 0-1 vectors ++++#
#'
#' evaluation_metrics(
#'   true_labels = c(1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1),
#'   pred_labels = c(1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1)
#' )
#'
#' #++++ Inputs are logical vectors ++++#
#'
#' evaluation_metrics(
#'   true_labels = c(TRUE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE),
#'   pred_labels = c(FALSE, FALSE, TRUE, FALSE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE)
#' )
#'
#' @export
evaluation_metrics <- function(
    true_labels,   # a 0-1 or logical vector
    pred_labels    # a 0-1 or logical vector
) {

  #----------------------#
  #    Input checking    #
  #----------------------#

  if (!is.vector(true_labels) | !is.vector(pred_labels)) {
    stop('true_labels and pred_labels must be vectors')
  }

  if (any(is.na(true_labels)) | any(is.na(pred_labels))) {
    stop('true_labels and pred_labels must not contain NA')
  }

  if (is.logical(true_labels)) {
    true_labels <- as.numeric(true_labels)
  }

  if (!is.numeric(true_labels)) {
    stop('true_labels must be numeric or logical')
  }

  if (is.logical(pred_labels)) {
    pred_labels <- as.numeric(pred_labels)
  }

  if (!is.numeric(pred_labels)) {
    stop('pred_labels must be numeric or logical')
  }

  if (length(true_labels) != length(pred_labels)) {
    stop('true_labels and pred_labels must have the same length')
  }

  #------------------------#
  #    Confusion matrix    #
  #------------------------#

  # cat('
  #   Structure of the confusion matrix:
  #
  #                     pred_labels
  #                         0     1
  #   true_labels        0 TN    FP
  #                      1 FN    TP\n\n')

  matr <- matrix(0, nrow = 2, ncol = 2,
                 dimnames = list(c('true_0', 'true_1'),
                                 c('pred_0', 'pred_1')))

  tn <- sum(true_labels == 0 & pred_labels == 0)    # true negative
  fp <- sum(true_labels == 0 & pred_labels == 1)    # false positive
  fn <- sum(true_labels == 1 & pred_labels == 0)    # false negative
  tp <- sum(true_labels == 1 & pred_labels == 1)    # true positive

  matr[1, 1] <- tn
  matr[1, 2] <- fp
  matr[2, 1] <- fn
  matr[2, 2] <- tp

  p   <- fn + tp    # real positive cases
  n   <- tn + fp    # real negative cases

  tpr <- tp / (tp + fn)    # true positive rate (sensitivy)
  fpr <- fp / (fp + tn)    # false positive rate (specificity)
  tnr <- tn / (tn + fp)    # true negative rate
  fnr <- fn / (fn + tp)    # false negative rate

  preci <- tp / (tp + fp)         # precision
  acc   <- (tn + tp) / (p + n)    # accuracy
  err   <- 1 - acc                # error rate
  fdr   <- fp / (fp + tp)         # false discovery rate

  output <- list(
    matr       = matr,
    TN         = tn,
    FP         = fp,
    FN         = fn,
    TP         = tp,
    TPR        = tpr,
    FPR        = fpr,
    TNR        = tnr,
    FNR        = fnr,
    precision  = preci,
    accuracy   = acc,
    error_rate = err,
    FDR        = fdr
  )

  return(output)
}


############################################################################
###                                                                      ###
###       Approximate the Asymptotic Maximum of the Log-Likelihood       ###
###                                                                      ###
############################################################################

getall <- function(loglik) {
  if (length(loglik) < 3) {
    return(Inf)
  }
  n       <- length(loglik)
  lm1     <- loglik[n]
  lm      <- loglik[(n-1)]
  lm_1    <- loglik[(n-2)]
  am      <- (lm1 - lm)/(lm - lm_1)
  lm1.Inf <- lm + (lm1 - lm)/(1-am)
  val     <- lm1.Inf - lm

  if (is.nan(val)) {
    val = 0
  }

  if (val < 0) {
    val = 1
  }

  return( val )
}

############################################################################
###                                                                      ###
###                       Trace of a Square Matrix                       ###
###                                                                      ###
############################################################################

Tr <- function(X) {

  if (!is.matrix(X)) {
    stop('X must be a matrix')
  }

  if (nrow(X) != ncol(X)) {
    stop('X must be a square matrix')
  }

  return(
    sum(diag(X))
  )

}

###########################################################################
###                                                                     ###
###                             Log-Sum-Exp                             ###
###                                                                     ###
###########################################################################

log_sum_exp <- function(l) {

  l_max <- max(l)

  return( l_max + log(sum(exp(l - l_max))) )
}

Try the MixtureMissing package in your browser

Any scripts or data that you put into this service are public.

MixtureMissing documentation built on Oct. 16, 2024, 1:09 a.m.