R/computeInformation.R
In miic: Learning Causal or Non-Causal Graphical Models Using Information Theory

Documented in computeMutualInfo computeThreePointInfo

#*******************************************************************************
# Filename   : computeInformation.R
#
# Description: Compute 2 and 3 point (conditional) mutual information
#*******************************************************************************

#===============================================================================
# FUNCTIONS
#===============================================================================
# computeMutualInfo
#-------------------------------------------------------------------------------
#' Compute (conditional) mutual information
#' @description For discrete or categorical variables, the (conditional)
#' mutual information is computed using the empirical frequencies minus a
#' complexity cost (computed as BIC or with the Normalized Maximum Likelihood).
#' When continuous variables are present, each continuous variable is
#' discretized for each mutual information estimate so as to maximize the
#' mutual information minus the complexity cost (see Cabeli 2020).
#'
#' @details For a pair of continuous variables \eqn{X} and \eqn{Y}, the mutual
#' information \eqn{I(X;Y)} will be computed iteratively. In each iteration, the
#' algorithm optimizes the partitioning of \eqn{X} and then of \eqn{Y},
#' in order to maximize
#' \deqn{Ik(X_{d};Y_{d}) = I(X_{d};Y_{d}) - cplx(X_{d};Y_{d})}
#' where \eqn{cplx(X_{d}; Y_{d})} is the complexity cost of the corresponding
#' partitioning (see Cabeli 2020).
#' Upon convergence, the information terms \eqn{I(X_{d};Y_{d})}
#' and \eqn{Ik(X_{d};Y_{d})}, as well as the partitioning of \eqn{X_{d}}
#' and \eqn{Y_{d}} in terms of cutpoints, are returned.
#'
#' For conditional mutual information with a conditioning set \eqn{U}, the
#' computation is done based on
#' \deqn{
#'   Ik(X;Y|U) = 0.5*(Ik(X_{d};Y_{d},U_{d}) - Ik(X_{d};U_{d})
#'                  + Ik(Y_{d};X_{d},U_{d}) - Ik(Y_{d};U_{d})),
#' }
#' where each of the four summands is estimated separately.
#'
#' @references
#' \itemize{
#' \item Cabeli \emph{et al.}, PLoS Comput. Biol. 2020, \href{https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1007866}{Learning clinical networks from medical records based on information estimates in mixed-type data}
#' \item Affeldt \emph{et al.}, UAI 2015, \href{https://auai.org/uai2015/proceedings/papers/293.pdf}{Robust Reconstruction of Causal Graphical Models based on Conditional 2-point and 3-point Information}
#' }
#'
#' @param x [a vector]
#' The \eqn{X} vector that contains the observational data of the first variable.
#' @param y [a vector]
#' The \eqn{Y} vector that contains the observational data of the second variable.
#' @param df_conditioning [a data frame]
#' The data frame of the observations of the conditioning variables.
#' @param maxbins [an integer]
#' When the data contain continuous variables, the maximum number of bins
#' allowed during the discretization. A smaller number makes the computation
#' faster, a larger number allows finer discretization.
#' @param cplx [a string]
#' The complexity model:
#' \itemize{
#' \item["bic"] Bayesian Information Criterion
#' \item["nml"] Normalized Maximum Likelihood, more accurate complexity cost
#' compared to BIC, especially on small sample size.
#' }
#' @param n_eff [an integer]
#' The effective number of samples. When there is significant autocorrelation
#' between successive samples, you may want to specify an effective number of
#' samples that is lower than the total number of samples.
#' @param sample_weights [a vector of floats]
#' Individual weights for each sample, used for the same reason as the effective
#' number of samples but with individual weights.
#' @param is_continuous [a vector of booleans]
#' Specify if each variable is to be treated as continuous (TRUE) or discrete
#' (FALSE), must be of length `ncol(df_conditioning) + 2`, in the order
#' \eqn{X, Y, U1, U2, ...}. If not specified, factors and character vectors are
#' considered as discrete, and numerical vectors as continuous.
#' @param plot [a boolean]
#' Specify whether the resulting XY optimum discretization is to be plotted
#' (requires `ggplot2` and `gridExtra`).
#'
#' @return A list that contains :
#' \itemize{
#' \item cutpoints1: Only when \eqn{X} is continuous, a vector containing
#'   the cutpoints for the partitioning of \eqn{X}.
#' \item cutpoints2: Only when \eqn{Y} is continuous, a vector containing
#'   the cutpoints for the partitioning of \eqn{Y}.
#' \item n_iterations: Only when at least one of the input variables is
#'   continuous, the number of iterations it takes to reach the convergence of
#'   the estimated information.
#' \item iteration1, iteration2, ... Only when at least one of the input
#'   variables is continuous, the list of vectors of cutpoints of each
#'   iteration.
#' \item info: The estimation of (conditional) mutual information without the
#' complexity cost.
#' \item infok: The estimation of (conditional) mutual information with the
#' complexity cost (\eqn{Ik = I - cplx}).
#' \item plot: Only when `plot == TRUE`, the plot object.
#' }
#' @export
#' @useDynLib miic
#' @importFrom stats density sd
#'
#' @examples
#' library(miic)
#' N <- 1000
#' # Dependence, conditional independence : X <- Z -> Y
#' Z <- runif(N)
#' X <- Z * 2 + rnorm(N, sd = 0.2)
#' Y <- Z * 2 + rnorm(N, sd = 0.2)
#' res <- computeMutualInfo(X, Y, plot = FALSE)
#' message("I(X;Y) = ", res$info)
#' res <- computeMutualInfo(X, Y, df_conditioning = matrix(Z, ncol = 1), plot = FALSE)
#' message("I(X;Y|Z) = ", res$info)
#'
#' \donttest{
#' # Conditional independence with categorical conditioning variable : X <- Z -> Y
#' Z <- sample(1:3, N, replace = TRUE)
#' X <- -as.numeric(Z == 1) + as.numeric(Z == 2) + 0.2 * rnorm(N)
#' Y <- as.numeric(Z == 1) + as.numeric(Z == 2) + 0.2 * rnorm(N)
#' res <- miic::computeMutualInfo(X, Y, cplx = "nml")
#' message("I(X;Y) = ", res$info)
#' res <- miic::computeMutualInfo(X, Y, matrix(Z, ncol = 1), is_continuous = c(TRUE, TRUE, FALSE))
#' message("I(X;Y|Z) = ", res$info)
#'
#'
#' # Independence, conditional dependence : X -> Z <- Y
#' X <- runif(N)
#' Y <- runif(N)
#' Z <- X + Y + rnorm(N, sd = 0.1)
#' res <- computeMutualInfo(X, Y, plot = TRUE)
#' message("I(X;Y) = ", res$info)
#' res <- computeMutualInfo(X, Y, df_conditioning = matrix(Z, ncol = 1), plot = TRUE)
#' message("I(X;Y|Z) = ", res$info)
#' }
#-------------------------------------------------------------------------------
computeMutualInfo <- function(x, y,
                              df_conditioning = NULL,
                              maxbins = NULL,
                              cplx = c("nml", "bic"),
                              n_eff = -1,
                              sample_weights = NULL,
                              is_continuous = NULL,
                              plot = FALSE) {
  cplx <- tryCatch(
    {match.arg(cplx)},
    error = function(e) {
      if (grepl("object .* not found", e$message)) {
        message(e, "")
        return("")
      }
      return(toString(cplx))
    }
  )
  cplx <- match.arg(cplx)

  input_data = data.frame(x, y)
  if (!is.null(df_conditioning)) {
    input_data <- data.frame(input_data, df_conditioning)
  }

  if (!is.null(sample_weights) && length(sample_weights) != nrow(input_data)) {
    stop(paste(
      "Differing number of rows between `sample_weights` and input data:",
      length(sample_weights),
      length(x)
    ))
  }

  complete_row <- stats::complete.cases(input_data)
  n_rows_na <- sum(!complete_row)
  if (n_rows_na > 0) {
    input_data <- input_data[complete_row, ]
    warning(paste0(
      "Removed ", n_rows_na, " rows containing at least one NA value."
    ))
  }

  n_samples <- nrow(input_data)
  n_nodes <- ncol(input_data)

  if (n_samples < 3) {
    stop(paste0("Insufficient number of complete rows: ", nrow(input_data)))
  }

  if (is.null(is_continuous)) {
    is_continuous <- sapply(input_data, is.numeric)
  } else if (length(is_continuous) != n_nodes) {
    stop(paste(
      "Length of `is_continuous` does not match number of input variables:",
      length(is_continuous),
      n_nodes
    ))
  }

  # Numeric factor matrix, level starts from 0
  input_factor <- as.matrix(apply(input_data, 2,
    function(x) (as.numeric(factor(x, levels = unique(x))) - 1))
  )
  max_level_list <- as.numeric(apply(input_factor, 2, max)) + 1
  # Data list, numeric for continuous columns, -1 for discrete columns
  input_double <- matrix(nrow = n_samples, ncol = n_nodes)
  # Order list, order(column) for continuous columns (index starting from 0),
  # -1 for discrete columns
  input_order <- matrix(nrow = n_samples, ncol = n_nodes)
  for (i in c(1: n_nodes)) {
    if (is_continuous[i]) {
      input_double[, i] <- as.numeric(input_data[, i])
      input_order[, i] <- order(input_data[, i], na.last=NA) - 1
    } else {
      input_double[, i] <- rep_len(-1, n_samples)
      input_order[, i] <- rep_len(-1, n_samples)
    }
  }

  arg_list <- list(
    "cplx" = cplx,
    "is_continuous" = is_continuous,
    "levels" = max_level_list,
    "n_eff" = n_eff,
    "n_nodes" = n_nodes,
    "n_samples" = n_samples
  )
  # Continuous variables will be discretized during the computation
  if (any(is_continuous)) {
    initbins <- min(30, round(n_samples**(1 / 3)))
    if (is.null(maxbins) || maxbins > n_samples || maxbins < initbins) {
      maxbins <- min(n_samples, 5 * initbins, 50)
    }
    arg_list[["max_bins"]] <- maxbins
  }
  if (!is.null(sample_weights)) {
    arg_list[["sample_weights"]] <- sample_weights[complete_row, ]
  }
  cpp_input <- list(
    "factor" = as.vector(input_factor),
    "double" = as.vector(input_double),
    "order" = as.vector(input_order)
  )
  # Call cpp code
  rescpp <- mydiscretizeMutual(cpp_input, arg_list)

  result <- list()
  result$info <- rescpp$info
  result$infok <- rescpp$infok

  X_num <- if (is_continuous[1]) input_double[, 1] else input_factor[, 1]
  Y_num <- if (is_continuous[2]) input_double[, 2] else input_factor[, 2]

  if (any(is_continuous)) {
    # Parse cutpointsmatrix
    epsilon <- min(c(sd(X_num), sd(Y_num))) / 100
    niterations <- nrow(rescpp$cutpointsmatrix) / maxbins
    result$n_iterations <- niterations
    for (i in 0:(niterations - 1)) {
      result[[paste0("iteration", i + 1)]] <- list()
      for (l in 1:2) {
        if (!is_continuous[l]) next

        data <- if (l == 1) X_num else Y_num
        clean_cutpoints <- rescpp$cutpointsmatrix[, l][(maxbins*i) + (1:maxbins)]
        clean_cutpoints <- clean_cutpoints[clean_cutpoints != -1]
        clean_cutpoints <- sort(data)[clean_cutpoints + 1]

        uniquedata <- sort(unique(data))
        if (length(clean_cutpoints) > 0) {
          # Take midpoints between two consecutive unique values instead of
          # the values themselves
          clean_cutpoints <- sapply(clean_cutpoints, function(x) {
            if (x < uniquedata[length(uniquedata)]) {
              return((min(uniquedata[uniquedata > x]) +
                max(uniquedata[uniquedata <= x])) / 2)
            } else {
              return(x)
            }
          })
        }
        clean_cutpoints <- c(uniquedata[1] - epsilon, clean_cutpoints)
        if (max(clean_cutpoints) < uniquedata[length(uniquedata)]) {
          clean_cutpoints <- c(
            clean_cutpoints,
            uniquedata[length(uniquedata)] + epsilon
          )
        }
        result[[paste0("iteration", i + 1)]][[paste0("cutpoints", l)]] <-
          clean_cutpoints
      }
    }
    for (l in 1:n_nodes) {
      result[[paste0("cutpoints", l)]] <-
        result[[paste0("iteration", niterations)]][[paste0("cutpoints", l)]]
    }
  }

  if (plot) {
    nameDist1 <- deparse(substitute(x))
    nameDist2 <- deparse(substitute(y))
    if (base::requireNamespace("ggplot2", quietly = TRUE) &&
        base::requireNamespace("gridExtra", quietly = TRUE)) {
      if (all(is_continuous[1:2])) {
        result$plot <- jointplot_hist(X_num, Y_num, result, nameDist1, nameDist2)
      } else if (any(is_continuous[1:2])) {
        result$plot <- barplot_disc(
          input_data[, 1],
          input_data[, 2],
          result,
          !is_continuous,
          nameDist1,
          nameDist2
        )
      } else {
        result$plot <- grid_plot(
          input_data[, 1],
          input_data[, 2],
          nameDist1,
          nameDist2
        )
      }
    } else {
      warning("Plotting requires ggplot2 and gridExtra.")
    }
  }

  return(result)
}

#-------------------------------------------------------------------------------
# computeThreePointInfo
#-------------------------------------------------------------------------------
#' Compute (conditional) three-point information
#' @description Three point information is defined and computed as the
#' difference of mutual information and conditional mutual information, e.g.
#' \deqn{I(X;Y;Z|U) = I(X;Y|U) - Ik(X;Y|U,Z)}
#' For discrete or categorical variables, the three-point information is
#' computed with the empirical frequencies minus a complexity cost
#' (computed as BIC or with the Normalized Maximum Likelihood).
#'
#' @details For variables \eqn{X}, \eqn{Y}, \eqn{Z} and a set of conditioning
#' variables \eqn{U}, the conditional three point information is defined as
#' \deqn{Ik(X;Y;Z|U) = Ik(X;Y|U) - Ik(X;Y|U,Z)}
#' where \eqn{Ik} is the shifted or regularized conditional mutual information.
#' See \code{\link{computeMutualInfo}} for the definition of \eqn{Ik}.
#'
#' @references
#' \itemize{
#' \item Cabeli \emph{et al.}, PLoS Comput. Biol. 2020, \href{https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1007866}{Learning clinical networks from medical records based on information estimates in mixed-type data}
#' \item Affeldt \emph{et al.}, UAI 2015, \href{https://auai.org/uai2015/proceedings/papers/293.pdf}{Robust Reconstruction of Causal Graphical Models based on Conditional 2-point and 3-point Information}
#' }
#'
#' @param x [a vector]
#' The \eqn{X} vector that contains the observational data of the first variable.
#' @param y [a vector]
#' The \eqn{Y} vector that contains the observational data of the second variable.
#' @param z [a vector]
#' The \eqn{Z} vector that contains the observational data of the third variable.
#' @param df_conditioning [a data frame]
#' The data frame of the observations of the set of conditioning variables
#' \eqn{U}.
#' @param maxbins [an integer]
#' When the data contain continuous variables, the maximum number of bins
#' allowed during the discretization. A smaller number makes the computation
#' faster, a larger number allows finer discretization.
#' @param cplx [a string]
#' The complexity model:
#' \itemize{
#' \item["bic"] Bayesian Information Criterion
#' \item["nml"] Normalized Maximum Likelihood, more accurate complexity cost
#' compared to BIC, especially on small sample size.
#' }
#' @param n_eff [an integer]
#' The effective number of samples. When there is significant autocorrelation
#' between successive samples, you may want to specify an effective number of
#' samples that is lower than the total number of samples.
#' @param sample_weights [a vector of floats]
#' Individual weights for each sample, used for the same reason as the effective
#' number of samples but with individual weights.
#' @param is_continuous [a vector of booleans]
#' Specify if each variable is to be treated as continuous (TRUE) or discrete
#' (FALSE), must be of length `ncol(df_conditioning) + 3`, in the order
#' \eqn{X, Y, Z, U1, U2, ...}. If not specified, factors and character vectors
#' are considered as discrete, and numerical vectors as continuous.
#'
#' @return A list that contains :
#' \itemize{
#' \item i3: The estimation of (conditional) three-point information without the
#' complexity cost.
#' \item i3k: The estimation of (conditional) three-point information with the
#' complexity cost (\emph{i3k = i3 - cplx}).
#' \item i2: For reference, the estimation of (conditional) mutual information
#' \eqn{I(X;Y|U)} used in the estimation of \emph{i3}.
#' \item i2k: For reference, the estimation of regularized (conditional) mutual
#' information \eqn{Ik(X;Y|U)} used in the estimation of \emph{i3k}.
#' }
#' @export
#' @useDynLib miic
#' @importFrom stats density sd
#'
#' @examples
#' library(miic)
#' N <- 1000
#' # Dependence, conditional independence : X <- Z -> Y
#' Z <- runif(N)
#' X <- Z * 2 + rnorm(N, sd = 0.2)
#' Y <- Z * 2 + rnorm(N, sd = 0.2)
#' res <- computeThreePointInfo(X, Y, Z)
#' message("I(X;Y;Z) = ", res$i3)
#' message("Ik(X;Y;Z) = ", res$i3k)
#'
#' \donttest{
#' # Independence, conditional dependence : X -> Z <- Y
#' X <- runif(N)
#' Y <- runif(N)
#' Z <- X + Y + rnorm(N, sd = 0.1)
#' res <- computeThreePointInfo(X, Y, Z)
#' message("I(X;Y;Z) = ", res$i3)
#' message("Ik(X;Y;Z) = ", res$i3k)
#' }
#-------------------------------------------------------------------------------
computeThreePointInfo <- function(x, y, z,
                              df_conditioning = NULL,
                              maxbins = NULL,
                              cplx = c("nml", "bic"),
                              n_eff = -1,
                              sample_weights = NULL,
                              is_continuous = NULL) {
  cplx <- tryCatch(
    {match.arg(cplx)},
    error = function(e) {
      if (grepl("object .* not found", e$message)) {
        message(e, "")
        return("")
      }
      return(toString(cplx))
    }
  )
  cplx <- match.arg(cplx)

  input_data = data.frame(x, y, z)
  if (!is.null(df_conditioning)) {
    input_data <- data.frame(input_data, df_conditioning)
  }

  if (!is.null(sample_weights) && length(sample_weights) != nrow(input_data)) {
    stop(paste(
      "Differing number of rows between `sample_weights` and input data:",
      length(sample_weights),
      length(x)
    ))
  }

  complete_row <- stats::complete.cases(input_data)
  n_rows_na <- sum(!complete_row)
  if (n_rows_na > 0) {
    input_data <- input_data[complete_row, ]
    warning(paste0(
      "Removed ", n_rows_na, " rows containing at least one NA value."
    ))
  }

  n_samples <- nrow(input_data)
  n_nodes <- ncol(input_data)

  if (n_samples < 3) {
    stop(paste0("Insufficient number of complete rows: ", nrow(input_data)))
  }

  if (is.null(is_continuous)) {
    is_continuous <- sapply(input_data, is.numeric)
  } else if (length(is_continuous) != n_nodes) {
    stop(paste(
      "Length of `is_continuous` does not match number of input variables:",
      length(is_continuous),
      n_nodes
    ))
  }

  # Numeric factor matrix, level starts from 0
  input_factor <- as.matrix(apply(input_data, 2,
    function(x) (as.numeric(factor(x, levels = unique(x))) - 1))
  )
  max_level_list <- as.numeric(apply(input_factor, 2, max)) + 1
  # Data list, numeric for continuous columns, -1 for discrete columns
  input_double <- matrix(nrow = n_samples, ncol = n_nodes)
  # Order list, order(column) for continuous columns (index starting from 0),
  # -1 for discrete columns
  input_order <- matrix(nrow = n_samples, ncol = n_nodes)
  for (i in c(1: n_nodes)) {
    if (is_continuous[i]) {
      input_double[, i] <- as.numeric(input_data[, i])
      input_order[, i] <- order(input_data[, i], na.last=NA) - 1
    } else {
      input_double[, i] <- rep_len(-1, n_samples)
      input_order[, i] <- rep_len(-1, n_samples)
    }
  }

  arg_list <- list(
    "cplx" = cplx,
    "is_continuous" = is_continuous,
    "levels" = max_level_list,
    "n_eff" = n_eff,
    "n_nodes" = n_nodes,
    "n_samples" = n_samples
  )
  # Continuous variables will be discretized during the computation
  if (any(is_continuous)) {
    initbins <- min(30, round(n_samples**(1 / 3)))
    if (is.null(maxbins) || maxbins > n_samples || maxbins < initbins) {
      maxbins <- min(n_samples, 5 * initbins, 50)
    }
    arg_list[["max_bins"]] <- maxbins
  }
  if (!is.null(sample_weights)) {
    arg_list[["sample_weights"]] <- sample_weights[complete_row, ]
  }
  cpp_input <- list(
    "factor" = as.vector(input_factor),
    "double" = as.vector(input_double),
    "order" = as.vector(input_order)
  )
  # Call cpp code
  rescpp <- miicRGetInfo3Point(cpp_input, arg_list)

  return(rescpp)
}

grid_plot <- function(X, Y, nameDist1, nameDist2) {
  plot_df <- data.frame(table(X, Y), stringAsFactors = TRUE)
  hist2d <- ggplot2::ggplot(plot_df, ggplot2::aes(x=X, y=Y)) +
    ggplot2::geom_tile(
      ggplot2::aes(fill=plot_df$Freq),
      show.legend = FALSE
    ) +
    ggplot2::scale_fill_gradient2(
      low = "#f4f5fc",
      high = "#0013a3",
      position = "left"
    ) +
    ggplot2::xlab(nameDist1) + ggplot2::ylab(nameDist2) +
    ggplot2::theme_classic()

  g <- ggplot2::ggplot_build(hist2d)
  labels <- g$layout$panel_params[[1]]$y$get_labels()
  labels <- labels[labels != "NA"]

  side_hist_top <- ggplot2::ggplot(data.frame(X), ggplot2::aes(x = X)) +
    ggplot2::geom_bar(color="black", fill="white") +
    theme_side_hist() +
    ggplot2::theme(
      plot.margin = ggplot2::margin(
      5.5, 5.5, -30, 5.5, "pt")) +
    ggplot2::scale_y_continuous(
      labels = labels, # Pass hist2d's labels to align cutpoints on X axis
      breaks = seq(0, 0.1, length.out = length(labels))
    ) +
    ggplot2::ylab("X")

  side_hist_right <- ggplot2::ggplot(data.frame(Y), ggplot2::aes(x = Y)) +
    ggplot2::geom_bar(color="black", fill="white") +
    theme_side_hist() +
    ggplot2::theme(
      plot.margin = ggplot2::margin(
      5.5, 5.5, 5.5, -30, "pt")) +
    ggplot2::scale_y_continuous(expand = c(0, 0)) +
    ggplot2::ylab("Y") +
    ggplot2::coord_flip()

  empty <- ggplot2::ggplot() +
    ggplot2::geom_point(ggplot2::aes(1, 1), colour = "white") +
    theme_side_hist()

  return(
    gridExtra::grid.arrange(
      side_hist_top,
      empty,
      hist2d,
      side_hist_right,
      ncol = 2,
      nrow = 2,
      widths = c(4.2, 1),
      heights = c(1, 4.2)
    )
  )
}