R/CommonFunctions.R
In CASMI: 'CASMI'-Based Functions

MIz.test <- function(feature, outcome) {

  # 'feature' is the vector of X
  # 'outcome' is the vector of Y
  # 'k1' and 'k2' are the corresponding number of categories
  # X and Y must have the same sample size

  # Considering NA in the feature
  XYtable <- table(feature, outcome)
  n <- sum(XYtable)

  k1 = length(table(feature)) # feature categories
  k2 = length(table(outcome)) # outcome categories

  test.stat = 2 * n * MI.z(XYtable) + (k1 - 1) * (k2 - 1)

  p.value = pchisq(test.stat, (k1 - 1) * (k2 - 1), lower.tail = F)

  return(p.value)
}

g_function <- function(x1, x2, x3) {
  # Return a 3*1 matrix
  vec = c(1 / x2, -(x1 - x3) / x2 ^ 2, -1 / x2)
  return(as.matrix(vec, nrow = 3, ncol = 1))
}

E.CI.smi <- function(XYtable, alpha) {

  # Input the frequency table between the feature and the outcome

  K1 = nrow(XYtable)
  K2 = ncol(XYtable)
  K = K1 * K2
  n = sum(XYtable)

  # Make sure XYtable[K1,K2] is not zero. If zero, switch.
  if (XYtable[K1, K2] == 0) {
    nonZeroIndex = which(XYtable != 0, arr.ind = T)
    i = nonZeroIndex[1, 1]
    j = nonZeroIndex[1, 2]

    tmp = XYtable[i, ]
    XYtable[i, ] = XYtable[K1, ]
    XYtable[K1, ] = tmp

    tmp = row.names(XYtable)[i]
    row.names(XYtable)[i] = row.names(XYtable)[K1]
    row.names(XYtable)[K1] = tmp

    tmp = XYtable[, j]
    XYtable[, j] = XYtable[, K2]
    XYtable[, K2] = tmp

    tmp = colnames(XYtable)[j]
    colnames(XYtable)[j] = colnames(XYtable)[K2]
    colnames(XYtable)[K2] = tmp
  }

  Xmargin = rowSums(XYtable)
  Ymargin = colSums(XYtable)

  # First row of matrix A
  pK1. = Xmargin[K1] / sum(Xmargin)
  derX = vector()
  for (i in 1:(K1 - 1)) {
    pi. =  Xmargin[i] / sum(Xmargin)
    derX[i] = log(pK1.) - log(pi.)
  }

  A1 = vector()
  for (i in 1:(K1 - 1)) {
    for (j in 1:K2) {
      A1 = c(A1, derX[i])
    }
  }
  for (i in ((K1 - 1) * K2 + 1):(K - 1)) {
    A1[i] = 0
  }

  # Second row of matrix A
  p.K2 = Ymargin[K2] / sum(Ymargin)
  derY = vector()
  for (i in 1:(K2 - 1)) {
    p.j = Ymargin[i] / sum(Ymargin)
    derY[i] = log(p.K2) - log(p.j)
  }

  A2 = vector()
  for (i in 1:K1) {
    for (j in 1:K2) {
      if (j == K2) {
        A2 = c(A2, 0)
      } else{
        A2 = c(A2, derY[j])
      }
    }
  }
  A2 = A2[-length(A2)]

  # Third row of matrix A
  pK = XYtable[K1, K2] / n
  A3 = vector()
  for (i in 1:K1) {
    for (j in 1:K2) {
      pk = XYtable[i, j] / n
      if (pk == 0) {
        A3 = c(A3, 0)
      } else{
        A3 = c(A3, log(pK) - log(pk))
      }
    }
  }
  A3 = A3[-length(A3)]

  A = as.matrix(rbind(A1, A2, A3))

  # The Sigma matrix
  p = c(t(XYtable)) / n
  Sigma = matrix(
    data = NA,
    nrow = K - 1,
    ncol = K - 1,
    byrow = TRUE
  )
  for (i in 1:(K - 1)) {
    for (j in 1:(K - 1)) {
      if (i == j) {
        Sigma[i, i] = p[i] * (1 - p[i])
      } else{
        Sigma[i, j] = -p[i] * p[j]
      }
    }
  }

  # Sigma hat
  HXhat = entropy.plugin(Xmargin)
  HYhat = entropy.plugin(Ymargin)
  HXYhat = entropy.plugin(XYtable)
  g = g_function(HXhat, HYhat, HXYhat)
  sigmaHat = sqrt(t(g) %*% A %*% Sigma %*% t(A) %*% g)

  # E in CI
  Zstat = qnorm(alpha / 2, lower.tail = FALSE) # 95% confidence level
  E = Zstat * (sigmaHat / sqrt(n))

  return(E)
}

ftable2xytable <- function(ftable) {
  # Convert an ftable to a regular XYtable where X are joint and Y is the outcome
  return(ftable[rowSums(ftable) != 0, ])
}

Kappas <- function(data, idxFeatures) {
  # Input data, index(es) of feature(s); outcome must be in the last column

  # ftable automatically handles NA values
  tmpIndexFnO = c(idxFeatures, ncol(data)) # Index of features and outcome
  ftable = ftable(table(data[tmpIndexFnO]))

  Xmargin <- rowSums(ftable)
  Ymargin <- colSums(ftable)
  n <- sum(ftable)

  kappa.z = MI.z(ftable) / Entropy.z(Ymargin) # Same as smi.z

  kappaStar = kappa.z * (1 - sum(Xmargin == 1L) / n)

  return(list(kappa.z = kappa.z, kappa.star = kappaStar))
}