R/PRSCC-methods.R

Defines functions NN2 OptProxCC PostKNNByOutlier NearestNeighbour CCDataSim CCWeight Ann4Cc MCC CMetric CFinder ProxSCC

Documented in Ann4Cc CCDataSim CCWeight CFinder CMetric NearestNeighbour NN2 OptProxCC PostKNNByOutlier ProxSCC

### ProxSCC(Proximal Gradient Smoothing Sparse Convex Clustering) solver
##' @title ProxSCC
##' @description Implementing the ProxSCC algorithm for Convex Clustering
##'
##' @param X Data matrix to be clustered. The rows are features, and the columns are the samples
##' @param U Initial of clustering examplers. The rows are features, and the columns are the samples. Default is NULL
##' @param Lambda A regularization parameter for cluster number within penalty term Lambda * \code{w[k]} * |U_{,i} - U_{,j}|_2
##' @param Gamma  A regularization parameter the number of nonzero features within penalty term Gamma * \code{r[k]} * |U_{k,}|_2
##' @param w A vector of nonegative weights. \code{w[k]} denotes the weight for the k-th pair of examplers.
##' @param r The adaptive group lasso's weights for feature sparse penalty.
##' @param p Rows of matrix \code{X}. nrow(X) (Default).
##' @param n Columns of matrix \code{X}. ncol(X) (Default).
##' @param tol The convergence threshold. Default 1e-5.
##' @param maxit The maximum iterations need for algorithms. Default 500.
##' @param verbose False as default.
##'
##' @return
##' \describe{
##' \item{U}{ Centroid matrix of ProxSCC}
##' \item{iters}{ number of iterations}
##' }
##'
##' @export
ProxSCC = function(X = NULL,
                   U = NULL,
                   Lambda,
                   Gamma,
                   w,
                   r,
                   p = nrow(X),
                   n = ncol(X),
                   tol = 1e-5,
                   maxit = 500,
                   verbose = FALSE) {
  if (is.null(X)) {
    stop("Data matrix X should not be NULL...")
  }
}


#' @title CFinder
#' @param U Matrix of Convex Clustering Output
#' @return L Vector of labels
#' @export
CFinder = function(U = NULL, r = 1e-2) {
  L = NULL
  C = NULL
  m = 1
  for (i in 1:ncol(U)) {
    if (m == 1 && i == 1) {
      C = U[, i, drop = FALSE]
      L = c(L, m)
    } else {
      D = apply((C - U[, i])^2, 2, sum)
      j = which(D < r)
      if (length(j) != 0) {
        L = c(L, min(j))
      } else {
        m = m + 1
        C = cbind(C, U[, i, drop = FALSE])
        L = c(L, m)
      }
    }
  }
  list(Center = C, Labels = L)
}


##' @title CMetric
##' @param U Matrix of Convex Clustering Output
##' @param C Cluster assignment for each observation
##' @param labels Labels of ground truth
##' @return
##' RandIndex
##' NMI
##' @importFrom mclust adjustedRandIndex
##' @importFrom NMI NMI
##' @importFrom Matrix Matrix
##' @export
CMetric = function(U, C, Labels, Feature, threshold) {
  RandIndex = adjustedRandIndex(C$Labels, Labels)
  N = seq(1, ncol(U))
  NMI = NMI(data.frame(N, C$Labels), data.frame(N, Labels))
  estFeature = (apply(C$Center**2, 1, sum) > threshold)
  esTD = table(Feature, estFeature)
  TPR = esTD[2, 2] / sum(Feature != 0)
  FPR = esTD[1, 2] / sum(Feature == 0)
  FNR = esTD[2, 1] / sum(Feature != 0)
  TNR = esTD[1, 1] / sum(Feature == 0)
  eMCC = MCC(TPR, FPR, TNR, FNR)
  FDR = esTD[1, 2] / sum(estFeature != 0)
  list(MeasLabel = c(RandIndex = RandIndex, NMI = NMI),
       MeasFeatr = c(MCC = eMCC, TPR = TPR, FDR = FDR, FNR = FNR, TNR = TNR)
  )
}


MCC = function(TPR, FPR, TNR, FNR) {
  P = log2(TPR + FPR) + log2(TPR + FNR) + log(TNR + FPR) + log2(TNR + FNR)
  (TPR * TNR - FPR * FNR) / sqrt(2**P)
}


##' @title Ann4Cc
##' @param X Data matrix to be clustered. The rows are features, and the columns are the samples
##' @param k First k nearest neighbours of X
##' @return list of kNN
##' @importFrom RcppHNSW HnswL2
##' @export
Ann4Cc = function(X, k = 1, W) {
  n  = nrow(X)
  p  = ncol(X)
  M  = 16
  e  = 100
  H  =  new(HnswL2, p, n, M, e)
  H$addItems(X)
  N = H$getAllNNs(X, k = k + 1)
  i = N[,1]
  j = N[,-1]
  Z = unique((j - i + (n - i / 2) * (i - 1))[j > i])
  W[-Z] = 0
  list(N = N, W = W, Z = sort(Z))
}


##' @title CCWeight
##' @param X Data matrix to be clustered. The rows are features, and the columns are the samples
##' @param threads Number threads used for weight intialization
##' @export
CCWeight = function(X, threads = 4) {
  Xnorm = colSums(X**2)[1]
  CCWeight_Eigen(X, -1.0 * median(Xnorm), threads)
}


##' @title CCDataSim
##' @param C Number of clusters to be simulated (N)
##' @param r Radius of the circle which generate clusters
##' @param vars Standard deviation of each cluster
##' @param p Number of features of data
##' @param N Number of samples in each clusters
##' @param sparse Ratio of nonzero features of data
##' @param s Variance of nonzero feature
##' @param snr Signal-Noise-Ratio for zero feature
##' @import ggplot2
##' @export
CCDataSim = function(C = 4, r = 6, vars = c(0.5, 0.5, 0.5, 0.5), p = 200, N = 100, alphas = NULL, sparse = 0.1, s = 0.1, snr = 2, show = TRUE) {
  if (length(N) != C) {
    N = rep(N %/% C, C)
  }
  ## alphas
  if (is.null(alphas)) {
    alphas = rep(1, C)
  }
  ## initialize cluster center
  mat = matrix(nrow = C, ncol = 2)
  i = 1
  ## evenly distribute clusters on a circle (2D)
  for (k in seq(0, C - 1)) {
    x = cos(2 * pi * k / C) * r
    y = sin(2 * pi * k / C) * r
    mat[i, 1] = x * alphas[i]
    mat[i, 2] = y * alphas[i]
    i = i + 1
  }
  ## generate samples
  samplematrix = matrix(ncol = 2, nrow = 0)
  ## labels of data
  labels = vector("numeric", sum(N))
  c = 1
  for (j in seq(1, nrow(mat))) {
    Ns = N[j]
    sd = vars[j]
    for (sample in seq(1, Ns)) {
      newsample = mat[j, ] + rnorm(2, mean = 2, sd = sd)   ## sample nearby the center
      samplematrix = rbind(samplematrix, newsample)
      labels[c] = j
      c = c + 1
    }
  }
  np = p * sparse
  ## generate orthogonal vectors
  x  = rnorm(np, mean = 0, sd = s)
  yz = rnorm(np, mean = 0, sd = s)
  y  = yz - crossprod(yz, x)[1] * x / sum(x**2)
  res = matrix(nrow = nrow(samplematrix), ncol = p)
  for (i in seq(1, nrow(samplematrix), 1)) {
    a = samplematrix[i, 1]
    b = samplematrix[i, 2]
    sample = c(a * x[1:np] + b * y[1:np], rep(0, p - np))
    z = rnorm(p, mean = 0, sd = s / snr)
    sample = sample + z
    res[i, ] <- sample
  }
  data = res  ## n x p
  pca  = prcomp(data)
  scores = data.frame(pca$x)
  if (show) {
    plot <- ggplot(data = scores,
                aes_string(
                  x = "PC1",
                  y = "PC2",
                  colour = as.factor(labels)
                )) +
      geom_point(size = 3) + theme_bw() + theme(
        legend.position = "none",
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.text.y = element_text(size = 12,
                                   colour = "black"),
        axis.text.x = element_text(size = 12,
                                   colour = "black"),
        axis.title.x = element_text(size = 12),
        axis.title.y = element_text(size = 12)
      )
    print(plot)
  }
  list(data = data, labels = labels, features = c(rep(TRUE, np), rep(FALSE, p - np)))
}



##' @title NearestNeighbour
##' @param D List of distance
##' @param n Number of samples
##' @param type KNN or Epsilon-NN
##' @param k Average number of nearest neighbour
##' @return list
##'         Z index of non-zero entries in W
##'         W weight matrix with filtering
##' @export
NearestNeighbour = function(D, n, type = c("knn", "ann"), k = 5) {
  type = match.arg(type)
  IND = do.call(rbind, lapply(D$K, function(k) {
    c(k %/% n, k %% n) + 1
  }))
  if (type == "knn") {
    ix = NULL
    for(i in 1:n) {
      j = which(IND[,1] == i | IND[,2] == i)
      ix = c(ix, j[sort(D$D2[j], decreasing = FALSE, index.return = TRUE)$ix[1:k]])
    }
    ix = unique(ix)
  } else {
    Q = k * n / 2
    ix = which(D$D2 <= sort(D$D2, decreasing = FALSE)[Q])
  }
  W = D$W
  W[-ix] = 0
  list(Z = ix, W = W, D2 = D$D2, IND = IND)
}

##' @title PostKNNByOutlier
##' @param Z List of KNN
##' @param D List of distance
##' @param q epsilon for outlier cutoff. Default = 0.5
##' @return list
##'         Z index of non-zero entries in W
##'         W weight matrix with filtering
##' @export
PostKNNByOutlier = function(Z, D, q =  0.5) {
  t = quantile(D$W, seq(0, 1, length.out = 11))[q * 10 + 1]
  ix = (Z$W[Z$Z] >= t)     # weight kept
  jx = (Z$W[Z$Z] < t)      # weight remove
  ZZ = Z$Z[ix]
  W = Z$W
  W[Z$Z[jx]] = 0
  list(Z = ZZ, W = W, D2 = D$D2)
}


##' @title OptProxCC
##' @description If lambda = 0, ProxCC gives us n clusters, and if lambda -> lambda_max, ProxCC merges all samples into
##'              one cluster
##' @param X Data matrix to be clustered. The rows are features, and the columns are the samples
##' @param U Initial of clustering examplers. The rows are features, and the columns are the samples. Default is NULL
##' @param Z Prefiltered weighted by K nearest neighbouring or ANN
##' @param nLambda A regularization parameter for cluster number within penalty term Lambda * \code{w[k]} * |U_{,i} - U_{,j}|_2
##' @param Gamma  A regularization parameter the number of nonzero features within penalty term Gamma * \code{r[k]} * |U_{k,}|_2
##' @param R The adaptive group lasso's weights for feature sparse penalty.
##' @param p Number of features
##' @param n Number of observations
##' @param tol tolerance for convergence
##' @param maxit max iterations
##' @param threads Number of Cpu threads for convex clustering used in OpenMP
##' @param verbose print details or not
##' @param warm Warm start scheme or not. Default FALSE
##' @param metric Optimize scheme for hyper-parameter. eBIC, Silhouette
##' @export
OptProxCC = function(X, U, Z, nLambda = 10, Gamma, p, n, R, tol, maxit, threads, verbose = TRUE, warm = FALSE, metric = c("eBIC", "Silhouette")) {
  metric = match.arg(metric)
  M = CCWeight(X, threads, type = "median")
  np = n * p
  LambdaR = n * sqrt(M$D) / M$W
  Lambda_min = min(LambdaR)
  Lambda_max = max(LambdaR)
  Lambdas = c(0, 10**seq(0, log10(Lambda_max / Lambda_min), length.out = nLambda) * Lambda_min)
  U = X * 0
  fit = list()
  vs  = NULL
  for (i in 1:length(Lambdas)) {
    if (verbose) {
      cat("Lambda = ", Lambdas[i], "... \n")
    }
    C = ProxSNN_Eigen(X = X, U = U, Z = Z$Z, Lambda = Lambdas[i], Gamma = Gamma, p = p, n = n, m = length(Z$Z), W = Z$W, R = R, tol = tol, maxit = maxit, verbose = verbose, threads = threads)
    fit[[i]] = C$U
    if (warm) {
      U = fit[[i]]
    }
    P = ProxSCC_CFinder(C$U, tol = tol)
    if (metric == "eBIC") {
      df  = max(P$A)
      RSS = norm(X - C$U, "F")**2
      eBIC = function(g) {
        np * log(RSS / np) + (2 * g + 1) + df * log(np)
      }
      vs = rbind(vs, c(Lambdas[i], sapply(seq(0, 1, by = 0.1), eBIC)))
    } else if (metric == "Silhouette") {
      CA = which(table(P$A) > 1)
      sil = sapply(which(P$A %in% CA), function(i) {
        c = which(P$A == P$A[i])
        a = sum(sqrt(Z$D2)[(Z$IND[,1] %in% c & Z$IND[,2] == i) | (Z$IND[,1] == i & Z$IND[,2] %in% c)]) / (length(c) - 1)
        b = NULL
        for (l in setdiff(unique(P$A), P$A[i])) {
          c = which(P$A == l)
          b = c(b, sum(sqrt(Z$D2)[(Z$IND[,1] %in% c & Z$IND[,2] == i) | (Z$IND[,1] == i & Z$IND[,2] %in% c)]) / length(c))
        }
        b = min(b)
        (b - a) / max(a, b)
      })
      vs = rbind(vs, c(Lambdas[i], mean(sil)))
    }
  }
  if (metric == "eBIC") {
    ix = which.min(vs[, 7])
    lambda.opt = Lambdas[ix]
  } else {
    ix = which.max(vs[, 2])
    lambda.opt = Lambdas[ix]
  }
  list(lambda = lambda.opt, gamma = Gamma, metric = vs, fit = fit[[ix]])
}




##' @title NN2
##' @param D List of distance
##' @param n Number of samples
##' @param type KNN or Epsilon-NN
##' @param k Average number of nearest neighbour
##' @param X raw obvserved Data matrix
##' @return list
##'         Z index of non-zero entries in W
##'         W weight matrix with filtering
##' @export
##' @importFrom FNN get.knn
NN2 = function(D, n, type = "knn", k = 50, X) {
  II = D$K %/% n
  IJ = D$K %% n
  IND = cbind(II, IJ) + 1
  NNi = get.knn(X, k = k)$nn.index
  ix = lapply(1:n, function(i) {
    j = NNi[i, ]
    r = (i < j)
    r * ((n - i / 2) * (i - 1) + (j - i)) + (1 - r) * ((n - j / 2) * (j - 1) + (i - j))
  })
  ix = unique(unlist(ix))
  W = D$W
  W[-ix] = 0
  list(Z = ix, W = W, D2 = D$D2, IND = IND)
}
Ivis4ml/PRScc documentation built on June 4, 2020, 9:19 a.m.