PRScc: Proximal Gradient Smoothing Sparse Convex Clustering

Documented in Ann4Cc CCDataSim CCWeight CFinder CMetric NearestNeighbour NN2 OptProxCC PostKNNByOutlier ProxSCC

### ProxSCC(Proximal Gradient Smoothing Sparse Convex Clustering) solver
##' @title ProxSCC
##' @description Implementing the ProxSCC algorithm for Convex Clustering
##'
##' @param X Data matrix to be clustered. The rows are features, and the columns are the samples
##' @param U Initial of clustering examplers. The rows are features, and the columns are the samples. Default is NULL
##' @param Lambda A regularization parameter for cluster number within penalty term Lambda * \code{w[k]} * |U_{,i} - U_{,j}|_2
##' @param Gamma  A regularization parameter the number of nonzero features within penalty term Gamma * \code{r[k]} * |U_{k,}|_2
##' @param w A vector of nonegative weights. \code{w[k]} denotes the weight for the k-th pair of examplers.
##' @param r The adaptive group lasso's weights for feature sparse penalty.
##' @param p Rows of matrix \code{X}. nrow(X) (Default).
##' @param n Columns of matrix \code{X}. ncol(X) (Default).
##' @param tol The convergence threshold. Default 1e-5.
##' @param maxit The maximum iterations need for algorithms. Default 500.
##' @param verbose False as default.
##'
##' @return
##' \describe{
##' \item{U}{ Centroid matrix of ProxSCC}
##' \item{iters}{ number of iterations}
##' }
##'
##' @export
ProxSCC = function(X = NULL,
                   U = NULL,
                   Lambda,
                   Gamma,
                   w,
                   r,
                   p = nrow(X),
                   n = ncol(X),
                   tol = 1e-5,
                   maxit = 500,
                   verbose = FALSE) {
  if (is.null(X)) {
    stop("Data matrix X should not be NULL...")
  }
}


#' @title CFinder
#' @param U Matrix of Convex Clustering Output
#' @return L Vector of labels
#' @export
CFinder = function(U = NULL, r = 1e-2) {
  L = NULL
  C = NULL
  m = 1
  for (i in 1:ncol(U)) {
    if (m == 1 && i == 1) {
      C = U[, i, drop = FALSE]
      L = c(L, m)
    } else {
      D = apply((C - U[, i])^2, 2, sum)
      j = which(D < r)
      if (length(j) != 0) {
        L = c(L, min(j))
      } else {
        m = m + 1
        C = cbind(C, U[, i, drop = FALSE])
        L = c(L, m)
      }
    }
  }
  list(Center = C, Labels = L)
}


##' @title CMetric
##' @param U Matrix of Convex Clustering Output
##' @param C Cluster assignment for each observation
##' @param labels Labels of ground truth
##' @return
##' RandIndex
##' NMI
##' @importFrom mclust adjustedRandIndex
##' @importFrom NMI NMI
##' @importFrom Matrix Matrix
##' @export
CMetric = function(U, C, Labels, Feature, threshold) {
  RandIndex = adjustedRandIndex(C$Labels, Labels)
  N = seq(1, ncol(U))
  NMI = NMI(data.frame(N, C$Labels), data.frame(N, Labels))
  estFeature = (apply(C$Center**2, 1, sum) > threshold)
  esTD = table(Feature, estFeature)
  TPR = esTD[2, 2] / sum(Feature != 0)
  FPR = esTD[1, 2] / sum(Feature == 0)
  FNR = esTD[2, 1] / sum(Feature != 0)
  TNR = esTD[1, 1] / sum(Feature == 0)
  eMCC = MCC(TPR, FPR, TNR, FNR)
  FDR = esTD[1, 2] / sum(estFeature != 0)
  list(MeasLabel = c(RandIndex = RandIndex, NMI = NMI),
       MeasFeatr = c(MCC = eMCC, TPR = TPR, FDR = FDR, FNR = FNR, TNR = TNR)
  )
}


MCC = function(TPR, FPR, TNR, FNR) {
  P = log2(TPR + FPR) + log2(TPR + FNR) + log(TNR + FPR) + log2(TNR + FNR)
  (TPR * TNR - FPR * FNR) / sqrt(2**P)
}


##' @title Ann4Cc
##' @param X Data matrix to be clustered. The rows are features, and the columns are the samples
##' @param k First k nearest neighbours of X
##' @return list of kNN
##' @importFrom RcppHNSW HnswL2
##' @export
Ann4Cc = function(X, k = 1, W) {
  n  = nrow(X)
  p  = ncol(X)
  M  = 16
  e  = 100
  H  =  new(HnswL2, p, n, M, e)
  H$addItems(X)
  N = H$getAllNNs(X, k = k + 1)
  i = N[,1]
  j = N[,-1]
  Z = unique((j - i + (n - i / 2) * (i - 1))[j > i])
  W[-Z] = 0
  list(N = N, W = W, Z = sort(Z))
}


##' @title CCWeight
##' @param X Data matrix to be clustered. The rows are features, and the columns are the samples
##' @param threads Number threads used for weight intialization
##' @export
CCWeight = function(X, threads = 4) {
  Xnorm = colSums(X**2)[1]
  CCWeight_Eigen(X, -1.0 * median(Xnorm), threads)
}


##' @title CCDataSim
##' @param C Number of clusters to be simulated (N)
##' @param r Radius of the circle which generate clusters
##' @param vars Standard deviation of each cluster
##' @param p Number of features of data
##' @param N Number of samples in each clusters
##' @param sparse Ratio of nonzero features of data
##' @param s Variance of nonzero feature
##' @param snr Signal-Noise-Ratio for zero feature
##' @import ggplot2
##' @export
CCDataSim = function(C = 4, r = 6, vars = c(0.5, 0.5, 0.5, 0.5), p = 200, N = 100, alphas = NULL, sparse = 0.1, s = 0.1, snr = 2, show = TRUE) {
  if (length(N) != C) {
    N = rep(N %/% C, C)
  }
  ## alphas
  if (is.null(alphas)) {
    alphas = rep(1, C)
  }
  ## initialize cluster center
  mat = matrix(nrow = C, ncol = 2)
  i = 1
  ## evenly distribute clusters on a circle (2D)
  for (k in seq(0, C - 1)) {
    x = cos(2 * pi * k / C) * r
    y = sin(2 * pi * k / C) * r
    mat[i, 1] = x * alphas[i]
    mat[i, 2] = y * alphas[i]
    i = i + 1
  }
  ## generate samples
  samplematrix = matrix(ncol = 2, nrow = 0)
  ## labels of data
  labels = vector("numeric", sum(N))
  c = 1
  for (j in seq(1, nrow(mat))) {
    Ns = N[j]
    sd = vars[j]
    for (sample in seq(1, Ns)) {
      newsample = mat[j, ] + rnorm(2, mean = 2, sd = sd)   ## sample nearby the center
      samplematrix = rbind(samplematrix, newsample)
      labels[c] = j
      c = c + 1
    }
  }
  np = p * sparse
  ## generate orthogonal vectors
  x  = rnorm(np, mean = 0, sd = s)
  yz = rnorm(np, mean = 0, sd = s)
  y  = yz - crossprod(yz, x)[1] * x / sum(x**2)
  res = matrix(nrow = nrow(samplematrix), ncol = p)
  for (i in seq(1, nrow(samplematrix), 1)) {
    a = samplematrix[i, 1]
    b = samplematrix[i, 2]
    sample = c(a * x[1:np] + b * y[1:np], rep(0, p - np))
    z = rnorm(p, mean = 0, sd = s / snr)
    sample = sample + z
    res[i, ] <- sample
  }
  data = res  ## n x p
  pca  = prcomp(data)
  scores = data.frame(pca$x)
  if (show) {
    plot <- ggplot(data = scores,
                aes_string(
                  x = "PC1",
                  y = "PC2",
                  colour = as.factor(labels)
                )) +
      geom_point(size = 3) + theme_bw() + theme(
        legend.position = "none",
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.text.y = element_text(size = 12,
                                   colour = "black"),
        axis.text.x = element_text(size = 12,
                                   colour = "black"),
        axis.title.x = element_text(size = 12),
        axis.title.y = element_text(size = 12)
      )
    print(plot)
  }
  list(data = data, labels = labels, features = c(rep(TRUE, np), rep(FALSE, p - np)))
}



##' @title NearestNeighbour
##' @param D List of distance
##' @param n Number of samples
##' @param type KNN or Epsilon-NN
##' @param k Average number of nearest neighbour
##' @return list
##'         Z index of non-zero entries in W
##'         W weight matrix with filtering
##' @export
NearestNeighbour = function(D, n, type = c("knn", "ann"), k = 5) {
  type = match.arg(type)
  IND = do.call(rbind, lapply(D$K, function(k) {
    c(k %/% n, k %% n) + 1
  }))
  if (type == "knn") {
    ix = NULL
    for(i in 1:n) {
      j = which(IND[,1] == i | IND[,2] == i)
      ix = c(ix, j[sort(D$D2[j], decreasing = FALSE, index.return = TRUE)$ix[1:k]])
    }
    ix = unique(ix)
  } else {
    Q = k * n / 2
    ix = which(D$D2 <= sort(D$D2, decreasing = FALSE)[Q])
  }
  W = D$W
  W[-ix] = 0
  list(Z = ix, W = W, D2 = D$D2, IND = IND)
}

##' @title PostKNNByOutlier
##' @param Z List of KNN
##' @param D List of distance
##' @param q epsilon for outlier cutoff. Default = 0.5
##' @return list
##'         Z index of non-zero entries in W
##'         W weight matrix with filtering
##' @export
PostKNNByOutlier = function(Z, D, q =  0.5) {
  t = quantile(D$W, seq(0, 1, length.out = 11))[q * 10 + 1]
  ix = (Z$W[Z$Z] >= t)     # weight kept
  jx = (Z$W[Z$Z] < t)      # weight remove
  ZZ = Z$Z[ix]
  W = Z$W
  W[Z$Z[jx]] = 0
  list(Z = ZZ, W = W, D2 = D$D2)
}


##' @title OptProxCC
##' @description If lambda = 0, ProxCC gives us n clusters, and if lambda -> lambda_max, ProxCC merges all samples into
##'              one cluster
##' @param X Data matrix to be clustered. The rows are features, and the columns are the samples
##' @param U Initial of clustering examplers. The rows are features, and the columns are the samples. Default is NULL
##' @param Z Prefiltered weighted by K nearest neighbouring or ANN
##' @param nLambda A regularization parameter for cluster number within penalty term Lambda * \code{w[k]} * |U_{,i} - U_{,j}|_2
##' @param Gamma  A regularization parameter the number of nonzero features within penalty term Gamma * \code{r[k]} * |U_{k,}|_2
##' @param R The adaptive group lasso's weights for feature sparse penalty.
##' @param p Number of features
##' @param n Number of observations
##' @param tol tolerance for convergence
##' @param maxit max iterations
##' @param threads Number of Cpu threads for convex clustering used in OpenMP
##' @param verbose print details or not
##' @param warm Warm start scheme or not. Default FALSE
##' @param metric Optimize scheme for hyper-parameter. eBIC, Silhouette
##' @export
OptProxCC = function(X, U, Z, nLambda = 10, Gamma, p, n, R, tol, maxit, threads, verbose = TRUE, warm = FALSE, metric = c("eBIC", "Silhouette")) {
  metric = match.arg(metric)
  M = CCWeight(X, threads, type = "median")
  np = n * p
  LambdaR = n * sqrt(M$D) / M$W
  Lambda_min = min(LambdaR)
  Lambda_max = max(LambdaR)
  Lambdas = c(0, 10**seq(0, log10(Lambda_max / Lambda_min), length.out = nLambda) * Lambda_min)
  U = X * 0
  fit = list()
  vs  = NULL
  for (i in 1:length(Lambdas)) {
    if (verbose) {
      cat("Lambda = ", Lambdas[i], "... \n")
    }
    C = ProxSNN_Eigen(X = X, U = U, Z = Z$Z, Lambda = Lambdas[i], Gamma = Gamma, p = p, n = n, m = length(Z$Z), W = Z$W, R = R, tol = tol, maxit = maxit, verbose = verbose, threads = threads)
    fit[[i]] = C$U
    if (warm) {
      U = fit[[i]]
    }
    P = ProxSCC_CFinder(C$U, tol = tol)
    if (metric == "eBIC") {
      df  = max(P$A)
      RSS = norm(X - C$U, "F")**2
      eBIC = function(g) {
        np * log(RSS / np) + (2 * g + 1) + df * log(np)
      }
      vs = rbind(vs, c(Lambdas[i], sapply(seq(0, 1, by = 0.1), eBIC)))
    } else if (metric == "Silhouette") {
      CA = which(table(P$A) > 1)
      sil = sapply(which(P$A %in% CA), function(i) {
        c = which(P$A == P$A[i])
        a = sum(sqrt(Z$D2)[(Z$IND[,1] %in% c & Z$IND[,2] == i) | (Z$IND[,1] == i & Z$IND[,2] %in% c)]) / (length(c) - 1)
        b = NULL
        for (l in setdiff(unique(P$A), P$A[i])) {
          c = which(P$A == l)
          b = c(b, sum(sqrt(Z$D2)[(Z$IND[,1] %in% c & Z$IND[,2] == i) | (Z$IND[,1] == i & Z$IND[,2] %in% c)]) / length(c))
        }
        b = min(b)
        (b - a) / max(a, b)
      })
      vs = rbind(vs, c(Lambdas[i], mean(sil)))
    }
  }
  if (metric == "eBIC") {
    ix = which.min(vs[, 7])
    lambda.opt = Lambdas[ix]
  } else {
    ix = which.max(vs[, 2])
    lambda.opt = Lambdas[ix]
  }
  list(lambda = lambda.opt, gamma = Gamma, metric = vs, fit = fit[[ix]])
}




##' @title NN2
##' @param D List of distance
##' @param n Number of samples
##' @param type KNN or Epsilon-NN
##' @param k Average number of nearest neighbour
##' @param X raw obvserved Data matrix
##' @return list
##'         Z index of non-zero entries in W
##'         W weight matrix with filtering
##' @export
##' @importFrom FNN get.knn
NN2 = function(D, n, type = "knn", k = 50, X) {
  II = D$K %/% n
  IJ = D$K %% n
  IND = cbind(II, IJ) + 1
  NNi = get.knn(X, k = k)$nn.index
  ix = lapply(1:n, function(i) {
    j = NNi[i, ]
    r = (i < j)
    r * ((n - i / 2) * (i - 1) + (j - i)) + (1 - r) * ((n - j / 2) * (j - 1) + (i - j))
  })
  ix = unique(unlist(ix))
  W = D$W
  W[-ix] = 0
  list(Z = ix, W = W, D2 = D$D2, IND = IND)
}

Ivis4ml/PRScc documentation built on June 4, 2020, 9:19 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

Ivis4ml/PRScc
Proximal Gradient Smoothing Sparse Convex Clustering

R/PRSCC-methods.R
In Ivis4ml/PRScc: Proximal Gradient Smoothing Sparse Convex Clustering

Defines functions NN2 OptProxCC PostKNNByOutlier NearestNeighbour CCDataSim CCWeight Ann4Cc MCC CMetric CFinder ProxSCC

Documented in Ann4Cc CCDataSim CCWeight CFinder CMetric NearestNeighbour NN2 OptProxCC PostKNNByOutlier ProxSCC

R Package Documentation

Browse R Packages

We want your feedback!

Ivis4ml/PRScc Proximal Gradient Smoothing Sparse Convex Clustering

R/PRSCC-methods.R In Ivis4ml/PRScc: Proximal Gradient Smoothing Sparse Convex Clustering

Defines functions NN2 OptProxCC PostKNNByOutlier NearestNeighbour CCDataSim CCWeight Ann4Cc MCC CMetric CFinder ProxSCC

Documented in Ann4Cc CCDataSim CCWeight CFinder CMetric NearestNeighbour NN2 OptProxCC PostKNNByOutlier ProxSCC

R Package Documentation

Browse R Packages

We want your feedback!

Ivis4ml/PRScc
Proximal Gradient Smoothing Sparse Convex Clustering

R/PRSCC-methods.R
In Ivis4ml/PRScc: Proximal Gradient Smoothing Sparse Convex Clustering