### ProxSCC (Proximal Gradient Smoothing Sparse Convex Clustering) solver
##' @title ProxSCC
##' @description Implements the ProxSCC algorithm for sparse convex clustering.
##'
##' @param X Data matrix to be clustered. The rows are features, and the columns are the samples
##' @param U Initial clustering exemplars. The rows are features, and the columns are the samples. Default is NULL
##' @param Lambda A regularization parameter controlling the number of clusters via the penalty term Lambda * \code{w[k]} * |U_{,i} - U_{,j}|_2
##' @param Gamma A regularization parameter controlling the number of nonzero features via the penalty term Gamma * \code{r[k]} * |U_{k,}|_2
##' @param w A vector of nonnegative weights. \code{w[k]} denotes the weight for the k-th pair of exemplars.
##' @param r The adaptive group lasso weights for the feature-sparsity penalty.
##' @param p Number of rows of matrix \code{X}. Default nrow(X).
##' @param n Number of columns of matrix \code{X}. Default ncol(X).
##' @param tol The convergence threshold. Default 1e-5.
##' @param maxit The maximum number of iterations. Default 500.
##' @param verbose Print progress details. Default FALSE.
##'
##' @return
##' \describe{
##' \item{U}{Centroid matrix of ProxSCC}
##' \item{iters}{Number of iterations}
##' }
##'
##' @export
ProxSCC = function(X = NULL,
                   U = NULL,
                   Lambda,
                   Gamma,
                   w,
                   r,
                   p = nrow(X),
                   n = ncol(X),
                   tol = 1e-5,
                   maxit = 500,
                   verbose = FALSE) {
  if (is.null(X)) {
    stop("Data matrix X should not be NULL...")
  }
}
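## The body above only validates input; the solver iterations are not shown
## in this file. For orientation, here is a hedged pure-R sketch of the
## objective the documentation implies ProxSCC minimizes; `scc_objective` and
## the 2 x m `pairs` index matrix are illustrative assumptions, not package API.
scc_objective = function(X, U, Lambda, Gamma, w, r, pairs) {
  fidelity = 0.5 * sum((X - U)^2)  # squared-error data fidelity
  ## pairwise fusion penalty: sum_k Lambda * w[k] * |U_{,i(k)} - U_{,j(k)}|_2
  fusion = sum(w * sqrt(colSums((U[, pairs[1, ]] - U[, pairs[2, ]])^2)))
  ## feature-sparsity penalty: sum_k Gamma * r[k] * |U_{k,}|_2
  sparsity = sum(r * sqrt(rowSums(U^2)))
  fidelity + Lambda * fusion + Gamma * sparsity
}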
##' @title CFinder
##' @param U Matrix of convex clustering output (centroids as columns)
##' @param r Squared-distance threshold below which two centroids are merged. Default 1e-2.
##' @return A list with \code{Center} (matrix of distinct centroids) and \code{Labels} (cluster label for each column of \code{U})
##' @export
CFinder = function(U = NULL, r = 1e-2) {
  L = NULL  # labels, one per column of U
  C = NULL  # distinct centroids found so far
  m = 1
  for (i in 1:ncol(U)) {
    if (i == 1) {
      C = U[, i, drop = FALSE]
      L = c(L, m)
    } else {
      ## squared distance from column i to each centroid found so far
      D = apply((C - U[, i])^2, 2, sum)
      j = which(D < r)
      if (length(j) != 0) {
        L = c(L, min(j))
      } else {
        m = m + 1
        C = cbind(C, U[, i, drop = FALSE])
        L = c(L, m)
      }
    }
  }
  list(Center = C, Labels = L)
}
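## Hedged toy check (U would normally be solver output):
## U = cbind(matrix(0, 2, 3), matrix(1, 2, 2))  # two distinct centroids
## CFinder(U)$Labels  # 1 1 1 2 2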
##' @title CMetric
##' @param U Matrix of convex clustering output
##' @param C Output of \code{CFinder}: a list with \code{Center} and \code{Labels}
##' @param Labels Ground-truth labels
##' @param Feature Ground-truth indicator of informative (nonzero) features
##' @param threshold Squared-norm threshold above which a feature is called nonzero
##' @return A list with MeasLabel (adjusted Rand index and NMI) and
##' MeasFeatr (MCC, TPR, FDR, FNR, TNR for feature recovery)
##' @importFrom mclust adjustedRandIndex
##' @importFrom NMI NMI
##' @export
CMetric = function(U, C, Labels, Feature, threshold) {
  RandIndex = adjustedRandIndex(C$Labels, Labels)
  N = seq(1, ncol(U))
  NMI = NMI(data.frame(N, C$Labels), data.frame(N, Labels))
  ## features whose estimated centroid row has non-negligible norm
  estFeature = (apply(C$Center**2, 1, sum) > threshold)
  esTD = table(Feature, estFeature)
  TPR = esTD[2, 2] / sum(Feature != 0)
  FPR = esTD[1, 2] / sum(Feature == 0)
  FNR = esTD[2, 1] / sum(Feature != 0)
  TNR = esTD[1, 1] / sum(Feature == 0)
  eMCC = MCC(TPR, FPR, TNR, FNR)
  FDR = esTD[1, 2] / sum(estFeature != 0)
  list(MeasLabel = c(RandIndex = RandIndex, NMI = NMI),
       MeasFeatr = c(MCC = eMCC, TPR = TPR, FDR = FDR, FNR = FNR, TNR = TNR))
}
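## Hedged usage sketch (`fit` and the threshold value are illustrative):
## sim = CCDataSim(show = FALSE)
## cl  = CFinder(fit$U)  # fit$U: centroid matrix from the solver
## CMetric(fit$U, cl, sim$labels, sim$features, threshold = 1e-6)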
MCC = function(TPR, FPR, TNR, FNR) {
  ## sum of log2 terms = log2 of the denominator product (log space for stability)
  P = log2(TPR + FPR) + log2(TPR + FNR) + log2(TNR + FPR) + log2(TNR + FNR)
  (TPR * TNR - FPR * FNR) / sqrt(2**P)
}
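## The four log2 terms sum to log2 of the denominator product, so sqrt(2**P)
## recovers the usual MCC denominator. A quick sanity check:
## all.equal(MCC(0.8, 0.1, 0.9, 0.2),
##           (0.8 * 0.9 - 0.1 * 0.2) / sqrt(0.9 * 1.0 * 1.0 * 1.1))  # TRUE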
##' @title Ann4Cc
##' @param X Data matrix to be clustered. The rows are samples, and the columns are the features
##' @param k Number of nearest neighbours to keep per sample
##' @param W Vector of pairwise weights in upper-triangular order; entries outside the kNN graph are zeroed
##' @return A list with \code{N} (neighbour index matrix), \code{W} (filtered weights), and \code{Z} (indices of retained pairs)
##' @importFrom RcppHNSW HnswL2
##' @export
Ann4Cc = function(X, k = 1, W) {
  n = nrow(X)
  p = ncol(X)
  M = 16   # HNSW graph degree
  e = 100  # HNSW ef parameter
  H = new(HnswL2, p, n, M, e)
  H$addItems(X)
  ## k + 1 neighbours because each point returns itself first
  N = H$getAllNNs(X, k = k + 1)
  i = N[, 1]
  j = N[, -1]
  ## map each pair (i, j), i < j, to its upper-triangular linear index
  Z = unique((j - i + (n - i / 2) * (i - 1))[j > i])
  W[-Z] = 0
  list(N = N, W = W, Z = sort(Z))
}
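## Hedged illustration (tri_index is not package API): pair (i, j) with
## i < j maps to the 1-based upper-triangular linear index used above.
tri_index = function(i, j, n) (n - i / 2) * (i - 1) + (j - i)
## tri_index(1, 2, n = 5)  # 1, the first pair
## tri_index(4, 5, n = 5)  # 10 = choose(5, 2), the last pair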
##' @title CCWeight
##' @param X Data matrix to be clustered. The rows are features, and the columns are the samples
##' @param threads Number of threads used for weight initialization
##' @export
CCWeight = function(X, threads = 4) {
  Xnorm = colSums(X**2)  # squared norm of each sample
  CCWeight_Eigen(X, -1.0 * median(Xnorm), threads)
}
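## CCWeight_Eigen is a compiled backend. Below is a hedged pure-R sketch of a
## Gaussian-kernel weighting consistent with the median-based call above; the
## exact kernel and bandwidth used by the backend are assumptions, as is
## `cc_weight_r` itself.
cc_weight_r = function(X) {
  D2 = as.vector(dist(t(X))^2)            # squared pairwise distances, upper-tri order
  W = exp(-D2 / median(colSums(X**2)))    # Gaussian kernel, median-scaled bandwidth
  list(W = W, D = D2)                     # field names mirror downstream M$W, M$D
}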
##' @title CCDataSim
##' @param C Number of clusters to be simulated
##' @param r Radius of the circle on which cluster centers are placed
##' @param vars Standard deviation of each cluster
##' @param p Number of features of data
##' @param N Number of samples per cluster (vector of length C), or a total sample count to split evenly
##' @param alphas Per-cluster scaling of the center positions. Default rep(1, C)
##' @param sparse Ratio of nonzero features of data
##' @param s Standard deviation of the nonzero features
##' @param snr Signal-to-noise ratio controlling the noise added to all features
##' @param show Plot the first two principal components. Default TRUE
##' @import ggplot2
##' @export
CCDataSim = function(C = 4, r = 6, vars = c(0.5, 0.5, 0.5, 0.5), p = 200, N = 100, alphas = NULL, sparse = 0.1, s = 0.1, snr = 2, show = TRUE) {
  if (length(N) != C) {
    N = rep(N %/% C, C)  # split a total count evenly across clusters
  }
  ## alphas
  if (is.null(alphas)) {
    alphas = rep(1, C)
  }
  ## initialize cluster centers: evenly distributed on a circle (2D)
  mat = matrix(nrow = C, ncol = 2)
  i = 1
  for (k in seq(0, C - 1)) {
    x = cos(2 * pi * k / C) * r
    y = sin(2 * pi * k / C) * r
    mat[i, 1] = x * alphas[i]
    mat[i, 2] = y * alphas[i]
    i = i + 1
  }
  ## generate samples
  samplematrix = matrix(ncol = 2, nrow = 0)
  ## labels of data
  labels = vector("numeric", sum(N))
  c = 1
  for (j in seq(1, nrow(mat))) {
    Ns = N[j]
    sd = vars[j]
    for (sample in seq(1, Ns)) {
      newsample = mat[j, ] + rnorm(2, mean = 0, sd = sd)  # sample near the center
      samplematrix = rbind(samplematrix, newsample)
      labels[c] = j
      c = c + 1
    }
  }
  np = p * sparse
  ## generate a pair of orthogonal direction vectors (Gram-Schmidt)
  x = rnorm(np, mean = 0, sd = s)
  yz = rnorm(np, mean = 0, sd = s)
  y = yz - crossprod(yz, x)[1] * x / sum(x**2)
  ## embed the 2D samples into p dimensions: np informative features,
  ## p - np zero features, plus Gaussian noise scaled by 1 / snr
  res = matrix(nrow = nrow(samplematrix), ncol = p)
  for (i in seq(1, nrow(samplematrix), 1)) {
    a = samplematrix[i, 1]
    b = samplematrix[i, 2]
    sample = c(a * x + b * y, rep(0, p - np))
    z = rnorm(p, mean = 0, sd = s / snr)
    res[i, ] = sample + z
  }
  data = res ## n x p
  pca = prcomp(data)
  scores = data.frame(pca$x)
  if (show) {
    plot = ggplot(data = scores,
                  aes(x = PC1, y = PC2, colour = factor(labels))) +
      geom_point(size = 3) + theme_bw() + theme(
        legend.position = "none",
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.text.y = element_text(size = 12, colour = "black"),
        axis.text.x = element_text(size = 12, colour = "black"),
        axis.title.x = element_text(size = 12),
        axis.title.y = element_text(size = 12)
      )
    print(plot)
  }
  list(data = data, labels = labels, features = c(rep(TRUE, np), rep(FALSE, p - np)))
}
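## Hedged usage example (values follow from the defaults above):
## sim = CCDataSim(C = 4, r = 6, p = 200, N = 100, show = FALSE)
## dim(sim$data)      # 100 x 200 (samples x features)
## table(sim$labels)  # 25 samples in each of the 4 clusters
## sum(sim$features)  # 20 informative features (p * sparse)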
##' @title NearestNeighbour
##' @param D List with pair codes \code{K}, squared distances \code{D2}, and weights \code{W}
##' @param n Number of samples
##' @param type "knn" keeps each sample's k nearest neighbours; "ann" keeps the k * n / 2 globally smallest distances
##' @param k Average number of nearest neighbours
##' @return list
##' Z index of nonzero entries in W
##' W weight vector after filtering
##' @export
NearestNeighbour = function(D, n, type = c("knn", "ann"), k = 5) {
  type = match.arg(type)
  ## decode the 0-based linear pair codes in D$K into 1-based (i, j) pairs
  IND = do.call(rbind, lapply(D$K, function(k) {
    c(k %/% n, k %% n) + 1
  }))
  if (type == "knn") {
    ix = NULL
    for (i in 1:n) {
      ## pairs involving sample i, ordered by distance; keep the k closest
      j = which(IND[, 1] == i | IND[, 2] == i)
      ix = c(ix, j[sort(D$D2[j], decreasing = FALSE, index.return = TRUE)$ix[1:k]])
    }
    ix = unique(ix)
  } else {
    ## keep the Q globally smallest distances (about k neighbours per sample)
    Q = k * n / 2
    ix = which(D$D2 <= sort(D$D2, decreasing = FALSE)[Q])
  }
  W = D$W
  W[-ix] = 0
  list(Z = ix, W = W, D2 = D$D2, IND = IND)
}
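## Hedged check of the pair decoding above (`encode_pair` is illustrative):
## a 0-based code k = (i - 1) * n + (j - 1) decodes back via %/% and %%.
## encode_pair = function(i, j, n) (i - 1) * n + (j - 1)
## encode_pair(3, 7, n = 10)    # 26
## c(26 %/% 10, 26 %% 10) + 1   # 3 7, recovering the pair (3, 7)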
##' @title PostKNNByOutlier
##' @param Z List of KNN, as returned by \code{NearestNeighbour}
##' @param D List with distances and weights
##' @param q Quantile cutoff: pairs with weight below the q-quantile of \code{D$W} are dropped as outliers. Default 0.5
##' @return list
##' Z index of nonzero entries in W
##' W weight vector after filtering
##' @export
PostKNNByOutlier = function(Z, D, q = 0.5) {
  ## q-quantile of the weights, read off a grid of deciles
  t = quantile(D$W, seq(0, 1, length.out = 11))[q * 10 + 1]
  ix = (Z$W[Z$Z] >= t)  # weights kept
  jx = (Z$W[Z$Z] < t)   # weights removed
  ZZ = Z$Z[ix]
  W = Z$W
  W[Z$Z[jx]] = 0
  list(Z = ZZ, W = W, D2 = D$D2)
}
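## Hedged toy run: with q = 0.5 roughly the weaker half of the kept edges is
## zeroed (D and Z here are synthetic stand-ins, not real pipeline output):
## D = list(W = runif(45), D2 = runif(45))
## Z = list(Z = 1:45, W = D$W)
## length(PostKNNByOutlier(Z, D, q = 0.5)$Z)  # about 23 edges survive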
##' @title OptProxCC
##' @description If Lambda = 0, ProxCC returns n singleton clusters; as Lambda approaches lambda_max, ProxCC merges all
##' samples into one cluster. This routine searches that path for the best Lambda.
##' @param X Data matrix to be clustered. The rows are features, and the columns are the samples
##' @param U Initial clustering exemplars. The rows are features, and the columns are the samples. Default is NULL
##' @param Z Prefiltered weight list from \code{NearestNeighbour} or \code{Ann4Cc}
##' @param nLambda Number of Lambda values on the log-spaced grid for the penalty term Lambda * \code{w[k]} * |U_{,i} - U_{,j}|_2. Default 10
##' @param Gamma A regularization parameter for the number of nonzero features within the penalty term Gamma * \code{r[k]} * |U_{k,}|_2
##' @param R The adaptive group lasso weights for the feature-sparsity penalty
##' @param p Number of features
##' @param n Number of observations
##' @param tol Tolerance for convergence
##' @param maxit Max iterations
##' @param threads Number of CPU threads for convex clustering, used via OpenMP
##' @param verbose Print details or not
##' @param warm Use a warm-start scheme along the Lambda path. Default FALSE
##' @param metric Optimization scheme for the hyper-parameter: eBIC or Silhouette
##' @export
OptProxCC = function(X, U = NULL, Z, nLambda = 10, Gamma, p, n, R, tol, maxit, threads, verbose = TRUE, warm = FALSE, metric = c("eBIC", "Silhouette")) {
  metric = match.arg(metric)
  M = CCWeight(X, threads)
  np = n * p
  ## log-spaced Lambda grid between the smallest and largest critical values
  LambdaR = n * sqrt(M$D) / M$W
  Lambda_min = min(LambdaR)
  Lambda_max = max(LambdaR)
  Lambdas = c(0, 10**seq(0, log10(Lambda_max / Lambda_min), length.out = nLambda) * Lambda_min)
  if (is.null(U)) {
    U = X * 0
  }
  fit = list()
  vs = NULL
  for (i in 1:length(Lambdas)) {
    if (verbose) {
      cat("Lambda = ", Lambdas[i], "... \n")
    }
    C = ProxSNN_Eigen(X = X, U = U, Z = Z$Z, Lambda = Lambdas[i], Gamma = Gamma, p = p, n = n, m = length(Z$Z), W = Z$W, R = R, tol = tol, maxit = maxit, verbose = verbose, threads = threads)
    fit[[i]] = C$U
    if (warm) {
      U = fit[[i]]  # warm start from the previous solution
    }
    P = ProxSCC_CFinder(C$U, tol = tol)
    if (metric == "eBIC") {
      df = max(P$A)  # degrees of freedom: number of distinct clusters
      RSS = norm(X - C$U, "F")**2
      ## extended-BIC score over a grid of g in [0, 1]; the original additive
      ## "(2 * g + 1)" term left g inert, so it is applied here as a
      ## multiplier on df * log(np)
      eBIC = function(g) {
        np * log(RSS / np) + (2 * g + 1) * df * log(np)
      }
      vs = rbind(vs, c(Lambdas[i], sapply(seq(0, 1, by = 0.1), eBIC)))
    } else if (metric == "Silhouette") {
      CA = which(table(P$A) > 1)  # clusters with more than one member
      sil = sapply(which(P$A %in% CA), function(i) {
        ## a: average distance to the sample's own cluster, over kNN edges
        c = which(P$A == P$A[i])
        a = sum(sqrt(Z$D2)[(Z$IND[, 1] %in% c & Z$IND[, 2] == i) | (Z$IND[, 1] == i & Z$IND[, 2] %in% c)]) / (length(c) - 1)
        ## b: smallest average distance to any other cluster
        b = NULL
        for (l in setdiff(unique(P$A), P$A[i])) {
          c = which(P$A == l)
          b = c(b, sum(sqrt(Z$D2)[(Z$IND[, 1] %in% c & Z$IND[, 2] == i) | (Z$IND[, 1] == i & Z$IND[, 2] %in% c)]) / length(c))
        }
        b = min(b)
        (b - a) / max(a, b)
      })
      vs = rbind(vs, c(Lambdas[i], mean(sil)))
    }
  }
  if (metric == "eBIC") {
    ix = which.min(vs[, 7])  # column 7 is the eBIC at g = 0.5 on the grid
    lambda.opt = Lambdas[ix]
  } else {
    ix = which.max(vs[, 2])
    lambda.opt = Lambdas[ix]
  }
  list(lambda = lambda.opt, gamma = Gamma, metric = vs, fit = fit[[ix]])
}
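## Hedged end-to-end sketch (ProxSNN_Eigen / ProxSCC_CFinder are compiled
## backends; all argument values here are illustrative):
## W   = CCWeight(X, threads = 4)
## Z   = NearestNeighbour(W, n = ncol(X), type = "knn", k = 5)
## opt = OptProxCC(X, Z = Z, nLambda = 10, Gamma = 1, p = nrow(X), n = ncol(X),
##                 R = rep(1, nrow(X)), tol = 1e-5, maxit = 500, threads = 4,
##                 metric = "eBIC")
## labels = CFinder(opt$fit)$Labels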
##' @title NN2
##' @param D List with pair codes \code{K}, squared distances \code{D2}, and weights \code{W}
##' @param n Number of samples
##' @param type Neighbourhood type. Only "knn" is implemented; the argument is currently unused
##' @param k Average number of nearest neighbours
##' @param X Raw observed data matrix, with samples as rows (as expected by \code{FNN::get.knn})
##' @return list
##' Z index of nonzero entries in W
##' W weight vector after filtering
##' @export
##' @importFrom FNN get.knn
NN2 = function(D, n, type = "knn", k = 50, X) {
  ## decode the 0-based linear pair codes into 1-based (i, j) pairs
  II = D$K %/% n
  IJ = D$K %% n
  IND = cbind(II, IJ) + 1
  ## exact kNN indices from FNN
  NNi = get.knn(X, k = k)$nn.index
  ix = lapply(1:n, function(i) {
    j = NNi[i, ]
    r = (i < j)
    ## map each (i, j) pair to its upper-triangular linear index
    r * ((n - i / 2) * (i - 1) + (j - i)) + (1 - r) * ((n - j / 2) * (j - 1) + (i - j))
  })
  ix = unique(unlist(ix))
  W = D$W
  W[-ix] = 0
  list(Z = ix, W = W, D2 = D$D2, IND = IND)
}
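## Hedged usage sketch (Xs has samples as rows; W as returned by the
## distance/weight backend with fields K, W, D2):
## Zk = NN2(D = W, n = nrow(Xs), k = 50, X = Xs)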