#' Perform the SIMLR clustering algorithm for large-scale datasets
#'
#' @title SIMLR Large Scale
#'
#' @examples
#' resized = ZeiselAmit$in_X[, 1:340]
#' \dontrun{
#' SIMLR_Large_Scale(X = resized, c = ZeiselAmit$n_clust, k = 5, kk = 5)
#' }
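#' # the per-cell cluster assignments of a run are then available in the
#' # y$cluster field of the returned list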
#'
#' @param X an (m x n) data matrix of gene expression measurements of individual cells, or
#' an object of class SCESet
#' @param c number of clusters to be estimated over X
#' @param k tuning parameter (number of nearest neighbours used to build the similarities)
#' @param kk number of principal components to be assessed in the PCA
#' @param if.impute should the zero entries of the input data be imputed with the mean expression of each gene?
#' @param normalize should the input data be normalized (rescaled to [0, 1] and mean-centred per gene)?
#'
#' @return a list of 8 elements describing the clusters obtained by SIMLR Large Scale, of which y are the resulting clusters:
#' y = results of the k-means clustering,
#' S0 = similarities computed by SIMLR,
#' F = results from the large scale iterative procedure,
#' ydata = 2-dimensional t-SNE embedding of the cells,
#' alphaK = weights of the multiple kernels,
#' val = distances from the k-nearest neighbour search,
#' ind = indices from the k-nearest neighbour search,
#' execution.time = execution time of the present run
#'
#' @export SIMLR_Large_Scale
#' @importFrom stats dnorm kmeans pbeta rnorm
#' @importFrom methods is new
#' @import Matrix
#' @import Rcpp
#' @importFrom pracma orth
#' @import RcppAnnoy
#' @importFrom RSpectra eigs_sym
#' @useDynLib SIMLR projsplx
#'
"SIMLR_Large_Scale" <- function( X, c, k = 10, kk = 100, if.impute = FALSE, normalize = FALSE ) {
    # convert SCESet
    if (is(X, "SCESet")) {
        cat("X is an SCESet, converting to input matrix.\n")
        X = X@assayData$exprs
    }
    # if required, impute the zero entries of X with the mean expression of the corresponding gene
    if(if.impute == TRUE) {
        X = t(X)
        X_zeros = which(X==0,arr.ind=TRUE)
        if(length(X_zeros)>0) {
            R_zeros = as.vector(X_zeros[,"row"])
            C_zeros = as.vector(X_zeros[,"col"])
            ind = (C_zeros - 1) * nrow(X) + R_zeros
            X[ind] = as.vector(colMeans(X))[C_zeros]
        }
        X = t(X)
    }
    # if required, normalize X: rescale to [0, 1], then centre each gene
    if(normalize == TRUE) {
        X = t(X)
        X = X - min(as.vector(X))
        X = X / max(as.vector(X))
        C_mean = as.vector(colMeans(X))
        # apply over rows returns its result transposed, restoring the genes x cells orientation
        X = apply(X,MARGIN=1,FUN=function(x) return(x-C_mean))
    }
    # start the clock to measure the execution time
    ptm = proc.time()
    # set some parameters
    NITER = 5
    num = ncol(X)
    r = -1
    beta = 0.8
    cat("Performing fast PCA.\n")
    fast.pca_res = fast.pca(X,kk)
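    # fast.pca_res is a (cells x kk) matrix of the first kk principal components;
    # the approximate nearest neighbour search below is run on this embedding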
cat("Performing k-nearest neighbour search.\n")
a_annoy = new(AnnoyEuclidean,dim(fast.pca_res)[2])
n_annoy = dim(fast.pca_res)[1]
for (i_annoy in seq(n_annoy)) {
v_annoy = as.vector(fast.pca_res[i_annoy,])
a_annoy$addItem(i_annoy-1,v_annoy)
}
a_annoy$build(200)
val = array(0,c(dim(fast.pca_res)[1],k*2))
ind = array(0,c(dim(fast.pca_res)[1],k*2))
for(j_annoy in 1:dim(val)[1]) {
ind[j_annoy,] = a_annoy$getNNsByItem(j_annoy-1,k*2)+1
for(val_annoy in 1:dim(val)[2]) {
val[j_annoy,val_annoy] = a_annoy$getDistance(j_annoy-1,ind[j_annoy,val_annoy]-1)
}
}
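    # val[i,] now holds the distances and ind[i,] the 1-based indices of the
    # 2*k approximate nearest neighbours of cell i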
cat("Computing the multiple Kernels.\n")
# compute the kernels
D_Kernels = multiple.kernel_large_scale(val,ind,k)
# set up some parameters
alphaK = 1 / rep(length(D_Kernels),length(D_Kernels))
distX = array(0,c(dim(D_Kernels[[1]])[1],dim(D_Kernels[[1]])[2]))
for (i in 1:length(D_Kernels)) {
distX = distX + D_Kernels[[i]]
}
distX = distX / length(D_Kernels)
di = distX[,2:(k+2)]
rr = 0.5 * (k * di[,k+1] - apply(di[,1:k],MARGIN=1,FUN=sum))
if(r<=0) {
r = mean(rr)
}
lambda = max(mean(rr),0)
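    # rr measures, for each cell, the gap between its (k+1)-th neighbour distance
    # and the mean of its first k; r and lambda are initialized to its average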
    S0 = max(max(distX)) - distX
    # normalize the similarities
    S0 = dn_large_scale(S0,'ave')
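    # S0 stores, per cell, only the similarities to its 2*k neighbours; scatter
    # them into a sparse (cells x cells) similarity matrix via the indices in ind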
    S0_sparse = sparseMatrix(i=as.vector(matrix(rep(1:nrow(ind),ncol(ind)))),j=as.vector(ind),x=as.vector(S0),dims=c(nrow(ind),nrow(ind)))
    eig_res = eigs_sym(S0_sparse+t(S0_sparse),c,which="LM")
    F_eig = Re(eig_res$vectors)
    eig_res = Re(eig_res$values)/max(Re(eig_res$values))
    eig_res = (1-beta)*eig_res / (1-beta*eig_res^2)
    tmp_eig = array(0,c(length(eig_res),length(eig_res)))
    diag(tmp_eig) = eig_res
    F_eig = F_eig %*% tmp_eig
    F_eig = dn_large_scale(F_eig,'ave')
    F_eig_initial = F_eig
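    # F_eig holds the c leading eigenvectors of the symmetrized similarity graph,
    # rescaled by the transformed eigenvalues (a smoothing controlled by beta)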
cat("Performing the iterative procedure ",NITER," times.\n")
# perform the iterative procedure NITER times
for(iter in 1:NITER) {
cat("Iteration: ",iter,"\n")
        distf = L2_distance_large_scale(F_eig,ind)
        ad = (distX+lambda*distf)/2/r
        # call the C function for the optimization
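        # projsplx projects each column of its input onto the probability simplex,
        # so every row of the updated ad is non-negative and sums to 1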
        c_input = -t(ad)
        c_output = t(ad)
        ad = t(.Call("projsplx",c_input,c_output))
        S0 = (1 - beta) * S0 + beta * ad
        S0_sparse = sparseMatrix(i=as.vector(matrix(rep(1:nrow(ind),ncol(ind)))),j=as.vector(ind),x=as.vector(S0),dims=c(nrow(ind),nrow(ind)))
        eig_res = eigs_sym(S0_sparse+t(S0_sparse),c,which="LM")
        F_eig = Re(eig_res$vectors)
        eig_res = Re(eig_res$values)/max(Re(eig_res$values))
        eig_res = (1-beta)*eig_res / (1-beta*eig_res^2)
        tmp_eig = array(0,c(length(eig_res),length(eig_res)))
        diag(tmp_eig) = eig_res
        F_eig = F_eig %*% tmp_eig
        F_eig = dn_large_scale(F_eig,'ave')
        F_eig = (1 - beta) * F_eig_initial + beta * F_eig
        F_eig_initial = F_eig
        DD = vector()
        for (i in 1:length(D_Kernels)) {
            temp = (.Machine$double.eps+D_Kernels[[i]]) * (S0+.Machine$double.eps)
            DD[i] = mean(apply(temp,MARGIN=2,FUN=sum))
        }
        # update the kernel weights and normalize them to sum to 1
        alphaK0 = umkl(DD)
        alphaK0 = alphaK0 / sum(alphaK0)
        alphaK = (1-beta) * alphaK + beta * alphaK0
        alphaK = alphaK / sum(alphaK)
        lambda = 1.5 * lambda
        r = r / 1.01
        # compute Kbeta
        distX = D_Kernels[[1]] * alphaK[1]
        for (i in 2:length(D_Kernels)) {
            distX = distX + as.matrix(D_Kernels[[i]]) * alphaK[i]
        }
    }
    # compute the execution time
    execution.time = proc.time() - ptm
    cat("Performing Kmeans.\n")
    # cluster the cells by k-means on the eigenvector embedding, with 200 random restarts
    y = kmeans(F_eig,c,nstart=200)
    cat("Performing t-SNE.\n")
    I = as.vector(seq(1,ncol(ind)*nrow(ind)+1,ncol(ind)))
    J = as.vector(t(ind))
    V = as.vector(t(S0))
    ydata = Rtsne(I,J,V)$Y
    # create the structure with the results
    results = list()
    results[["y"]] = y
    results[["S0"]] = S0
    results[["F"]] = F_eig
    results[["ydata"]] = ydata
    results[["alphaK"]] = alphaK
    results[["val"]] = val
    results[["ind"]] = ind
    results[["execution.time"]] = execution.time
    return(results)
}