gendata <- function(n, p = 1000, q=20, seed=1, tau=10){
set.seed(1)
Z <- matrix(rnorm(p*q), p, q)
V <- qr.Q(qr(Z))
t(V)%*%V
lambdavec <- 1 - ((1:q)-1)/ p
Lambda <- diag(lambdavec)
set.seed(seed)
U <- matrix(rnorm(n*q), n, q)
tE <- matrix(rnorm(n*p), n, p) / tau
X <- U%*%Lambda%*% t(V) + tE
return(list(X = X, U=U))
}
# Metrics -----------------------------------------------------------------
#-- assist to evaluate reconstruction error
recerr2 <- function(tsdata,trdata,alpha,kern){
# x <- tsdata
# data <- trdata
ns <- nrow(tsdata)
d <- ncol(alpha)
if(!inherits(tsdata,'matrix')) tsdata <- as.matrix(tsdata)
if(!inherits(trdata,'matrix')) trdata <- as.matrix(trdata)
K <- kernelMatrix(kern, as.matrix(trdata))
Krow <- apply(K,2, mean)
Ksum <- mean(Krow)
sumalpha <- apply(alpha, 2, sum)
alphaKrow <- Krow %*% alpha
kmat = kernelMatrix(kern, trdata,tsdata)
f <- t(kmat) %*% (alpha) - matrix(apply(kmat, 2,mean)- Ksum, ncol=1)%*% t(sumalpha) -
matrix(alphaKrow, nrow=ns, ncol=d, byrow=TRUE)
f <- as.matrix(f)
res <- diag(kernelMatrix(kern, tsdata,tsdata)) - 2*apply(kmat, 2, mean) + Ksum - apply(f*Conj(f),1, sum)
return(as.vector(res))
}
mat2orth <- function(mat){
# mat <- R0
qr1 <- qr(mat)
Q <- qr.Q(qr1)
return(Q * sqrt(nrow(mat)))
}
colSD <- function(xmat, na.rm=TRUE){
apply(xmat, 2, sd, na.rm=na.rm)
}
###
dis_fun <- function(H, H0){
tr_fun <- function(x) sum(diag(x))
mat1 <- t(H0) %*% H %*% ginv(t(H) %*% H) %*% t(H) %*% H0
val <- 1- tr_fun(mat1)/tr_fun(t(H0) %*% H0)
return(val)
}
normvec_fun <- function(mu, hmu){
mean(abs(mu-hmu))
}
project.err <- function(H, H0){
require(MASS)
dH <- H0- H %*% ginv(t(H) %*% H) %*% t(H) %*% H0
sqrt(sum(dH^2))/ sqrt(sum(H0^2))
}
#### Clustering metric
cluster_metric <- function(hy, y, type=c('ARI', 'NMI') ){
require(mclust)
require(aricode)
type <- match.arg(type)
switch(type,
ARI= adjustedRandIndex(hy, y),
NMI = NMI(as.vector(hy), y))
}
classific_metric <- function(hy, y, type=c('F1score', 'ACC', 'Precision','Recall' )){
library(Metrics)
library(MLmetrics)
type <- match.arg(type)
## There are many metrics in this package
switch(type,
F1score= F1_Score(as.vector(hy), y),
ACC = Accuracy(as.vector(hy), y),
Precision=Precision(as.vector(hy), y),
Recall = Recall(as.vector(hy), y))
}
get_r2_mcfadden <- function(embeds, y){
library(nnet)
library(performance)
dat <- NULL
y <- as.numeric(as.factor(y))
hq <- ncol(embeds)
dat <- as.data.frame(cbind(y=y, x=embeds))
dat$y <- factor(dat$y)
name <- c('y', paste0('V', 1:hq))
names(dat) <-name
formu <- paste0("y~")
for(i in 1:hq){
if(i < hq){
formu <- paste(formu, name[i+1], seq='+')
}else{
formu <- paste(formu, name[i+1], seq='')
}
}
model1 <- nnet::multinom(as.formula(formu), data = dat)
R2 <- r2_mcfadden(model1)
return(R2$R2_adjusted)
}
# My methods --------------------------------------------------------------
select.bw <- function(X, scale=TRUE, nsample=100, seed=1){
set.seed(seed)
if(scale) X <- scale(X)
n_spots <- nrow(X)
idx <- sample(n_spots, min(nsample, n_spots))
dis <-stats::dist(X[idx,])
sigma <- 1/sqrt(mean(dis^2))
return(sigma)
}
Tr <- function(x) sum(diag(x))
Diag <- function (vec)
{
q <- length(vec)
if (q > 1) {
y <- diag(vec)
}
else {
y <- matrix(vec, 1, 1)
}
return(y)
}
linear.prog <- function(y, s= 50){
library(Rglpk)
p <- length(y)
A <- matrix(1, nrow = 1, ncol=p, byrow = TRUE)
bounds <- list(lower=list(ind=1:p, val=rep(0, p)),
upper = list(ind=1:p, val=rep(1, p)))
result <- Rglpk_solve_LP(obj = y, mat = A, dir = c("<="), rhs = s,
bounds = bounds, types = "C",max=TRUE, maxit = 10000, tol.abs = 1e-09, tol.rel = 1e-09)
optimal_solution <- result$solution
return(optimal_solution)
}
sketchfun <- function(m, n, type=c("subGaussian", "ROS", "subSampling")){
library(Matrix)
Praw <- Diagonal(n, x = 1)
ind <- sample(n, m)
R <- Diagonal(n, 1/2 - rbinom(n, 1, 0.5))
H <- Praw
S <- switch(type,
subGaussian = matrix(rnorm(n*m),m,n),
ROS = sqrt(n/m)*Praw[ind,]%*% H %*% R,
subSampling = sqrt(n/m)*Praw[ind,])
return(S)
}
# oKPCA <- function(X,d, kern=rbfdot(sigma=0.5)){
#
#
# n <- nrow(X)
# tic <- proc.time()
# K <- kernelMatrix(kern, as.matrix(X))
# eigK <- eigen(K, symmetric = TRUE)
# toc <- proc.time()
# res <- list()
# res$alpha <- eigK$vectors[,1:d]
# res$K <- K
# res$time.use <- toc[3] - tic[3]
# return(res)
# }
#
SKPCA.R <- function(X, d, m=ceiling(sqrt(nrow(X))), Stype=c('subGaussian', 'ROS', 'subSampling'),
kern=rbfdot(sigma=0.5), rescale=TRUE, seed=2){
library(irlba)
Stype <- match.arg(Stype)
tic <- proc.time()
n <- nrow(X)
if(rescale) X <- scale(X) ## scale X
K <- kernelMatrix(kern, as.matrix(X))
# Krow <- apply(K,2, mean)
# Ksum <- mean(Krow)
#K <- K - matrix(Krow, n,n, byrow=T) - matrix(Krow, n, n) + Ksum
set.seed(seed) # reproducible
S <- sketchfun(m,n, Stype)
SKS <- S %*% K %*% t(S)
eigSKS <- eigen(SKS, symmetric = TRUE)
V <- K %*% t(S) %*% (eigSKS$vectors) %*% Diag(1/sqrt(eigSKS$values))
svdV <- irlba(V, nv=d)
toc <- proc.time()
res <- list()
res$A <- svdV$u
res$K <- V %*% t(V)
res$time.use <- toc[3] - tic[3]
class(res) <- 'sKPCA'
return(res)
}
# sKPCA.Cpp <- function(X, d, m=ceiling(sqrt(nrow(X))), Stype=c('subGaussian', 'ROS', 'subSampling'),
# kern=rbfdot(sigma=0.5), rescale=TRUE,seed=2){
#
# # d=4; m=50;Stype='subGaussian';
# # kern=rbfdot(sigma=0.5); seed=2;
# Stype <- match.arg(Stype)
# if(rescale) X <- scale(X) ## scale X
# n <- nrow(X)
# K <- matrix(0, nrow=n, ncol=n); A <- matrix(0, nrow=n, ncol=d)
# tic <- proc.time()
# set.seed(seed)
# res.cpp <- SKPCA_cpp(X, kern, d=d, m=m, Stype=Stype)
# toc <- proc.time()
#
# # res.cpp <- list(A=A, K= K)
# res.cpp$time.use <- toc[3] - tic[3]
# return(res.cpp)
# }
#
# gendata_LFM <- function(seed=1, n=400, p=200, p_nz=50, q=6, sigma2_e=0.01){
# set.seed(seed)
# library(dplyr)
# H <- rnorm(n*q) %>% matrix(nrow=n, ncol=q) %>% qr %>% qr.Q()
# Bg <- rnorm(p_nz*q) %>% matrix(nrow=p_nz, ncol=q) %>% qr %>% qr.Q()
# Bg <- Bg %*% diag(c(10, 8, 7, 5, 3, 1))
# B <- rbind(Bg, matrix(0, p-p_nz, q))
# sigma2_e <- 0.01
# U <- MASS::mvrnorm(n, mu=rep(0, p), Sigma = sigma2_e*diag(rep(1,p)))
# X <- H %*% t(B) + U
# nz.index <- 1:p_nz
# return(list(X, H0=H, nz.index=nz.index))
# }
# SKPCA.FS.R <- function(X, dim.PCs=6, num.fea=50, dim.sketch=round(sqrt(nrow(X))),
# kern.type = c("linear", "gaussian"), gaussian.method=c("M1", "M0"),
# kernel=NULL, Stype=c('subGaussian', 'ROS', 'subSampling'),
# use.sketch=TRUE, maxIter=20, epsObj = 1e-8){
# # kern.type = c( "gaussian"); use.sketch=TRUE; maxIter=20; epsObj = 1e-8
# # num.fea=4; dim.PCs=2; m <- 50; kern <- rbfdot(sigma=0.1)
# require(kernlab)
# require(irlba)
# Stype <- match.arg(Stype)
# X <- scale(X)
# tic <- proc.time()
# kern.type <- match.arg(kern.type)
# gaussian.method <- match.arg(gaussian.method)
# objfun <- function(X, alpha1, wvec){
# n <- nrow(X)
# #alpha1_col <- matrix(alpha1, n, 1)
# y <- t(alpha1) %*% X %*% t(X * matrix(wvec, n, p, byrow=T)) %*% alpha1
# return(Tr(y))
# }
#
# p <- ncol(X); n <- nrow(X);
# s <- num.fea; q <- dim.PCs; m <- dim.sketch
# obj.vec <- rep(NA, maxIter)
# obj.vec[1] <- -1e10
#
# if(q >s) stop("dim.PCs must be less than num.fea!")
# if(q<2) stop("dim.PCs must be greater than 1!")
# ### Initialize
# wvec <- rep(1, p)
# alpha1 <- irlba(X, nv=q)$u[,1:q] ## can be speed up using ilrba
#
# if(is.null(kernel)){
# if(kern.type=='linear'){
# kern <- polydot(degree = 1)
# }else if(kern.type=='gaussian'){
# kern <- rbfdot(sigma=0.1)
# }
#
# }
# if(!is.null(kernel)){
# kern <- kernel
# }
#
# if(kern.type == 'gaussian' && gaussian.method == 'M0'){
# Kcube <- array(dim=c(n, n, p))
# for(l in 1:p){
# Kcube[,,l] <- kernelMatrix(kern,X[,l])
# }
#
# }
# if(kern.type == 'gaussian' && gaussian.method == 'M1'){
# res.skpca <- sKPCA.Cpp(X=X * matrix(wvec, n, p, byrow=TRUE), d=q, m=m, kern = kern)
# K <- res.skpca$K
# }
#
#
# # message("check point1????")
# for(iter in 2:maxIter){
#
# # iter <- 2
#
# # update wvec
# if(kern.type=='linear'){
# y <- rowSums((t(X) %*% alpha1)^2)
# }else if(kern.type=='gaussian'){
# y <- rep(NA, p)
# if(gaussian.method=='M1'){
# for(l in 1:p){
# tmpMat <- t(alpha1) %*% (-2*K * (matrix(X[,l], n, n)- matrix(X[,l], n, n, byrow=T))^2*kern@kpar$sigma) %*% alpha1
# y[l] <- Tr(tmpMat) # * wvec[l]
# }
#
# }else if(gaussian.method == 'M0'){
# for(l in 1:p){
# tmpMat <- t(alpha1) %*% (Kcube[,,l]- 0.1) %*% alpha1
# y[l] <- Tr(tmpMat) # * wvec[l]
# }
# }
#
#
# }
# cat(y, '\n')
# # message("check point2????")
# #wvec <- linear.prog(y, s=s)
# wvec <- rep(0,p)
# idx <- order(y, decreasing = TRUE)
# wvec[idx[1:s]] <- 1
# # wy <- update_w( X, alpha1, as.matrix(K), s, kern.type, Kg_h=1/kern@kpar$sigma)
# # wvec <- wy[,1]
# # y <- wy[,2]
# cat(wvec, '\n')
# cat(y, '\n')
# ## update alpha: svd
# # obj1 <- objfun(X, alpha1, wvec)
# ### speed using sktech
# if(use.sketch){
# res.skpca <- sKPCA.Cpp(X=X[,idx[1:s]], d=q, m=m, kern = kern, Stype= Stype)
# alpha1 <- res.skpca$A
# K <- res.skpca$K
# }else{
# K <- kernelMatrix(kern, X[,idx[1:s]])
# alpha1 <- eigen(K)$vectors[,1:q]
# }
# # obj2 <- objfun(X, alpha1, wvec)
# # message("dalpha1 = ", obj2-obj1)
# # message("check point3????")
#
# obj3 <- objfun(X, alpha1, wvec)
# # message("dpvec = ", obj3-obj2)
# obj.vec[iter] <- obj3
# dObj <- abs(obj.vec[iter]-obj.vec[iter-1])/abs(obj.vec[iter-1])
# message("Obj=", round(obj.vec[iter], 4) , ", dObj = ", dObj)
# if(dObj < epsObj){
# break
# }
# }
# toc <- proc.time()
# time.use <- toc[3] - tic[3]
# if(kern.type=='linear'){
# importance.scores <- y/sum(y)
# }else if(kern.type=='gaussian'){
# if(gaussian.method=='M1'){
# y.scale <- y/max(abs(y))
# importance.scores <- exp(y.scale/2)/sum(exp(y.scale/2))
#
# }else if(gaussian.method == 'M0'){
# for(l in 1:p){
# tmpMat <- t(alpha1) %*% (Kcube[,,l]- 0.1) %*% alpha1
# y[l] <- Tr(tmpMat) # * wvec[l]
# }
# }
# }
#
# reslist <- list(A = alpha1, w = as.integer(wvec>0), importance.scores=importance.scores, obj.seq = obj.vec[1:iter][-1],
# time.use = time.use)
# return(reslist)
# }
#
#
# SKPCA.FS <- function(X, dim.PCs=6, num.fea=50, dim.sketch=round(sqrt(nrow(X))),
# kern.type = c("linear", "gaussian"), gaussian.method=c("M1", "M0"),
# kernel.use=NULL, Stype=c('subGaussian', 'ROS', 'subSampling'),
# rescale = TRUE,
# use.sketch=TRUE, maxIter=20, epsObj = 1e-8){
# # kern.type = c( "gaussian"); use.sketch=TRUE; maxIter=20; epsObj = 1e-8
# # num.fea=4; dim.PCs=2; m <- 50; kern <- rbfdot(sigma=0.1)
# # kern.type <- 'linear'
# require(kernlab)
# require(irlba)
# Stype <- match.arg(Stype)
# tic <- proc.time()
# kern.type <- match.arg(kern.type)
# gaussian.method <- match.arg(gaussian.method)
#
# if(rescale) X <- scale(X)
# p <- ncol(X); n <- nrow(X);
# s <- num.fea; q <- dim.PCs; m <- dim.sketch
# obj.vec <- rep(NA, maxIter)
# obj.vec[1] <- -1e10
#
# if(q >s) stop("dim.PCs must be less than num.fea!")
# if(q<2) stop("dim.PCs must be greater than 1!")
# ### Initialize
# wvec <- rep(1, p)
# alpha1 <- irlba(X, nv=q)$u[,1:q]
#
# if(!is.null(kernel.use)){
# kern <- kernel.use
# }
# if(is.null(kernel.use)){
# if(kern.type=='linear'){
# kern <- polydot(degree = 1)
# }else if(kern.type=='gaussian'){
# kern <- rbfdot(sigma=0.1)
# }
# }
# if(kern.type=='linear'){
# h <- 1
# }else if(kern.type=='gaussian'){
# h <- 1/kern@kpar$sigma
# }
# # if(use.sketch){
# # res.skpca <- sKPCA.Cpp(X=X * matrix(wvec, n, p, byrow=TRUE), d=q, m=m, kern = kern)
# # K <- res.skpca$K
# # }else{
# # K <- kernelMatrix(kern, X)
# # }
# K <- kernelMatrix(kern, X)
# if(kern.type == 'gaussian' && gaussian.method == 'M0'){
# Kcube <- array(dim=c(n, n, p))
# for(l in 1:p){
# Kcube[,,l] <- kernel.useMatrix(kern,X[,l])
# }
#
# }
#
#
#
# # message("check point1????")
# for(iter in 2:maxIter){
#
# # iter <- 2
# wy <- update_w( X, alpha1, as.matrix(K), s, kern.type, Kg_h=h)
# wvec <- wy[,1]
# y <- wy[,2]
# idx <- order(y, decreasing = TRUE)
# # cat(wvec, '\n')
# # cat(y, '\n')
# ## update alpha: svd
# # obj1 <- objfun(X, alpha1, wvec)
# ### speed using sktech
# if(use.sketch){
# res.skpca <- sKPCA.Cpp(X=X[,idx[1:s]], d=q, m=m, kern = kern, Stype= Stype)
# alpha1 <- res.skpca$A
# K <- res.skpca$K
# }else{
# K <- kernel.useMatrix(kern, X[,idx[1:s]])
# alpha1 <- eigen(K)$vectors[,1:q]
# }
# # obj2 <- objfun(X, alpha1, wvec)
# # message("dalpha1 = ", obj2-obj1)
# # message("check point3????")
#
# # obj3 <- objfun(X, alpha1, wvec)
# obj3 <- objfun_Cpp(X[,idx[1:s]], alpha1)
# # message("dpvec = ", obj3-obj2)
# obj.vec[iter] <- obj3
# dObj <- abs(obj.vec[iter]-obj.vec[iter-1])/abs(obj.vec[iter-1])
# message("Obj=", round(obj.vec[iter], 4) , ", dObj = ", dObj)
# if(dObj < epsObj){
# break
# }
# }
# toc <- proc.time()
# time.use <- toc[3] - tic[3]
# if(kern.type=='linear'){
# importance.scores <- y/sum(y)
# }else if(kern.type=='gaussian'){
# if(gaussian.method=='M1'){
# y.scale <- y/max(abs(y))
# importance.scores <- exp(y.scale/2)/sum(exp(y.scale/2))
#
# }else if(gaussian.method == 'M0'){
# for(l in 1:p){
# tmpMat <- t(alpha1) %*% (Kcube[,,l]- 0.1) %*% alpha1
# y[l] <- Tr(tmpMat) # * wvec[l]
# }
# }
# }
#
# reslist <- list(A = alpha1, w = as.integer(wvec>0), importance.scores=importance.scores,
# time.use = time.use)
# return(reslist)
# }
#
# Compared methods --------------------------------------------------------
kpcaIG.fit <- function(X,dim.PCs=6, num.fea = 50, kern.type=c('polydot', "rbfdot")){
# dim.PCs=6; num.fea = 50; kern.type=c('polydot')
library(kpcaIG)
kern.type <- match.arg(kern.type)
tic <- proc.time()
# compute one kernel.use for the psychem dataset
p <- ncol(X)
colnames(X) <- 1:p
kpca_tan <- kernelpca(X, kernel = kern.type, features = dim.PCs)
kpca_ig_tan <- kpca_igrad(kpca_tan, dim = 1:dim.PCs)
row.names(kpca_ig_tan) <- kpca_ig_tan$column_names
kpca_ig_tan_raw <- kpca_ig_tan[as.character(1:p),]
imp.var.permute <- as.numeric(kpca_ig_tan$column_names)[1:num.fea]
toc <- proc.time()
time.use <- toc[3] - tic[3]
return(list(PCs=kpca_tan@pcv, fea.select=imp.var.permute, means_norms =kpca_ig_tan_raw$means_norms ,time.use=time.use, fit=kpca_ig_tan_raw))
}
KPCA.Permute.fit <- function(X, dim.PCs=6, num.fea = 50, kern.type= c("linear", "gaussian.radial.basis")){
library(mixKernel)
kern.type <- match.arg(kern.type)
tic <- proc.time()
# compute one kernel.use for the psychem dataset
p <- ncol(X)
colnames(X) <- 1:p
phychem.kernel.use <- compute.kernel(X, kernel.func = kern.type)
# perform a KPCA
kernel.use.pca.result <- kernel.pca(phychem.kernel.use, ncomp = dim.PCs)
# compute importance for all variables in this kernel.use
kernel.use.pca.result <- kernel.pca.permute(kernel.use.pca.result, phychem=1:p)
imp.var.permute <- order(kernel.use.pca.result$cc.distances, decreasing = T)[1:num.fea]
toc <- proc.time()
time.use <- toc[3] - tic[3]
return(list(PCs=kernel.use.pca.result$x, fea.select=imp.var.permute, cc.dist =kernel.use.pca.result$cc.distances ,time.use=time.use))
}
SPCA.fit <- function(X, dim.PCs=6, num.fea=50, method=c("SPCA", "RSPCA"), deltaK=2,
logalpha = -4, logalpha.start=-5, logalpha.end=-1, log.step=0.05){
library(sparsepca)
method <- match.arg(method)
tic <- proc.time()
q <- dim.PCs
message("search the optimal penalty parameters...")
logalpha.set <- seq(logalpha.start, logalpha.end, by=log.step)
for(i in seq_along(logalpha.set)){
# i <- 1
logalpha <- logalpha.set[i]
if(method=='SPCA'){
out.spca <- spca(X, k=q, alpha=10^logalpha, beta=1e-3, center = TRUE, scale = FALSE, verbose=0)
}else if(method=='RSPCA'){
out.spca <- rspca(X, k=q, alpha=10^logalpha, beta=1e-3, center = TRUE, scale = FALSE, verbose=0)
}
num.ftmp <- sum(abs(rowSums(out.spca$loadings)) >0)
message('i = ', i, ", logalpha=", logalpha, ", num: ", num.ftmp)
if(num.ftmp <= num.fea+deltaK){
logalpha.opt <- logalpha
break
}
}
if(i == length(logalpha.set)){
logalpha.opt <- logalpha
}
if(method=='SPCA'){
out.spca <- spca(X, k=q, alpha=10^logalpha.opt, beta=1e-3, center = TRUE, scale = FALSE, verbose=0)
}else if(method=='RSPCA'){
out.spca <- rspca(X, k=q, alpha=10^logalpha.opt, beta=1e-3, center = TRUE, scale = FALSE, verbose=0)
}
toc <- proc.time()
out.spca$time.use <- toc[3] - tic[3]
return(out.spca)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.