#######################################################################
# Tuning functions for classification
#######################################################################
### Unsupervised km tuning
#' This is an internal function called by opcg
#'
#' @keywords internal
#' @noRd
#'
tuning_km=function(x_datta, n_hlist, edr_list, centers, iter.max=100, nstart = 5){
  tune_km = lapply(1:n_hlist, function(k) {
    # Project the predictors onto the estimated basis for bandwidth k ("n x d" format)
    sdr_pred = t( t(edr_list[[k]])%*%t(x_datta) )
    # Run k-means once and return the within/between sum-of-squares ratio
    km = kmeans(sdr_pred, centers=centers, iter.max=iter.max, nstart=nstart)
    km$tot.withinss/km$betweenss
  })
  return(Reduce('c', tune_km))
}
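# Illustrative usage sketch (not run); assumes `x` is an "n x p" predictor matrix
# and `edr_list` is a list of "p x d" basis estimates, one per candidate bandwidth,
# as produced inside kfold_km_tuning below:
# wss_bss <- tuning_km(x_datta = x, n_hlist = length(edr_list),
#                      edr_list = edr_list, centers = 3)
# which.min(wss_bss)  # bandwidth with the smallest within/between SS ratio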
#' This is an internal function called by opcg
#'
#'
#'
#' @keywords internal
#' @noRd
#'
tuning_skm <- function(x, y, d, class_labels, n_cpc, n_hlist, edr_list,
iter.max = 100, nstart = 100) {
# x=X_tune; y=Y_tune; d=km_d; n_classes=m; n_cpc=1; n_hlist=length(h_list);
# edr_list; nstart = 200; iter.max = 100
n_classes=length(class_labels);
if (length(n_cpc)==1) n_cpc_vec=rep(n_cpc, n_classes) else n_cpc_vec=n_cpc;
sapply(1:n_hlist, function(k) {
# k=2
km_ss=lapply(1:n_classes, function(l) {
# l=3
# SDR Predictors, "n x p" format
sdr_pred=t( t(edr_list[[k]][,1:d])%*%
t(x[which(y==class_labels[l]),]) );
class_grand_mean=colMeans(sdr_pred);
km=kmeans(sdr_pred, centers=n_cpc_vec[l], iter.max = iter.max, nstart = nstart )
# Sum of WSS for the clusters of a given class l;
per_class_wss=km$tot.withinss
# the estimated centers of the given class l;
per_class_cen=km$centers
# The BSS of clusters (i.e. centers) by class
if (n_cpc_vec[l]>1) {
per_class_cen_wss = sum(diag( var(per_class_cen)*(n_cpc_vec[l]-1) ))
} else {
per_class_cen_wss = 0
}
return( list(per_class_wss, per_class_cen,
per_class_cen_wss,
class_grand_mean) )
})
# We want a measure of the distance between centers of different classes
# and a measure of the distance between centers within each class.
# list of all the estimated centers
class_cen <- lapply(1:n_classes, FUN=function(j) km_ss[[j]][[2]] )
# Adding up the WSS by classes
class_wss <- sum( sapply(1:n_classes, FUN=function(j) km_ss[[j]][[1]] ) )
# Adding up the Tot.SS of the centers by class; effectively the supervised BSS
class_grand_means<- lapply(1:n_classes, FUN=function(j) km_ss[[j]][[4]] )
# If a class has just one cluster, the variance between its intra-class
# centers is 0, because there is only one center/point.
# The idea behind class_bss is to add up the sums of squares for the
# intra-class centers, giving a measure of the between-center sum of squares.
# class_bss1 is the sum of the intra-class between-center SS.
# class_bss2 is the total variance of all the class centers.
# class_bss3 is the between-class SS (computed from the mean of each class).
class_bss1= sum( sapply(1:n_classes, FUN=function(j) km_ss[[j]][[3]]));
class_bss2= sum(diag(var( Reduce('rbind', class_cen) )))*(length(n_cpc_vec)-1);
# class_bss3= sum(diag(var( Reduce('rbind', class_grand_means) )))*(n_classes-1);
# Tuning criterion: pooled within-cluster SS relative to the between-center SS;
# a smaller ratio indicates better-separated clusters in the reduced space.
return( class_wss/class_bss2 ) # alternative: class_wss/max(class_bss1, class_bss2)
} )
}
### Weighted Supervised km tuning
#' This is an internal function called by opcg
#'
#'
#'
#' @keywords internal
#' @noRd
#'
tuning_skmkm <- function(x, y, d, class_labels, n_cpc, n_hlist, edr_list,
iter.max = 100, nstart = 100) {
# x=X_tune; y=Y_tune; d=km_d; n_classes=m; n_cpc=1; n_hlist=length(h_list);
# edr_list; nstart = 200; iter.max = 100
# x=X_test; y=Y_test; d=10; class_labels=m_classes; n_cpc=1; n_hlist=length(bw_list);
# edr_list; nstart = 100; iter.max = 100
n_classes=length(class_labels);
#Supervised Tuning
skm_tune=tuning_skm(x, y, d, class_labels, n_cpc, n_hlist, edr_list,
iter.max, nstart)
#Unsupervised Tuning
# number of centers for unsupervised is n_classes*n_cpc if n_cpc is a
# number, i.e. each class has same number, otherwise, it should be sum(n_cpc)
n_centers = if (length(n_cpc)==1) n_classes*n_cpc else sum(n_cpc); #n_classes #
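# e.g., with 3 classes, n_cpc = 2 gives n_centers = 6, while n_cpc = c(1, 2, 2) gives n_centers = 5.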
km_tune=tuning_km(x, n_hlist, edr_list,
centers=n_centers, iter.max=iter.max, nstart = nstart)
# Standardizing the ratios
skm_tune_std=(skm_tune-mean(skm_tune))/sd(skm_tune)
km_tune_std=(km_tune-mean(km_tune))/sd(km_tune)
return(list(wskm= (.5*skm_tune_std + .5*km_tune_std),
km_std=km_tune_std,
skm_std=skm_tune_std ))
}
############################################################################
### K-fold K-Means Tuning
############################################################################
#' K-fold Tuning with K-means
#'
#' This implements the tuning procedure for SDR and classification problems
#' in the forthcoming paper by Quach and Li (2021).
#'
#' The kernel for the local linear regression is fixed to a Gaussian kernel.
#'
#' For large 'p', we strongly recommend using the Conjugate Gradients implementation
#' by setting method="cg".
#' For method="cg", the hybrid conjugate gradient of Dai and Yuan is implemented,
#' but only the Armijo rule is implemented, through backtracking, as in Bertsekas'
#' "Convex Optimization Algorithms".
#' A weak Wolfe condition can also be enforced by setting c_wolfe > 0
#' in the control_list, but since c_wolfe is usually set to 0.1 (Wikipedia)
#' and this drastically slows down the algorithm relative to Newton for small to
#' moderate p, the default does not enforce a Wolfe condition; we assume that
#' our link function gives an initial point close enough that local convergence
#' is satisfactory. Should the initial values be suspect, enforcing the Wolfe
#' condition may be a reasonable trade-off.
#'
#' @param d specifies the reduced dimension
#' @param method "newton" or "cg" methods; for carrying out the optimization using
#' the standard Newton-Raphson (i.e. Fisher Scoring) or using Conjugate Gradients
#' @param std standardization applied to the predictors within each fold; "none"
#' (default), "cov" for an isotropic transformation, or "marg" for marginal
#' standardization
#' @param parallelize Default is FALSE; to run in parallel, you will need to have
#' foreach and some parallel backend loaded; parallelization is strongly recommended
#' and encouraged.
#' @param control_list a list of control parameters for the Newton-Raphson
#' or Conjugate Gradient methods
#' @param ytype specify the response as 'continuous', 'multinomial', or 'ordinal'
#' @param h_list a list of candidate bandwidths for the local-linear gradient estimation
#' @param k the number of folds for the cross-validation
#' @param x_datta an 'n x p' matrix of predictors
#' @param y_datta the response vector
#' @param class_labels the vector of class labels for the response
#' @param n_cpc the number of clusters per class; either a single number applied to
#' every class or a vector with one entry per class
#' @param iter.max the maximum number of iterations passed to kmeans
#' @param nstart the number of random starts passed to kmeans
#'
#' @return A list containing the tuning criteria averaged over the k folds.
#' \itemize{
#'  \item wskmf_ratio - the weighted combination of the standardized supervised and
#'  unsupervised k-means ratios, averaged over the folds.
#'  \item skm_f_ratio - the standardized supervised k-means ratio, averaged over the folds.
#'  \item km_f_ratio - the standardized unsupervised k-means ratio, averaged over the folds.
#' }
#'
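#' @examples
#' \dontrun{
#' # A minimal sketch with illustrative values; assumes `X` is an n x p predictor
#' # matrix, `Y` a multinomial response, and `m_classes` its vector of class labels:
#' h_list <- seq(0.5, 3, by = 0.5)   # candidate bandwidths
#' tune <- kfold_km_tuning(h_list = h_list, k = 5, x_datta = X, y_datta = Y,
#'                         d = 2, ytype = "multinomial",
#'                         class_labels = m_classes, n_cpc = 1,
#'                         method = "cg", parallelize = FALSE)
#' # Pick the bandwidth with the smallest weighted k-means ratio
#' h_list[which.min(tune$wskmf_ratio)]
#' }
#'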
#' @export
#'
kfold_km_tuning=function(h_list, k, x_datta, y_datta, d, ytype,
class_labels, n_cpc, method="newton", std="none",
parallelize = F,
control_list=list(),
iter.max = 100, nstart = 100) {
n=dim(x_datta)[1]; n_hlist = length(h_list);
p=dim(x_datta)[2];
# k = 3;
# x_datta=X; y_datta=Y; ytype="multinomial"; parallelize=T
# class_labels=m_classes; iter.max = 100; nstart = 100
# method="cg"; control_list=list();
partitions=sample(1:(k),size=n,replace=T,prob=rep(1/(k),(k) ) );
X_folds=lapply(1:k, function(fold) x_datta[which(partitions==fold),])
Y_folds=lapply(1:k, function(fold) y_datta[which(partitions==fold)])
# Y_folds=lapply(1:k, function(fold) matrix(y_datta[,which(partitions==fold)], 1,
# table(partitions)[fold]) )
# edr_whole=list();
km_whole=list();
km_std=list();
skm_std=list();
for(fold in 1:k) {
# fold=2;
X_tune=X_folds[[fold]]; Y_tune=Y_folds[[fold]];
X_train=x_datta[-which(partitions==fold), ];
Y_train=y_datta[-which(partitions==fold) ] ;
if (std=="cov") {
X_train=matpower_cpp(cov(t(X_train)), -1/2)%*%
t(sapply(1:p, FUN= function(j) center_cpp(X_train[j,], NULL) ) );
X_tune=matpower_cpp(cov(t(X_tune)), -1/2)%*%
t(sapply(1:p, FUN= function(j) center_cpp(X_tune[j,], NULL) ) );
} else if (std=="marg"){
# Standardize marginally at least to bring variation in line, if total
# isotropic transformation is not possible due to high correlation between
# predictors.
X_train=t(sapply(1:p, FUN= function(j) center_cpp(X_train[j,], NULL)/sd(X_train[j,]) ) );
X_tune=t(sapply(1:p, FUN= function(j) center_cpp(X_tune[j,], NULL)/sd(X_tune[j,]) ) );
}
# dim(X_tune)[2] + dim(X_train)[2]
# table(partitions)
edr_list=list()
for (iter in 1:n_hlist) {
edr_iter=opcg(x=X_train, y=Y_train, d=d, bw=h_list[iter],
ytype=ytype, method=method, parallelize=parallelize,
control_list=control_list)
edr_list[[iter]]=edr_iter;
}
km_tune=tuning_skmkm(X_tune, Y_tune, d, class_labels, n_cpc, n_hlist, edr_list,
iter.max, nstart)
# edr_whole[[fold]]=edr_list;
km_whole[[fold]]=km_tune$wskm;
km_std[[fold]]=km_tune$km_std;
skm_std[[fold]]=km_tune$skm_std;
}
# return(list(edr=edr_whole, km=km_whole))
return(list(wskmf_ratio=Reduce('+', km_whole)/k,
skm_f_ratio=Reduce('+', skm_std)/k,
km_f_ratio=Reduce('+', km_std)/k));
}