#' Port of Tomioka and Suzuki's SpicyMKL to R, expanded for multiclass and probability outputs.
#' @title spicer
#' @param K N x N x M array. The (i,j,m)-element contains the (i,j)-element of the m-th kernel Gram matrix.
#' @param yapp vector of length N with sample labels. It should be a factor for binary/multiclass classification.
#' @param C regularization parameter. Large values of C induce strong regularization. For L1 regularization C is a scalar; for elastic net, C is a vector of length 2 giving the penalty C[1]*|x| + (C[2]/2)*x^2
#' @param opt list of options which control spicer behavior:
#' \describe{
#' \item{loss}{type of loss function: 'logit' (logistic regression, log(1 + exp(-y*f(x)))) for classification,
#' 'square' (squared loss, 0.5*(y - f(x))^2) for regression}
#' \item{regname}{type of regularization: 'l1' (default), 'elasticnet'}
#' \item{optname}{optimization solver for dual variable (rho) inner minimization: 'Newton' (default), 'BFGS'}
#' \item{stop_duality_gap}{TRUE/FALSE. If TRUE, SPICER uses the duality gap as a stopping criterion for the outer loop. Default TRUE.}
#' \item{stop_ineq_violation}{TRUE/FALSE. If TRUE, SPICER uses violation of the KKT inequality as a stopping criterion for the outer loop. Default FALSE.}
#' \item{outer_maxiter}{maximum number of iterations of outer loop. (default 300)}
#' \item{inner_maxiter}{maximum number of iterations of inner loop. (default 500)}
#' \item{tol_outer}{tolerance of stopping criteria of outer loop. (default 0.001)}
#' \item{tol_inner}{tolerance of stopping criteria of inner loop. (default tol_outer/1000)}
#' \item{miniter}{minimum number of iterations of outer loop. (default 30)}
#' \item{tol_miniter}{minimum relative improvement required for each outer iteration after miniter. (default 0.001)}
#' \item{calpha}{increment factor of gamma: gamma^(t+1)=calpha*gamma^(t). (default 10)}
#' \item{incl_subw}{TRUE/FALSE. If TRUE, SPICER includes the full N x M matrix of alpha coefficients in the output. Default FALSE.}
#' \item{display}{verbosity: 1: no progress messages, 2 (default): outer loop progress messages, 3: inner and outer loop progress messages.}
#' }
#' @return A SPICER model with the following components:
#' \describe{
#' \item{comb_alpha}{N x 1 coefficient vector.}
#' \item{kern_weight}{1 x M kernel weight vector, scaled to sum to 1}
#' \item{bias}{bias term}
#' \item{activeset}{indices of kernels that are active ({m : kern_weight[m] is not zero}).}
#' \item{sorted_kern_weight}{vector of non-zero kernel weights sorted by magnitude, scaled to sum to 1.}
#' \item{opt}{list of SPICER options used in run.}
#' \item{history}{contains history of primal objective, dual objective, number of active kernels, and duality gap.}
#' \item{kern_alpha}{If incl_subw is TRUE, the NxM matrix of alpha coefficients.}
#' }
#' @references
#' \itemize{
#' \item V. Uzunangelov, C. K. Wong, and J. Stuart. Highly Accurate Cancer Phenotype Prediction with AKLIMATE, a Stacked Kernel Learner Integrating Multimodal Genomic Data and Pathway Knowledge. bioRxiv, July 2020.
#' \item Suzuki, T. and Tomioka, R. SpicyMKL: a fast algorithm for Multiple Kernel Learning with thousands of kernels. Mach Learn (2011) 85:77–108.
#' }
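#' @examples
#' \dontrun{
#' ## Minimal usage sketch (not run). Assumes `K` is an N x N x M array of kernel
#' ## Gram matrices with a named third dimension and `y` is a factor of length N.
#' fit <- spicer(K, y, C = 1, opt = list(loss = "logit", display = 1))
#' fit$sorted_kern_weight  # non-zero kernel weights, scaled to sum to 1
#' fit$bias                # bias term
#' ## elastic net regularization, penalty C[1]*|x| + (C[2]/2)*x^2
#' fit_en <- spicer(K, y, C = c(1, 0.5), opt = list(regname = "elasticnet"))
#' }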
#' @export
spicer <- function(K, yapp, C, opt = list()) {
if (is.factor(yapp) && length(levels(yapp)) > 2) {
res <- spicer.multiclass(K, yapp, C, opt)
} else {
res <- spicer.default(K, yapp, C, opt)
}
return(res)
}
####################################################
spicer.multiclass <- function(K, yapp, C, opt = list()) {
opt$loss <- "logit"
lvls <- levels(yapp)
combos <- combn(lvls, 2)
## fit one binary SPICER model for each pair of class labels (one-vs-one); requires the foreach package for %do%
res <- foreach(i = 1:ncol(combos)) %do% {
idx <- yapp %in% combos[, i]
spicer.default(K[idx, idx, , drop = FALSE],
factor(yapp[idx], levels = combos[, i]), C, opt)
}
## aggregate kernel weights across all pairwise models
cw <- rep(0, dim(K)[3])
names(cw) <- dimnames(K)[[3]]
for (i in seq_along(res)) {
cw[names(res[[i]]$sorted_kern_weight)] <- cw[names(res[[i]]$sorted_kern_weight)] + res[[i]]$sorted_kern_weight
}
cw <- cw[cw > 0]
cw <- sort(cw, decreasing = TRUE)/sum(cw)
## set attribute of res to the overall kernel weights
res <- structure(res, sorted_kern_weight = cw)
class(res) <- c("spicer", class(res))
return(res)
}
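## A minimal sketch (not run) of how a multiclass fit is organized, assuming `fit` came from spicer()
## on a factor with more than two levels: the list elements are the one-vs-one binary models and the
## aggregated kernel weights are stored as an attribute.
## fit <- spicer(K, y_multiclass, C = 1)
## fit[[1]]$sorted_kern_weight        # weights from the first pairwise model
## attr(fit, "sorted_kern_weight")    # overall weights aggregated across all pairs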
#######################################################
spicer.default <- function(K, yapp, C, opt=list()) {
if (is.null(opt$loss)) opt$loss <- if (length(unique(yapp)) == 2) "logit" else "square"
if (is.null(opt$regname))
opt$regname <- "l1"
if (is.null(opt$optname))
opt$optname <- "Newton" ## choices are Newton and BFGS
## if C has two components (elastic net) but the second one is zero, revert to l1 regularization
if (length(C) == 2 && C[2] == 0) {
opt$regname <- "l1"
C <- C[1]
}
## convert labels to -1,1
if (opt$loss %in% c("logit")) {
## change a factor to a numeric vector
if (is.factor(yapp)) {
lyapp <- levels(yapp)
yapp <- as.numeric(yapp)
}
## labels were numbers represented as character strings
if (is.character(yapp))
yapp <- as.numeric(yapp)
uniqYapp <- sort(unique(yapp))
yapp[yapp == uniqYapp[1]] <- -1
yapp[yapp == uniqYapp[2]] <- 1
opt$classes <- if (exists("lyapp"))
lyapp else uniqYapp
}
if (is.null(opt$tol_outer))
opt$tol_outer <- 0.001
if (is.null(opt$tol_inner))
opt$tol_inner <- opt$tol_outer/1000
if (is.null(opt$outer_maxiter))
opt$outer_maxiter <- 300
if (is.null(opt$inner_maxiter))
opt$inner_maxiter <- 500
if (is.null(opt$calpha))
opt$calpha <- 10
if (is.null(opt$stop_ineq_violation))
opt$stop_ineq_violation <- FALSE
if (is.null(opt$stop_duality_gap))
opt$stop_duality_gap <- TRUE
if (is.null(opt$miniter))
opt$miniter <- 30
if (is.null(opt$tol_miniter))
opt$tol_miniter <- 0.001
if (is.null(opt$display))
opt$display <- 2
if (is.null(opt$incl_subw))
opt$incl_subw <- FALSE
opt$C <- C
opt$nkern <- dim(K)[3]
## selection of regularization functions; expand() assigns the returned reg_func, reg_dual, prox and prox_deriv into this frame
expand(get_reg_funcs(opt$regname))
## set up trajectory tracking and some defaults
history <- data.frame(primalobj = double(), dualobj = double(), numActiv = integer(), dualityGap = double(), elapsedTime = double(), stringsAsFactors = FALSE)
nowTime <- Sys.time()
oldTime <- nowTime
elpsdTime <- 0
N <- length(yapp)
M <- dim(K)[3]
oneN <- rep(1, N) #matrix(1,nrow=N,ncol=1)
oneM <- rep(1, M) #matrix(1,nrow=1,ncol=M)
## Lagrange multiplier for the equality constraint on the new variable z (rho in SpicyMKL, Suzuki & Tomioka 2011)
rho <- -yapp/2
## kernel weights divided by gamma (alpha/gamma in SpicyMKL Tomioka Suzuki JMLR 2011)
krnlWMod <- matrix(0, nrow = N, ncol = M)
## bias term initial estimate
cb <- 0
## under the heuristic below the elastic net penalty behaves much closer to a smooth one, so there is no need to start with a very small gamma value
if (opt$regname == "elasticnet" && C[2]/C[1] > 5/95) {
cgamma <- rep(1000, M) #matrix(1000,nrow=M,ncol=1)
cgammab <- 1000
} else {
cgamma <- rep(10, M) #matrix(10,nrow=M,ncol=1)
cgammab <- 1
}
if (opt$optname == "BFGS") {
hess <- diag(1, N)
}
## measure of KKT constraint violation
ck <- Inf
cbeta <- 1/2
## input to the proximal update for the kernel weights: alpha/gamma + rho in SpicyMKL (Suzuki & Tomioka 2011); note that N = length(yapp) = length(rho)
krnlWPrIn <- matrix(rho, N, M) + krnlWMod
expand(kernel_norm_cpp(K, krnlWPrIn), c("wdot", "normj"))
## this is the proximal of normj (not the proximal of krnlWPrIn, which goes into the krnlWMod update equation)
pr <- prox(normj * cgamma, C, cgamma)
activeset <- which(pr > 0)
## outer loop
for (l in 1:opt$outer_maxiter) {
## inner loop
for (step in 1:opt$inner_maxiter) {
sumrho <- sum(rho)
yrho <- yapp * rho
fval <- funceval(opt$loss, normj, yapp, rho, yrho, sumrho, cgamma, cgammab, cb, pr, C, reg_func)
grad <- gradient(opt$loss, yapp, yrho, rho, sumrho, cgamma, cgammab, cb, C, wdot, normj, pr, activeset)
switch(opt$optname, Newton = {
hess <- hessian(opt$loss, yapp, yrho, cgamma, cgammab, C, K, normj, wdot, pr, prox_deriv, activeset)
## find descent direction
## a previous version used:
##   desc <- tryCatch(-solve(hess, grad), error = function(err) {
##     ## if the matrix is ill-conditioned, add some regularization
##     as.vector(-qr.solve(hess + diag(1e-06, N, N), grad))
##   })
desc <- as.vector(-qr.solve(hess + diag(1e-07, N, N), grad))
names(desc) <- names(grad)
## check descent direction is valid
graddotd <- grad %*% desc
if (graddotd > 0) {
## revert to steepest descent
desc <- -grad
}
}, BFGS = {
desc <- as.vector(-hess %*% grad)
graddotd <- grad %*% desc
if (step > 1) {
if (graddotd >= 0) {
desc <- -grad
hess <- diag(1, N)
}
deltarho <- rho - prevrho
deltagrad <- grad - prevgrad
deltarhograd <- (deltagrad %*% deltarho)[, 1]
hessmult <- diag(1, N) - tcrossprod(deltagrad, deltarho)/deltarhograd
hess <- hessmult %*% hess %*% t(hessmult) + tcrossprod(deltagrad, deltagrad)/deltarhograd
}
prevrho <- rho
prevgrad <- grad
})
## store pre-line search values for later use
old <- list(fval = fval, grad = grad, rho = rho, normj = normj, wdot = wdot, krnlWPrIn = krnlWPrIn, activeset = activeset, yrho = yrho)
## update rho with a full step size
ss <- 1 #step size
rho <- old$rho + ss * desc
## special case for logit: confirm that yrho lies within the bounds required by the logit convex conjugate (see SpicyMKL, Suzuki & Tomioka 2011, eqn 16)
if (opt$loss == "logit") {
yrho <- rho * yapp
if (any(yrho <= -1 | yrho >= 0)) {
yd <- yapp * desc
ss_mod <- 0.99 * min(c((-1 - old$yrho[yd < 0])/yd[yd < 0], -old$yrho[yd > 0]/yd[yd > 0]))
ss <- min(ss, ss_mod)
## re-compute rho update with new step size
rho <- old$rho + ss * desc
}
}
## update the rest of the variables; this update is needed to get the initial base objective function value
krnlWPrIn <- matrix(rho, length(rho), M) + krnlWMod
expand(kernel_norm_cpp(K, krnlWPrIn), c("wdot", "normj"))
pr <- prox(normj * cgamma, C, cgamma)
activeset <- which(pr > 0)
sumrho <- sum(rho)
yrho <- yapp * rho
fval <- funceval(opt$loss, normj, yapp, rho, yrho, sumrho, cgamma, cgammab, cb, pr, C, reg_func)
## compute the step length in the descent direction via a backtracking line search (Armijo/Wolfe conditions); this could be moved to a separate
## function, but the calculation depends on too many arguments, some of them large, so the slight gain in modularity is not worth the performance hit
dir <- rho - old$rho
tmpActiveset <- union(activeset, old$activeset)
actdif <- setdiff(tmpActiveset, activeset)
if (length(actdif) > 0) {
old$krnlWPrIn[, actdif] <- matrix(old$rho, length(old$rho), length(actdif)) + krnlWMod[, actdif]
## need to update portions of old$wdot and old$normj; there might be a more efficient way to do this
tmp1 <- kernel_norm_cpp(K, old$krnlWPrIn, actdif)
old$wdot[, actdif] <- tmp1$wdot
old$normj[actdif] <- tmp1$normj
rm(tmp1)
}
## calculate normj and wdot including the directional component: intermediate calculations for obtaining normj(alpha + rho + dir*steplen)
## from normj(alpha + rho), the latter being what we had originally
dirNorm <- 0 * normj
dirDotWDot <- 0 * normj
dirNorm[tmpActiveset] <- dir %*% (wdot[, tmpActiveset] - old$wdot[, tmpActiveset])
dirDotWDot[tmpActiveset] <- dir %*% old$wdot[, tmpActiveset]
## Wolfe conditions
stepL <- 1
while (fval > old$fval + stepL * 0.01 * (dir %*% old$grad) && (grad %*% dir < 0.9 * old$grad %*% dir || stepL == 1) && stepL > 0) {
## while ( fval> old$fval + stepL*0.01*(dir%*%old$grad) && stepL > 0) {
stepL <- stepL/2
rho <- old$rho + stepL * dir
## all variables having rho as a component need to be updated prior to the funceval call
normj <- 0 * normj
## max is to avoid sqrt of (slightly) negative numbers
normj[tmpActiveset] <- sqrt(pmax(0, old$normj[tmpActiveset]^2 + 2 * stepL * dirDotWDot[tmpActiveset] + (stepL^2) * dirNorm[tmpActiveset]))
pr <- prox(normj * cgamma, C, cgamma)
fval <- funceval(opt$loss, normj, yapp, rho, yapp * rho, sum(rho), cgamma, cgammab, cb, pr, C, reg_func)
sumrho <- sum(rho)
yrho <- yapp * rho
activeset <- which(pr > 0)
grad <- gradient(opt$loss, yapp, yrho, rho, sumrho, cgamma, cgammab, cb, C, wdot, normj, pr, activeset)
}
## end of Wolfe conditions line search
## in case the line search cut the step length, update the rho-dependent variables not used in the step-size calculation
if (stepL != 1) {
activeset <- which(pr > 0)
krnlWPrIn <- matrix(rho, length(rho), M) + krnlWMod
wdot[, activeset] <- kernel_norm_cpp(K, krnlWPrIn, activeset)$wdot
sumrho <- sum(rho)
yrho <- yapp * rho
}
## if fval is unexpected, print max singular value of wdot
if (is.complex(fval)) {
stop(paste0("Evaluation of the dual resulted in imaginary numbers. The largest singular value of wdot is ", round(norm(wdot, "2"), digits = 3)))
}
if (opt$display >= 3) {
write(paste0("inner iter: ", step, " fval: ", fval, " step length: ", stepL), stderr())
}
if (sqrt((old$rho - rho) %*% (old$rho - rho))/sqrt(old$rho %*% old$rho) <= opt$tol_inner) {
break
}
if (step == opt$inner_maxiter) {
warning(paste0("Inner optimization did not converge to an optimal solution for rho. Increase iterations to more than ", opt$inner_maxiter),
immediate. = TRUE)
return(list(comb_alpha = numeric(), kern_weight = numeric(), sorted_kern_weight = numeric(), bias = numeric(), activeset = numeric(), history = history,
opt = opt))
}
}
## end of inner loop (finding optimal value of rho)
## primal variable update
krnlWMod <- krnlWMod * 0
if (length(activeset) > 0) {
## this follows from the discussion in section 4.2 of SpicyMKL (Suzuki & Tomioka 2011); see the accompanying notes for the relevant derivations
krnlWMod[, activeset] <- krnlWPrIn[, activeset] * matrix(((pr[activeset]/normj[activeset])/cgamma[activeset]), N, length(activeset), byrow = TRUE)
}
cb <- cb + cgammab * sumrho
## Duality Gap calculations #######################
## these are the actual vectors of kernel weights now (alpha in SpicyMKL, Suzuki & Tomioka 2011)
krnlW <- -krnlWMod * matrix(cgamma, N, M, byrow = TRUE)
krnlWNorm <- kernel_norm_cpp(K, krnlW, activeset)$normj
## see second paragraph of section 6.1 in SpicyMKL Tomioka Suzuki JMLR 2011
rhoMod <- rho - oneN * (sum(rho)/N)
rhoNorm <- kernel_norm_cpp(K, matrix(rhoMod, N, M), activeset)$normj
if (length(activeset) > 0 && opt$regname == "l1") {
## see second paragraph of section 6.1 in SpicyMKL Tomioka Suzuki JMLR 2011
rhoMod <- rhoMod * min(1, C[1]/max(rhoNorm))
rhoNorm <- rhoNorm * min(1, C[1]/max(rhoNorm))
}
primalArg <- rep(-cb, N)
for (i in activeset) {
primalArg <- primalArg + K[, , i] %*% krnlW[, i]
}
primalVal <- primal_obj(opt$loss, yapp, primalArg, length(activeset), reg_func, krnlWNorm, C)
dualVal <- dual_obj(opt$loss, yapp, rhoMod, rhoNorm, reg_dual, C)
dualGap <- if (is.infinite(primalVal))
Inf else abs(primalVal - dualVal)/abs(primalVal)
nowTime <- Sys.time()
elpsdTime <- elpsdTime + as.numeric(nowTime - oldTime)
oldTime <- nowTime
## update history data object
history <- rbind(history, setNames(list(primalVal, dualVal, length(activeset), dualGap, elpsdTime), names(history)))
if (opt$display > 1) {
write(paste0("outer iter: ", l, " primal: ", primalVal, " dual: ", dualVal, " duality_gap: ", dualGap), stderr())
}
## Duality Gap stopping criterion#######################
if (dualGap < opt$tol_outer && opt$stop_duality_gap) {
break
}
## end Duality Gap stopping ###########################
if (opt$stop_duality_gap && nrow(history) > opt$miniter && (history[l - opt$miniter, "dualityGap"] - dualGap)/history[l - opt$miniter, "dualityGap"] <
opt$tol_miniter) {
warning("The duality gap has been closing very slowly, indicating slow convergence. You should examine your kernels for multicollinearity and/or change the regularization parameters. Alternatively, you can increase miniter or decrease tol_miniter.",
immediate. = TRUE)
return(list(comb_alpha = numeric(), kern_weight = numeric(), sorted_kern_weight = numeric(), bias = numeric(), activeset = numeric(), history = history,
opt = opt))
}
## End of Duality Gap calculations###################
## KKT stopping criterion #######################
## Also update the Augmented Lagrangian coefficients (cgamma and cgammab) ###
krnlWPrInKKT <- krnlWPrIn
if (length(activeset) > 0) {
krnlWPrInKKT[, activeset] <- krnlWPrIn[, activeset] * matrix(pmin(1, C[1]/normj[activeset]), N, length(activeset), byrow = TRUE)
}
## in the MATLAB code this was hresid <- (normj*cgamma - pr).^2, i.e. hresid measures, for each kernel, how far the proximal of the norm of its
## weights is from the actual norm
hresid <- sqrt(colSums((krnlWPrInKKT - matrix(rho, N, M))^2))
maxgap <- 0
## tmp is how much cgammab is changing from one iteration to the next; it is used to determine how quickly to ramp up cgammab
tmp <- abs(sumrho)
## determine how quickly to adjust cgammab (the augmented Lagrangian coefficient of cb)
big_cb <- (tmp > cbeta * ck)
maxgap <- max(tmp, maxgap)
## compute which residuals are large enough to require faster adjustment of the augmented Lagrangian coefficient cgamma for the respective kernel norm
largeRes <- which(hresid > cbeta * ck)
maxgap <- max(max(abs(hresid)), maxgap)
if (opt$display >= 2 && opt$stop_ineq_violation) {
write(paste0("outer iter: ", l, " maxgap: ", maxgap, " ck: ", ck), stderr())
}
if (maxgap <= ck) {
ck <- maxgap
}
if (ck < opt$tol_outer && opt$stop_ineq_violation) {
break
}
## end KKT stopping ###########################
## update cgammab according to how close we are to an optimal cgammab, as determined by the size of the sumrho update relative to the size of the gaps
## between the kernel norm proximals and the actual kernel norms. A large update means we are still taking big steps (i.e. we can move faster towards
## removing the AL term); smaller updates mean we are still moving in the vicinity of the current solution, possibly because of a nearby non-smooth
## region, so we should maintain higher resolution in our AL elimination
cgammab <- ifelse(big_cb, opt$calpha * cgammab, opt$calpha + cgammab)
## update cgammas
cgamma[largeRes] <- opt$calpha * cgamma[largeRes]
krnlWMod[, largeRes] <- krnlWMod[, largeRes]/opt$calpha
## smallRes kernel norms are the complement of the largeRes ones
smallRes <- setdiff(1:M, largeRes)
if (length(smallRes) > 0) {
cgamma[smallRes] <- opt$calpha + cgamma[smallRes]
krnlWMod[, smallRes] <- krnlWMod[, smallRes] * matrix((cgamma[smallRes] - opt$calpha)/cgamma[smallRes], N, length(smallRes), byrow = TRUE)
}
## End update Augmented Lagrangian coefficients (cgamma and cgammab) ###
## update all main variables
krnlWPrIn <- matrix(rho, N, M) + krnlWMod
expand(kernel_norm_cpp(K, krnlWPrIn), c("wdot", "normj"))
pr <- prox(normj * cgamma, C, cgamma)
activeset <- which(pr > 0)
## make sure user knows if algorithm exceeded outer_maxiter
if (l == opt$outer_maxiter) {
warning(paste0("Outer optimization did not converge to an optimal solution for alpha and b. Increase iterations to more than ", opt$outer_maxiter),
immediate. = TRUE)
return(list(comb_alpha = numeric(), kern_weight = numeric(), sorted_kern_weight = numeric(), bias = numeric(), activeset = numeric(), history = history,
opt = opt))
}
}
## end of outer loop
tmp <- rep(0, M)
tmp[activeset] <- pr[activeset]/normj[activeset]
kern_weight <- tmp/(1 - (tmp/cgamma))
sum_k_weight <- sum(kern_weight)
if (sum_k_weight != 0) {
kern_weight <- kern_weight/sum_k_weight
}
## rho is rescaled so that the scaling of the kernel weights is negated; the model can then be used directly, without worrying about the kernel weight
## scaling (which is really only for visualization / ease of interpretation) - see the prediction sketch after this function
comb_alpha <- -rho * sum_k_weight
bias <- -cb
krnlW <- krnlWMod[, activeset] * matrix(cgamma[activeset], N, length(activeset), byrow = TRUE)
rownames(krnlW) <- names(rho)
colnames(krnlW) <- dimnames(K)[[3]][activeset]
names(comb_alpha) <- names(rho)
names(kern_weight) <- dimnames(K)[[3]]
names(activeset) <- dimnames(K)[[3]][activeset]
sorted_kern_weight <- sort(kern_weight[activeset], decreasing = TRUE)
res <- list(comb_alpha = comb_alpha, kern_weight = kern_weight, sorted_kern_weight = sorted_kern_weight, bias = bias, activeset = activeset, idx.train = dimnames(K)[[1]],
history = history, opt = opt)
class(res) <- c("spicer", class(res))
if (opt$incl_subw)
res$kern_alpha <- krnlW
return(res)
}
## end of Spicer function
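## A minimal prediction sketch (not run), assuming `fit` is a binary SPICER model and `Ktest` is an
## Ntest x Ntrain x M array of test-vs-training kernel values over the same M kernels used in fitting.
## Because comb_alpha is already rescaled (see above), the decision values are
##   f(x) = sum_m kern_weight[m] * Ktest[, , m] %*% comb_alpha + bias
## decision <- fit$bias + Reduce(`+`, lapply(seq_len(dim(Ktest)[3]), function(m)
##   fit$kern_weight[m] * (Ktest[, , m] %*% fit$comb_alpha)))
## pred_class <- fit$opt$classes[ifelse(decision > 0, 2, 1)]  # classes were mapped to -1/+1 in that order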
## Spicer Helpers#############################
## Evaluates the augmented dual proximal objective (in fact its negative, since we are minimizing)
funceval <- function(loss, normj, yapp, rho, yrho, sumrho, cgamma, cgammab, cb, pr, C, reg_func) {
val <- switch(loss,
logit = logit_loss(yrho),
square = square_loss(yapp, rho))
## Proposition 1 (eqn 23) in SpicyMKL (Suzuki & Tomioka 2011) is used to convert the Moreau envelope of the convex conjugate into the Moreau envelope of
## norm(alpha + gamma*rho)^2/2 minus the Moreau envelope of the regularization function; the latter can then be evaluated at prox(norm(alpha + gamma*rho)),
## as in Boyd's Proximal Algorithms monograph, section 3.1, right before eqn (3.2)
val <- val - (sum(reg_func(pr, C)) + sum(pr^2/(2 * cgamma)) - normj %*% pr)
val <- val + cgammab * sumrho^2/2 + cb * sumrho
return(val)
}
## Evaluates gradient of the augmented dual proximal objective function
gradient <- function(loss, yapp, yrho, rho, sumrho, cgamma, cgammab, cb, C, wdot, normj, pr, activeset) {
val <- switch(loss,
logit = logit_grad(yrho, yapp),
square = square_grad(rho, yapp))
for (i in activeset) {
val <- val + wdot[, i] * (pr[i]/normj[i])
}
val <- val + (cgammab * sumrho + cb)
return(val)
}
## Evaluates the Hessian of the augmented dual proximal objective function
hessian <- function(loss, yapp, yrho, cgamma, cgammab, C, K, normj, wdot, pr, prox_deriv, activeset) {
val <- switch(loss,
logit = logit_hess(yrho),
square = square_hess(length(yapp)))
dpr <- prox_deriv(normj * cgamma, C, cgamma)
w1 <- pr/normj
w2 <- (cgamma * dpr * normj - pr)/(normj^3)
for (i in activeset) {
val <- val + w1[i] * K[, , i]
val <- val + w2[i] * (wdot[, i] %*% t(wdot[, i]))
}
val <- val + cgammab
return(val)
}
dual_obj <- function(loss, yapp, rho, rhoNorm, reg_dual, C) {
dual <- switch(loss,
logit = -logit_loss(yapp * rho) - sum(reg_dual(rhoNorm, C)),
square = -square_loss(yapp, rho) - sum(reg_dual(rhoNorm, C)))
return(dual)
}
## Primal Objective Functions ###########################
## z here has the opposite sign to the MATLAB code (but the theoretically correct one); the opposite sign
## is compensated for by negating z in the argument pre-processing
primal_obj <- function(loss, yapp, z, nactive, reg_func, krnlWNorm, C) {
primal <- switch(loss,
logit = sum(log(1 + exp(-yapp * z))),
square = sum((yapp - z)^2) * 0.5)
if (nactive > 0) {
primal <- primal + sum(reg_func(krnlWNorm, C))
}
return(primal)
}
## for this logit implementation, -yrho needs to stay within (0,1), so
## yrho needs to stay within (-1,0) (all open intervals!)
check_yrho <- function(yrho) {
## this is to make sure yrho falls within the required constraint
## -1 < yrho < 0,
## which is the negated form of the actual constraint 0 < -yrho < 1
pmin(pmax(yrho, -1 + 1e-07), -1e-07)
}
## Loss functions ###################################################
## Dual Loss Functions ##############################
## All functions below relate to the dual of the loss function (aka its convex conjugate): *_loss evaluates the dual of the respective loss,
## *_grad evaluates the gradient of the dual of the loss, and *_hess evaluates the Hessian of the dual of the loss. The dual variable is rho,
## the Lagrange multiplier of the K*alpha + beta (pseudo) constraint; it is N x 1 (i.e. rho is per-sample, computed across all kernels)
## Logit Loss#######################################
## aka dual/convex conjugate of the logit loss; see table 1 in Tomioka 2009 - Dual Augmented Lagrangian, and Suzuki & Tomioka 2011 SpicyMKL eqn (16).
## The difference is that the negative sign of rho is accounted for in the function calculation
logit_loss <- function(yrho) {
yrho<-check_yrho(yrho)
loss <- sum((1 + yrho) * log(1 + yrho) - yrho * log(-yrho))
return(loss)
}
## different from table 1 in Tomioka 2009 - Dual Augmented Lagrangian, again because of the substitution of rho for negative rho; also note that this
## is technically -logit_grad, even with the variable substitution!
logit_grad <- function(yrho, yapp) {
yrho<-check_yrho(yrho)
grad <- yapp * log((1 + yrho)/(-yrho))
return(grad)
}
## remember, the logit uses binary {-1,1} y labels, so the y^2 term that should have been in the numerator disappears!!
logit_hess <- function(yrho) {
hess <- diag(1/(-yrho * (1 + yrho)))
return(hess)
}
## SVM Loss ###################################
## see Suzuki & Tomioka 2011 SpicyMKL eqn (17)
svm_loss <- function(yrho) {
loss <- sum(yrho)
return(loss)
}
## Square Loss################
square_loss <- function(yapp, rho) {
loss <- 0.5 * rho %*% rho + rho %*% yapp
return(loss)
}
square_grad <- function(rho, yapp) {
grad <- rho + yapp
return(grad)
}
square_hess <- function(N) {
hess <- diag(nrow = N)
return(hess)
}
## Regularization terms ############################################
## the proximal operators are vectorized
## regularization function factory######
get_reg_funcs <- function(regname = c("l1", "elasticnet")) {
regname <- match.arg(regname)
switch(regname,
l1 = list(reg_func = l1_reg, reg_dual = l1_dual, prox = l1_prox, prox_deriv = l1_prox_deriv),
elasticnet = list(reg_func = elas_reg, reg_dual = elas_dual, prox = elas_prox, prox_deriv = elas_prox_deriv))
}
## L1-norm#######################################
l1_reg <- function(x, C) {
reg <- C * abs(x)
return(reg)
}
## aka convex conjugate; see Suzuki & Tomioka (2011) SpicyMKL, eqn (21)
l1_dual <- function(x, C) {
regDual <- rep(0, length(x))
regDual[x > C] <- Inf
return(regDual)
}
## see table 2 in Tomioka 2009 - Dual Augmented Lagrangian; can also be derived with the Moreau decomposition and projection onto the Linf-norm ball - see
## Boyd's Proximal Algorithms monograph, section 6.5.1
l1_prox <- function(x, C, eta) {
prox <- pmax(x - C * eta, 0)
return(prox)
}
## derivative of l1 proximal function
l1_prox_deriv <- function(x, C, eta) {
proxDeriv <- as.numeric(x > C * eta)
return(proxDeriv)
}
## Elastic Net Regularization##################################
elas_reg <- function(x, C) {
reg <- C[1] * abs(x) + (C[2]/2) * (x^2)
return(reg)
}
## aka convex conjugate; see Suzuki & Tomioka (2011) SpicyMKL, eqn (46)
elas_dual <- function(x, C) {
regDual <- (0.5/C[2]) * (x^2 - 2 * C[1] * abs(x) + C[1]^2) * (x > C[1])
return(regDual)
}
## see Boyd's Proximal Algorithms monograph, section 6.5.3
elas_prox <- function(x, C, eta) {
prox <- pmax(x - C[1] * eta, 0)/(1 + C[2] * eta)
return(prox)
}
elas_prox_deriv <- function(x, C, eta) {
proxDeriv <- (1/(1 + C[2] * eta)) * (x > C[1] * eta)
return(proxDeriv)
}
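## A minimal sketch (not run) illustrating the vectorized proximal operators above. With the l1 penalty,
## l1_prox soft-thresholds the kernel norms, so any norm at or below C*eta is driven exactly to zero;
## elas_prox additionally shrinks the surviving norms by 1/(1 + C[2]*eta). The numbers are illustrative only.
## funcs <- get_reg_funcs("l1")
## funcs$prox(c(0.2, 1.5, 3), C = 1, eta = 1)       # -> 0.0 0.5 2.0
## en <- get_reg_funcs("elasticnet")
## en$prox(c(0.2, 1.5, 3), C = c(1, 0.5), eta = 1)  # -> 0.0000 0.3333 1.3333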
#####################################################################