R/funcs.R

Defines functions hierNet print.hierNet print.hierNet.path print.hierNet.cv hierNet.path predict.hierNet predict.hierNet.path admm4 Objective Objective.logistic compute.interactions.c compute.full.interactions.c Compute.yhat.c Compute.phat.c ggdescent.c hierNet.logistic admm4.logistic ggdescent.logistic ADMM4.Lagrangian predict.hierNet.logistic critf.logistic twonorm balanced.folds permute.rows hierNet.cv plot.hierNet.cv error.bars hierNet.varimp

Documented in compute.interactions.c critf.logistic hierNet hierNet.cv hierNet.logistic hierNet.path hierNet.varimp Objective Objective.logistic plot.hierNet.cv predict.hierNet predict.hierNet.logistic predict.hierNet.path print.hierNet print.hierNet.cv print.hierNet.path

hierNet <- function(x, y, lam, delta=1e-8, strong=FALSE, diagonal=TRUE, aa=NULL, zz=NULL, center=TRUE, stand.main=TRUE, stand.int=FALSE, 
                    rho=nrow(x), niter=100, sym.eps=1e-3,
                    step=1, maxiter=2000, backtrack=0.2, tol=1e-5,
                    trace=0) {
  # Main Hiernet function for fitting at a single parameter lambda.
  # Note: L1 penalty terms have parameter lam.l1 = lambda * (1-delta)
  #       and L2 penalty has parameter lam.l2 = lambda * delta.
  #
  # stand.main and stand.int refer to scaling
  stopifnot(nrow(x) == length(y), lam >= 0, delta >= 0, delta <= 1)
  stopifnot(!is.null(step) && !is.null(maxiter))
  if (strong) stopifnot(!is.null(niter))
  stopifnot(class(y) == "numeric")
  stopifnot(class(lam) == "numeric")
  stopifnot(class(delta) == "numeric")
  stopifnot(class(step) == "numeric", step > 0, maxiter > 0)
  stopifnot(is.finite(x), is.finite(y), is.finite(lam), is.finite(delta))
  this.call <- match.call()

  if (!center) cat("WARNING: center=FALSE should almost never be used.  This option is available for special uses only.", fill=TRUE)
  # center and (maybe) scale variables
  x <- scale(x, center=center, scale=stand.main)
  mx <- attr(x, "scaled:center")
  sx <- attr(x, "scaled:scale") # may be NULL
  if (center) {
    my <- mean(y)
    y <- y - my
  } else my <- NULL
  
  if (is.null(zz)) {
    if (trace > 0) cat("Computing zz...", fill=TRUE)
    zz <- compute.interactions.c(x, diagonal=diagonal)
  }
  if (is.matrix(zz)) {
    zz <- scale(zz, center=center, scale=stand.int)
    mzz <- attr(zz, "scaled:center")
    szz <- attr(zz, "scaled:scale") # may be NULL
    zz <- as.numeric(zz)
  } else {
    mzz <- szz <- NULL
    #cat("Provided zz is not a matrix, so it's assumed to be already centered.", fill=TRUE)
  }

  xnum <- as.numeric(x)
  p <- ncol(x)
  lam.l1 <- lam * (1 - delta)
  lam.l2 <- lam * delta
  if (strong) {
    # strong hierarchy -- use ADMM4
    if (is.null(rho)) rho <- as.numeric(nrow(x))
    stopifnot(is.numeric(rho), is.finite(rho))
    aa <- admm4(x, xnum, y, lam.l1=lam.l1, lam.l2=lam.l2, diagonal=diagonal, zz=zz,
                 rho=rho, niter=niter, aa=aa, sym.eps=sym.eps, # ADMM params
                 stepsize=step, backtrack=backtrack, maxiter=maxiter, tol=tol, # GG params
                 trace=trace)
    # lack of symmetry in theta means that sometimes strong hierarchy will be (very slightly violated)
    ii <- aa$bp + aa$bn == 0
    # note aa$th[ii, ] = 0 since weak hierarchy holds for sure
    if (sum(ii) > 0 & sum(ii) < p) {
      thr <- max(abs(aa$th[!ii, ii]))
      if (thr > 0) {
        cat("  thr = ",thr, fill=TRUE)
        if (thr > 1e-3)
          warning("Had to change ADMM's 'th' by more than 0.001 to make strong hier hold! Increase niter (and/or rho). ")
        aa$th[abs(aa$th) <= thr] <- 0
      }
    }
  } else {
    # weak hierarchy -- a single call to generalized gradient descent
    if (is.null(aa)) {
      aa <- list(th=matrix(0, p, p), bp=rep(0, p), bn=rep(0, p))
    } else {
      stopifnot(dim(aa$th) == c(p,p), length(aa$bp) == p, length(aa$bn) == p)
    }
    # this could be improved by not actually creating V...
    V <- matrix(0, p, p)
    rho <- 0
    aa <- ggdescent.c(x=x, xnum=xnum, zz=zz, y=y, lam.l1=lam.l1, lam.l2=lam.l2, diagonal=diagonal,
       	  	      rho=rho, V=V,
                      stepsize=step, backtrack=backtrack, maxiter=maxiter, tol=tol,
                      aa=aa, trace=trace)
  }

  aa$lam <- lam
  aa$delta <- delta
  aa$type <- "gaussian"
  aa$diagonal <- diagonal
  aa$strong <- strong
  aa$obj <- Objective(aa=aa, x=x, y=y, lam.l1=lam.l1, lam.l2=lam.l2, xnum=xnum, zz=zz, strong=strong)
  aa$step <- step
  aa$maxiter <- maxiter
  aa$backtrack <- backtrack
  aa$tol <- tol
  if (strong) {
    # ADMM parameters:
    aa$rho <- rho
    aa$niter <- niter
    aa$sym.eps <- sym.eps
  }
  aa$mx <- mx
  aa$sx <- sx
  aa$my <- my
  aa$mzz <- mzz
  aa$szz <- szz
  aa$call <- this.call
  class(aa) <- "hierNet"
  return(aa)
}

print.hierNet <- function(x, ...) {
  cat("Call:\n")
  dput(x$call)
  th=(x$th+t(x$th))/2
  o2=colSums(th^2)!=0
  b=x$bp-x$bn
  o=b!=0
  b=b[o]
  if (any(o2)) {
    # model has interactions
    th=th[o,o2,drop=FALSE]
    tight <- rowSums(abs(th)) >= x$bp[o] + x$bn[o] - 1e-9
    tt <- rep("", length(tight))
    tt[tight] <- "*"
    mat=cbind(b,th)
    mat=round(mat,4)
    mat <- cbind(mat, tt)
    cat("\n")
    cat("Non-zero coefficients:",fill=T)
    cat("  (Rows are predictors with nonzero main effects)",fill=T)
    cat("  (1st column is main effect)", fill=T)
    cat("  (Next columns are nonzero interactions of row predictor)", fill=T)
    cat("  (Last column indicates whether hierarchy constraint is tight.)",fill=T)
    cat("\n")
    dimnames(mat)=list(as.character(which(o)),c("Main effect",as.character(which(o2)),"Tight?"))
    print(mat, quote = FALSE)
  } else {
    mat <- matrix(round(b,4), length(b), 1)
    cat("\n")
    cat("Non-zero coefficients:",fill=T)
    cat("  (No interactions in this model)",fill=T)
    cat("\n")
    dimnames(mat)=list(as.character(which(o)),"Main effect")
    print(mat, quote = FALSE)    
  }
  invisible()
}

print.hierNet.path <- function(x, ...) {
  cat("Call:\n")
  dput(x$call)
  b=x$bp-x$bn
  mat=cbind(round(x$lam,2),round(x$obj,2),colSums(b!=0),apply(x$th!=0,3,function(a) sum(diag(a)) + sum((a+t(a)!=0)[upper.tri(a)])))

  dimnames(mat)=list(NULL,c("Lambda", "Objective", "Number of main effects","Number of interactions"))
  cat("\n")
  print(mat, quote = FALSE)
  invisible()
}

print.hierNet.cv <- function(x, ...) {
  cat("Call:\n")
  dput(x$call)
mat=cbind(round(x$lamlist,2),x$nonzero,round(x$cv.err,2),round(x$cv.se,2))

  dimnames(mat)=list(NULL,c("Lambda", "Number of nonzero","Mean CV error", "SE"))
 cat("\n")
  print(mat, quote = FALSE)
 cat("\n")
cat(c("lamhat=",round(x$lamhat,2),"lamhat.1se=",round(x$lamhat.1se,2)),fill=T)

  invisible()
}


hierNet.path <- function(x, y, lamlist=NULL, delta=1e-8, minlam=NULL, maxlam=NULL, nlam=20, flmin=.01,
                         diagonal=TRUE, strong=FALSE, aa=NULL, zz=NULL,
                         stand.main=TRUE, stand.int=FALSE,
                         rho=nrow(x), niter=100, sym.eps=1e-3,# ADMM params
                         step=1, maxiter=2000, backtrack=0.2, tol=1e-5, # GG descent params
                         trace=0) {
  # Main Hiernet function for fitting at a sequence of lambda values.
  # Note: L1 penalty terms have parameter lam.l1 = lambda * (1-delta)
  #       and L2 penalty has parameter lam.l2 = lambda * delta.
  #
  # Always centers both x and zz (unless zz is provided in as.numeric form)
  # stand.main and stand.int refer to whether main effects and interactions should have norm sqrt(n-1)
  
  # center and (maybe) scale variables
  this.call <- match.call()
  x <- scale(x, center=TRUE, scale=stand.main)
  mx <- attr(x, "scaled:center")
  sx <- attr(x, "scaled:scale") # may be NULL
  my <- mean(y)
  y <- y - my
  
  if (is.null(maxlam)) {
    if (!is.null(minlam)) stop("Cannot have maxlam=NULL if minlam is non-null.")
  #  maxlam <- max(abs(t(x) %*% y)/colSums(x^2))
    maxlam <- max(abs(t(x) %*% y))
  #  temp <- t(scale(t(x), center=FALSE, scale=1/y))
  #  temp2 <- apply(temp, 2, twonorm)
  #  maxlam <- max(max(temp2), maxlam)
    minlam <- maxlam * flmin
  }
  if (is.null(minlam)) minlam <- maxlam * flmin
  if (is.null(lamlist))
    lamlist <- exp(seq(log(maxlam),log(minlam),length=nlam))
  nlam <- length(lamlist)
  
  if (is.null(zz))
    zz <- compute.interactions.c(x, diagonal=diagonal)
  else
    stopifnot(is.matrix(zz))
  
  # center and (maybe) scale zz
  zz <- scale(zz, center=TRUE, scale=stand.int)
  mzz <- attr(zz, "scaled:center")
  szz <- attr(zz, "scaled:scale") # may be NULL
  
  zz <- as.numeric(zz)
  p <- ncol(x)
  cp2 <- choose(p, 2)
  bp <- bn <- matrix(NA, nrow=p, ncol=nlam)
  th <- array(NA, c(p, p, nlam))
  obj <- rep(NA, nlam)
  aa <- NULL
  for (i in seq(nlam)) {
    cat(c("i,lam=", i, round(lamlist[i],2)), fill=TRUE)
    aa <- hierNet(x, y, lam=lamlist[i], delta=delta, strong=strong, diagonal=diagonal, aa=aa, zz=zz,
                  stand.main=FALSE, stand.int=FALSE, # have already standardized
                  rho=rho, niter=niter, sym.eps=sym.eps,
                  step=step, maxiter=maxiter, backtrack=backtrack, tol=tol, trace=trace)
    bp[, i] <- aa$bp
    bn[, i] <- aa$bn
    th[, , i] <- aa$th
    obj[i] <- aa$obj
  }
  dimnames(bp) <- dimnames(bn) <- list(as.character(1:p), NULL)
  dimnames(th) <- list(as.character(1:p), as.character(1:p), NULL)

  out <- list(bp=bp, bn=bn, th=th, obj=obj, lamlist=lamlist, delta=delta, mx=mx, sx=sx, mzz=mzz, szz=szz, my=my,
              type="gaussian", diagonal=diagonal, strong=strong,
              step=step, maxiter=maxiter, backtrack=backtrack, tol=tol,    
              call=this.call)
  if (strong) {
    # ADMM parameters:
    out$rho <- rho
    out$niter <- niter
    out$sym.eps <- sym.eps
  }
  class(out) <- "hierNet.path"
  out
}

predict.hierNet <- function(object, newx, newzz=NULL, ...) {
  n <- nrow(newx)
  if (is.null(object$sx))
    newx <- scale(newx, center=object$mx, scale=FALSE)
  else
    newx <- scale(newx, center=object$mx, scale=object$sx)    
  if (is.null(newzz))
    newzz <- compute.interactions.c(newx, diagonal=object$diagonal)
  if (is.null(object$szz))
    newzz <- scale(newzz, center=object$mzz, scale=FALSE)
  else
    newzz <- scale(newzz, center=object$mzz, scale=object$szz)
  newzz <- as.numeric(newzz)
  newx <- as.numeric(newx)
  stopifnot(is.finite(newzz), is.finite(newx))
  if (class(object$bp) == "numeric")
    yhatt <- Compute.yhat.c(newx, newzz, object) + object$my
  else {
    nlam <- ncol(object$bp)
    yhat <- matrix(NA, n, nlam)
    # this could be made more efficient
    for (i in seq(nlam)) {
      bb <- list(bp=object$bp[, i], bn=object$bn[, i], th=object$th[, , i], diagonal=object$diagonal)
      yhat[, i] <- Compute.yhat.c(newx, newzz, bb)
    }
    yhatt <- yhat + object$my
  }
  if (object$type == "logistic") {
    # predict from hierNet.logistic object object
    b0 <- object$b0
    if(is.matrix(yhatt))
      b0 <- matrix(b0, nrow=nrow(yhatt), ncol=ncol(yhatt), byrow=T)
    yhatt <- b0 + yhatt
    pr <- 1 / (1 + exp(-yhatt))
    return(list(prob=pr, yhat=1*(pr>.5)))
  }
  return(yhatt)
}

predict.hierNet.path <- function(object, newx, newzz=NULL, ...){
 predict.hierNet(object, newx, newzz, ...)
}

admm4 <- function(x, xnum, y, lam.l1, lam.l2, diagonal, zz=NULL, rho, niter, aa=NULL, sym.eps=1e-3, trace=1, ...) {
  # Performs ADMM4.
  # Note: xnum is the matrix x as a numeric.  Both are passed to avoid having to call as.numeric too
  # many times.
  p <- ncol(x)
  if (is.null(zz)) {
    if (trace > 0) cat("Computing zz...", fill=TRUE)
    zz <- as.numeric(compute.interactions.c(x, diagonal=diagonal))
  }
  else if (class(zz) == "matrix") zz <- as.numeric(zz)
  if (is.null(aa)) {
    aa <- list(u=matrix(0, p, p),
               th=matrix(0, p, p),
               bp=rep(0, p),
               bn=rep(0, p),
               tt=matrix(0, p, p),
               diagonal=diagonal)
  } else {
    stopifnot(diagonal == aa$diagonal)
  }
  if (is.null(aa$tt) || is.null(aa$u)) {
    aa$tt <- 0.5 * (aa$th + t(aa$th))
    aa$u <- matrix(0, p, p)
  }
  obj <- Objective(aa=aa, x=x, y=y, lam.l1=lam.l1, lam.l2=lam.l2, xnum=xnum, zz=zz, strong=TRUE, sym.eps=sym.eps)
  ll <- NULL
  for (i in seq(niter)) {
    if (trace > 0) cat(i, " ")
    ll <- c(ll, ADMM4.Lagrangian(aa, xnum, zz, y, lam.l1=lam.l1, lam.l2=lam.l2, diagonal=diagonal, rho))
    V <- aa$u - rho * aa$tt
    gg <- ggdescent.c(x, xnum, zz, y, lam.l1=lam.l1, lam.l2=lam.l2, diagonal=diagonal, rho, V, trace=trace-1, aa=aa, ...)
    aa$th <- gg$th
    aa$bp <- gg$bp
    aa$bn <- gg$bn
    aa$tt <- (aa$th + t(aa$th)) / 2 + (aa$u + t(aa$u)) / (2 * rho)
    aa$u <- aa$u + rho * (aa$th - aa$tt)
    obj <- c(obj, Objective(aa=aa, x=x, y=y, lam.l1=lam.l1, lam.l2=lam.l2, xnum=xnum, zz=zz, strong=TRUE, sym.eps=sym.eps))
    if (trace > 0) cat(obj[i+1], fill=TRUE)
  }
  if (max(abs(aa$th-t(aa$th))) > sym.eps)
    cat("Attention: th not symmetric within the desired sym.eps.  Run ADMM for more iterations. And try increasing rho.")
  aa$obj <- obj
  aa$lagr <- ll
  aa
}

Objective <- function(aa, x, y, lam.l1, lam.l2, xnum=NULL, zz=NULL, strong=TRUE, sym.eps=1e-3) {
  # evaluates the NewYal objective at aa.
  if (strong) {
    if (max(aa$th-t(aa$th)) > sym.eps) {
      cat("Theta is not symmetric.", fill=TRUE)
      return(Inf)
    }
  }
  if (any(rowSums(abs(aa$th)) > aa$bp + aa$bn + 1e-5)) {
    cat("hierarchy violated.", fill=TRUE)
    return(Inf)
  }
  if (any(aa$bp < -1e-5)||any(aa$bn < -1e-5)) {
    cat("Non-negative of bp or bn violated.", fill=TRUE)
    return(Inf)
  }
  if (aa$diagonal == FALSE)
    if (any(abs(diag(aa$th)) > 1e-8)) {
      cat("Zero diagonal violated.", fill=TRUE)
      return(Inf)
    }
  if (is.null(zz)) {
    zz <- as.numeric(compute.interactions.c(x, diagonal=aa$diagonal))
  }
  if (is.null(xnum)) xnum <- as.numeric(x)
  r <- y - Compute.yhat.c(xnum, zz, aa)
  pen <- lam.l1 * sum(aa$bp + aa$bn) + lam.l1 * sum(abs(aa$th))/2 + lam.l1 * sum(abs(diag(aa$th)))/2
  pen <- pen + lam.l2 * (sum(aa$bp^2) + sum(aa$bn^2) + sum(aa$th^2))
  sum(r^2)/2 + pen
}

Objective.logistic <- function(aa, x, y, lam.l1, lam.l2, xnum=NULL, zz=NULL, strong=TRUE, sym.eps=1e-3) {
  # evaluates the logistic hiernet objective at aa.
  stopifnot(y %in% c(0,1))
  stopifnot("diagonal" %in% names(aa))
  if (aa$diagonal == FALSE)
    if (any(abs(diag(aa$th)) > 1e-8)) {
      cat("Diagonal of Theta is nonzero.", fill=TRUE)
      return(Inf)
    }
  if (strong) {
    if (max(aa$th-t(aa$th)) > sym.eps) {
      cat("Theta is not symmetric.", fill=TRUE)
      return(Inf)
    }
  }
  if (any(rowSums(abs(aa$th)) > aa$bp + aa$bn + 1e-5)) {
    cat("hierarchy violated.", fill=TRUE)
    return(Inf)
  }
  if (any(aa$bp < -1e5)||any(aa$bn < -1e5)) {
    cat("Non-negative of bp or bn violated.", fill=TRUE)
    return(Inf)
  }
  if (is.null(zz)) {
    zz <- as.numeric(scale(compute.interactions.c(x, diagonal=aa$diagonal), center=TRUE, scale=FALSE))
  }
  if (is.matrix(zz)) zz <- as.numeric(zz)
  if (is.null(xnum)) xnum <- as.numeric(x)
  phat <- Compute.phat.c(xnum, zz, aa)
  loss <- -sum(y*log(phat)) - sum((1-y)*log(1-phat))
  pen <- lam.l1 * sum(aa$bp + aa$bn) + lam.l1 * sum(abs(aa$th))/2 + lam.l1 * sum(abs(diag(aa$th)))/2
  pen <- pen + lam.l2 * (sum(aa$bp^2) + sum(aa$bn^2) + sum(aa$th^2))
  loss + pen
}

compute.interactions.c <- function(x, diagonal=TRUE) {
  # Returns (uncentered) n by cp2 matrix of interactions.
  # The columns of zz are in standard order (11), 12,13,14,...,(22),23,...
  # z's (jk)th column is x_j * x_k
  n <- nrow(x)
  p <- ncol(x)
  cp2 <- p * (p - 1) / 2
  if (diagonal) {
    cp2 <- cp2 + p
    out <- .C("ComputeInteractionsWithDiagWithIndices",
              as.double(x),
              as.integer(n),
              as.integer(p),
              z=rep(0, n * cp2),
              i1=as.integer(rep(0, cp2)),
              i2=as.integer(rep(0, cp2)), dupl = FALSE, PACKAGE="hierNet")
  }
  else {
    out <- .C("ComputeInteractionsWithIndices",
              as.double(x),
              as.integer(n),
              as.integer(p),
              z=rep(0, n * cp2),
              i1=as.integer(rep(0, cp2)),
              i2=as.integer(rep(0, cp2)), dupl = FALSE, PACKAGE="hierNet")
  }
  z <- matrix(out$z, n, cp2)
  rownames(z) <- rownames(x)
  if (is.null(colnames(x))) {
    colnames(z) <- paste(out$i1, out$i2, sep=":")
  }
  else {
    colnames(z) <- paste(colnames(x)[out$i1], colnames(x)[out$i2], sep=":")
  }
  z
}

compute.full.interactions.c <- function(x) {
  # Returns (uncentered) n by p^2 matrix of interactions.
  # The columns of zz are in standard order 11,12,13,14,...,23,...
  # z's (jk)th column is x_j * x_k
  n <- nrow(x)
  p <- ncol(x)
  out <- .C("ComputeFullInteractions",
            as.double(x),
            as.integer(n),
            as.integer(p),
            z=rep(0, n * p^2),
            dupl = FALSE, PACKAGE="hierNet")
  matrix(out$z, n, p^2)
}


Compute.yhat.c <- function(xnum, zz, aa) {
  # aa: list containing bp, bn, th, diagonal
  # note: zz is the n by cp2 matrix, whereas z is the n by p^2 one.
  p <- length(aa$bp)
  n <- length(xnum) / p
  stopifnot(n==round(n))
  stopifnot("diagonal" %in% names(aa))
  if (aa$diagonal) stopifnot(length(zz) == n * (choose(p,2) + p))
  else stopifnot(length(zz) == n * choose(p,2))

  out <- .C("compute_yhat_zz_R",
            xnum,
            as.integer(n),
            as.integer(p),
            zz,
            as.integer(aa$diagonal),
            as.double(aa$th),
            aa$bp,
            aa$bn,
            yhat=rep(0, n),
            DUP=FALSE, PACKAGE="hierNet")
  out$yhat
}

Compute.phat.c <- function(xnum, zz, aa) {
  # aa: list containing b0, bp, bn, th
  # note: zz is the n by cp2 matrix, whereas z is the n by p^2 one.
  stopifnot(c("b0","bp","bn","th","diagonal") %in% names(aa))
  p <- length(aa$bp)
  n <- length(xnum) / p
  if (is.matrix(xnum)) xnum <- as.numeric(xnum)
  stopifnot(n == round(n))
  if (aa$diagonal) stopifnot(length(zz) == n * (choose(p,2) + p))
  else stopifnot(length(zz) == n * choose(p,2))
  #void compute_phat_zz_R(double *x, int *n, int *p, double *zz, int *diagonal,
  #		       double *b0, double *th, double *bp, double *bn, double *phat) {
  out <- .C("compute_phat_zz_R",
            xnum,
            as.integer(n),
            as.integer(p),
            zz,
            as.integer(aa$diagonal),
            as.double(aa$b0),
            as.double(aa$th),
            aa$bp,
            aa$bn,
            phat=rep(0, n),
            DUP=FALSE, PACKAGE="hierNet")
  out$phat
}

ggdescent.c <- function(x, xnum, zz, y, lam.l1, lam.l2, diagonal, rho, V, stepsize, backtrack=0.2, maxiter=100,
                             tol=1e-5, aa=NULL, trace=1) {
  # See ADMM4 pdf for the problem this solves.
  # 
  # x, xnum, zz, y: data (note: zz is a length n*cp2 vector, not a matrix) xnum is x as a vector
  # lam.l1: l1-penalty parameter
  # lam.l2: l2-penalty parameter
  # rho: admm parameter
  # V: see ADMM4 pdf
  # stepsize: step size to start backtracking with
  # backtrack: factor by which step is reduced on each backtrack.
  # maxiter: number of generalized gradient steps to take.
  # tol: stop gg descent if change in objective is below tol.
  # aa: initial estimate of (th, bp, bn)
  # trace: how verbose to be
  #
  # void ggdescent_R(double *x, int *n, int *p, double *zz, int *diagonal, double *y, 
  #		     double *lamL1, double*lamL2, double *rho, double *V, int *maxiter, 
  #		     double *curth, double *curbp, double *curbn,
  #		     double *t, int *stepwindow, double *backtrack, double *tol, int *trace,
  #		     double *th, double *bp, double *bn) {
  n <- length(y)
  p <- ncol(x)
  stepwindow <- 10
  if (is.null(aa)) aa <- list(th=matrix(0,p,p), bp=rep(0,p), bn=rep(0,p))
  out <- .C("ggdescent_R",
            xnum,
            as.integer(n),
            as.integer(p),
            zz,
            as.integer(diagonal),
            y,
            as.double(lam.l1),
	    as.double(lam.l2),
            as.double(rho),
            as.double(V),
            as.integer(maxiter),
            as.double(aa$th),
            aa$bp,
            aa$bn,
            stepsize,
            as.integer(stepwindow),
            backtrack,
            tol,
            as.integer(trace),
            th=rep(0, p*p),
            bp=rep(0, p),
            bn=rep(0, p),
            DUP=FALSE, PACKAGE="hierNet")
  list(bp=out$bp, bn=out$bn, th=matrix(out$th, p, p))
}


hierNet.logistic <- function(x, y, lam, delta=1e-8, diagonal=TRUE, strong=FALSE, aa=NULL, zz=NULL, center=TRUE,
                             stand.main=TRUE, stand.int=FALSE,
                             rho=nrow(x), niter=100, sym.eps=1e-3,# ADMM params
                             step=1, maxiter=2000, backtrack=0.2, tol=1e-5, # GG descent params
                             trace=1) {
  # Solves the logistic regression hiernet. Returns (b0, bp, bn, th)
  this.call <- match.call()
  n <- nrow(x)
  p <- ncol(x)
  stopifnot(y %in% c(0,1))
  stopifnot(length(y) == n, lam >= 0, delta >= 0, delta <= 1)
  stopifnot(!is.null(step) && !is.null(maxiter))
  stopifnot(class(lam) == "numeric")
  stopifnot(class(delta) == "numeric")
  stopifnot(class(step) == "numeric", step > 0, maxiter > 0)
  stopifnot(is.finite(x), is.finite(y), is.finite(lam), is.finite(delta))
  lam.l1 <- lam * (1 - delta)
  lam.l2 <- lam * delta
  if (!center) 
    cat("WARNING: center=FALSE should almost never be used.  This option is available for special uses only.", fill = TRUE)
  x <- scale(x, center = center, scale = stand.main)
  mx <- attr(x, "scaled:center")
  sx <- attr(x, "scaled:scale")
  if (is.null(aa)) aa <- list(b0=0, bp=rep(0, p), bn=rep(0, p), th=matrix(0, p, p), diagonal=diagonal)
  if (is.null(zz)) {
    if (trace > 0) cat("Computing zz...", fill=TRUE)
    zz <- compute.interactions.c(x, diagonal=diagonal)
  }
  if (is.matrix(zz)) {
    zz <- scale(zz, center=center, scale=stand.int)
    mzz <- attr(zz, "scaled:center")
    szz <- attr(zz, "scaled:scale")
    zz <- as.numeric(zz)
  } else {
    mzz <- szz <- NULL
    #cat("Provided zz is not a matrix, so it's assumed to be already centered.", fill = TRUE)
  }
  xnum <- as.numeric(x)
  if (strong) {
    # strong hierarchy -- use ADMM4 (logistic regression version)
    stopifnot(is.numeric(rho), is.finite(rho))
    out <- admm4.logistic(x, xnum, y, lam.l1, lam.l2, diagonal=diagonal, zz=zz,
                          rho=rho, niter=niter, aa=aa, sym.eps=sym.eps, # ADMM params
                          stepsize=step, backtrack=backtrack, maxiter=maxiter, tol=tol, # GG params
                          trace=trace)
    ii <- out$bp + out$bn == 0
    # note out$th[ii, ] = 0 since weak hierarchy holds for sure
    sumii <- sum(ii)
    if (sumii > 0 && sumii < p) {
      thr <- max(abs(out$th[!ii, ii]))
      if (thr > 0) {
        cat("  thr = ",thr, fill=TRUE)
        if (thr > 1e-3)
          warning("Had to change ADMM's 'th' by more than 0.001 to make strong hier hold! Increase niter (and/or rho). ")
        aa$th[abs(aa$th) <= thr] <- 0
      }
    }
  } else {
    out <- ggdescent.logistic(xnum=xnum, zz=zz, y=y, lam.l1=lam.l1, lam.l2=lam.l2, diagonal=diagonal, rho=0, V=matrix(0,p,p),
                                  stepsize=step, backtrack=backtrack, maxiter=maxiter,
                                  tol=tol, aa=aa, trace=trace)
  }
  out$call <- this.call
  out$lam <- lam
  out$delta <- delta
  out$type <- "logistic"
  out$diagonal <- diagonal
  out$strong <- strong
  if (strong) {
    # ADMM parameters:
    out$rho <- rho
    out$niter <- niter
    out$sym.eps <- sym.eps
  }
  out$step <- step
  out$maxiter <- maxiter
  out$backtrack <- backtrack
  out$tol <- tol
  out$obj <- critf.logistic(x, y, lam.l1, lam.l2, out$b0, out$bp, out$bn, out$th)
  out$mx <- mx
  out$my <- 0
  out$sx <- sx
  out$mzz <- mzz
  class(out) <- "hierNet"
  return(out)
}


admm4.logistic <- function(x, xnum, y, lam.l1, lam.l2, diagonal, zz=NULL, rho=10, niter, aa=NULL, sym.eps=1e-3, trace=1, ...) {
  # Performs ADMM4 for logistic loss.
  # Note: xnum is the matrix x as a numeric.  Both are passed to avoid having to call as.numeric too
  # many times.
  p <- ncol(x)
  if (is.null(zz)) {
    if (trace > 0) cat("Computing zz...", fill=TRUE)
    zz <- as.numeric(compute.interactions.c(x, diagonal=diagonal))
  }
  else if (class(zz) == "matrix") zz <- as.numeric(zz)
  if (is.null(aa)) {
    aa <- list(u=matrix(0, p, p),
               th=matrix(0, p, p),
               bp=rep(0, p),
               bn=rep(0, p),
               tt=matrix(0, p, p),
               diagonal=diagonal)
  }
  if (is.null(aa$tt) || is.null(aa$u)) {
    aa$tt <- 0.5 * (aa$th + t(aa$th))
    aa$u <- matrix(0, p, p)
  }
  obj <- Objective.logistic(aa=aa, x=x, y=y, lam.l1=lam.l1, lam.l2=lam.l2, xnum=xnum, zz=zz, strong=TRUE, sym.eps=sym.eps)
  for (i in seq(niter)) {
    if (trace > 0) cat(i, " ")
    V <- aa$u - rho * aa$tt
    gg <- ggdescent.logistic(xnum, zz, y, lam.l1=lam.l1, lam.l2=lam.l2, diagonal=diagonal, rho, V, trace=trace-1, aa=aa, ...)
    aa$th <- gg$th
    aa$bp <- gg$bp
    aa$bn <- gg$bn
    aa$tt <- (aa$th + t(aa$th)) / 2 + (aa$u + t(aa$u)) / (2 * rho)
    aa$u <- aa$u + rho * (aa$th - aa$tt)
    obj <- c(obj, Objective.logistic(aa=aa, x=x, y=y, lam.l1=lam.l1, lam.l2=lam.l2, xnum=xnum, zz=zz, strong=TRUE, sym.eps=sym.eps))
    if (trace > 0) cat(obj[i+1], fill=TRUE)
  }
  if (max(abs(aa$th-t(aa$th))) > sym.eps)
    cat("Attention: th not symmetric within the desired sym.eps.  Run ADMM for more iterations. And try increasing rho.")
  aa$obj <- obj
  aa
}



ggdescent.logistic <- function(xnum, zz, y, lam.l1, lam.l2, diagonal, rho, V, stepsize, backtrack=0.2, maxiter=100,
                             tol=1e-5, aa=NULL, trace=1) {
  # See ADMM4 pdf and logistic.pdf for the problem this solves.
  # 
  # xnum, zz, y: data (note: zz is a length n*cp2 vector, not a matrix) xnum is x as a (n*p)-vector
  # lam.l1: l1-penalty parameter
  # lam.l2: l2-penalty parameter
  # rho: admm parameter
  # V: see ADMM4 pdf
  # stepsize: step size to start backtracking with
  # backtrack: factor by which step is reduced on each backtrack.
  # maxiter: number of generalized gradient steps to take.
  # tol: stop gg descent if change in objective is below tol.
  # aa: initial estimate of (b0, th, bp, bn)
  # trace: how verbose to be
  #
  #void ggdescent_logistic_R(double *x, int *n, int *p, double *zz, int * diagonal, double *y, 
  #			     double *lamL1, double *lamL2, double *rho, double *V, int *maxiter, 
  #			     double *curb0, double *curth, double *curbp, double *curbn,
  #			     double *t, int *stepwindow, double *backtrack, double *tol, int *trace,
  #			     double *b0, double *th, double *bp, double *bn) {
  n <- length(y)
  p <- length(xnum) / n
  stopifnot(p == round(p))
  if (diagonal) stopifnot(length(zz) == n * (choose(p,2)+p))
  else stopifnot(length(zz) == n * choose(p,2))
  stepwindow <- 10
  if (is.null(aa)) aa <- list(b0=0, th=matrix(0,p,p), bp=rep(0,p), bn=rep(0,p))
  out <- .C("ggdescent_logistic_R",
            xnum,
            as.integer(n),
            as.integer(p),
            zz,
            as.integer(diagonal),
            as.double(y), # convert from integer to double
            as.double(lam.l1),
            as.double(lam.l2),
            as.double(rho),
            as.double(V),
            as.integer(maxiter),
            as.double(aa$b0),
            as.double(aa$th),
            aa$bp,
            aa$bn,
            as.double(stepsize),
            as.integer(stepwindow),
            as.double(backtrack),
            as.double(tol),
            as.integer(trace),
            b0=as.double(0),
            th=rep(0, p*p),
            bp=rep(0, p),
            bn=rep(0, p),
            DUP=FALSE, PACKAGE="hierNet")
  list(b0=out$b0, bp=out$bp, bn=out$bn, th=matrix(out$th, p, p))
}


ADMM4.Lagrangian <- function(aa, xnum, zz, y, lam.l1, lam.l2, diagonal, rho) {
  # aa: list with (th, bp, bn, tt, u)
  # zz is a vector not a matrix
  if (aa$diagonal == FALSE)
    if (any(abs(diag(aa$th)) > 1e-8)) {
      cat("Diagonal of Theta is nonzero.", fill=TRUE)
      return(Inf)
    }
  if (max(aa$tt-t(aa$tt)) > 1e-8) {
    cat("Theta is not symmetric.", fill=TRUE)
    return(Inf)
  }
  if (any(rowSums(abs(aa$th)) > aa$bp + aa$bn + 1e-5)) {
    cat("hierarchy violated.", fill=TRUE)
    return(Inf)
  }
  if (any(aa$bp < -1e-5)||any(aa$bn < -1e-5)) {
    cat("Non-negative of bp or bn violated.", fill=TRUE)
    return(Inf)
  }
  if (diagonal == FALSE)
    if (any(abs(diag(aa$th)) > 1e-5)) {
      cat("Zero diagonal violated.", fill=TRUE)
      return(Inf)
    }

  V <- aa$u - rho * aa$tt

  r <- y - Compute.yhat.c(xnum, zz, aa)
  admm <- sum(aa$u*(aa$th-aa$tt)) + (rho/2) * sum((aa$th-aa$tt)^2)
  #admm <- sum(V*aa$th) + (rho/2) * sum(aa$th^2) + (rho/2)*sum(aa$tt^2) - sum(aa$u*aa$tt)
  pen <- lam.l1 * (sum(aa$bp + aa$bn) + sum(abs(aa$th))/2)
  pen <- pen + lam.l2 * (sum(aa$bp^2) + sum(aa$bn^2) + sum(aa$th^2))
  sum(r^2)/2 + pen + admm
}


predict.hierNet.logistic <- function(object, newx, newzz=NULL, ...) {
  predict.hierNet(object, newx, newzz, ...)
}

critf.logistic <- function(x, y, lam.l1, lam.l2, b0, bp, bn, th) {
  yhat <- b0 + x %*% (bp - bn) + 0.5 * diag(x %*% th %*% t(x))
  p <- 1 / (1 + exp(-yhat))
  val <- -sum(y * log(p) + (1 - y) * log(1 - p)) 
  val <- val + lam.l1 * sum(bp + bn) + lam.l1 * sum(abs(th))/2 + lam.l1 * sum(abs(diag(th)))/2
  val <- val + lam.l2 * (sum(bp^2) + sum(bn^2) + sum(th^2))
  return(val)
}

twonorm <- function(x) {sqrt(sum(x * x))}

hierNet.logistic.path <- function (x, y, lamlist=NULL, delta=1e-8, minlam=NULL, maxlam=NULL, flmin=.01, nlam=20, 
                                   diagonal=TRUE, strong=FALSE, aa=NULL, 
                                   zz=NULL, stand.main=TRUE, stand.int=FALSE,
                                   rho=nrow(x), niter=100, sym.eps=1e-3,# ADMM params
                                   step=1, maxiter=2000, backtrack=0.2, tol=1e-5, # GG params
                                   trace=0) {
  this.call=match.call()
  stopifnot(y %in% c(0, 1))
  x <- scale(x, center=TRUE, scale=stand.main)
  mx <- attr(x, "scaled:center")
  sx <- attr(x, "scaled:scale")
  
  if (is.null(maxlam)) {
    if (!is.null(minlam)) stop("Cannot have maxlam=NULL if minlam is non-null.")
    maxlam <- max(abs(t(x) %*% y))
    minlam <- maxlam * flmin
  }
  if (is.null(minlam)) minlam <- maxlam * flmin
  if (is.null(lamlist))
    lamlist <- exp(seq(log(maxlam), log(minlam), length=nlam))
  nlam <- length(lamlist)
  
  if (is.null(zz)) 
    zz <- compute.interactions.c(x, diagonal=diagonal)
  else stopifnot(is.matrix(zz))
  zz <- scale(zz, center=TRUE, scale=stand.int)
  mzz <- attr(zz, "scaled:center")
  szz <- attr(zz, "scaled:scale")
  zz <- as.numeric(zz)
  p <- ncol(x)
  cp2 <- choose(p, 2)
  b0 <- rep(NA, nlam)
  bp <- bn <- matrix(NA, nrow=p, ncol=nlam)
  th <- array(NA, c(p, p, nlam))
  obj <- rep(NA, nlam)
  aa <- NULL
  for (i in seq(nlam)) {
    cat(c("i,lam=", i, round(lamlist[i],2)), fill=TRUE)
    aa <- hierNet.logistic(x, y, lam=lamlist[i], delta=delta, diagonal=diagonal, strong=strong, 
                           aa=aa, zz=zz, stand.main=FALSE, stand.int=FALSE,
                           rho=rho, niter=niter, sym.eps=sym.eps,
                           step=step, maxiter=maxiter, backtrack=backtrack, tol=tol, 
                           trace=trace)
    b0[i] <- aa$b0
    bp[, i] <- aa$bp
    bn[, i] <- aa$bn
    th[, , i] <- aa$th
    obj[i] <- aa$obj
  }
  dimnames(bp) <- dimnames(bn) <- list(as.character(1:p), NULL)
  dimnames(th) <- list(as.character(1:p), as.character(1:p), NULL)
  out <- list(b0=b0, bp=bp, bn=bn, th=th, obj=obj, lamlist=lamlist, delta=delta,
              mx=mx, my=0, sx=sx, mzz=mzz, szz=szz,
              type="logistic", diagonal=diagonal, strong=strong,
              step=step, maxiter=maxiter, backtrack=backtrack, tol=tol,
              call=this.call)   
  if (strong) {
    # ADMM parameters:
    out$rho <- aa$rho
    out$niter <- niter
    out$sym.eps <- sym.eps
  }
  class(out) <- "hierNet.path"
  out
}

balanced.folds <- function(y, nfolds=min(min(table(y)), 10)) {
  totals <- table(y)
  fmax <- max(totals)
  nfolds <- min(nfolds, fmax)
  # makes no sense to have more folds than the max class size
  folds <- as.list(seq(nfolds))
  yids <- split(seq(y), y)
  # nice we to get the ids in a list, split by class
  ###Make a big matrix, with enough rows to get in all the folds per class
  bigmat <- matrix(NA, ceiling(fmax/nfolds) * nfolds, length(totals))
  for(i in seq(totals)) {
    bigmat[seq(totals[i]), i] <- sample(yids[[i]])
  }
  smallmat <- matrix(bigmat, nrow = nfolds)       # reshape the matrix
  ### Now do a clever sort to mix up the NAs
  smallmat <- permute.rows(t(smallmat))   ### Now a clever unlisting
  # the "clever" unlist doesn't work when there are no N
  #       apply(smallmat, 2, function(x)
  #        x[!is.na(x)])
  res <-vector("list", nfolds)
  for(j in 1:nfolds) {
    jj <- !is.na(smallmat[, j])
    res[[j]] <- smallmat[jj, j]
  }
  return(res)
}

permute.rows <-function(x) {
  dd <- dim(x)
  n <- dd[1]
  p <- dd[2]
  mm <- runif(length(x)) + rep(seq(n) * 10, rep(p, n))
  matrix(t(x)[order(mm)], n, p, byrow = TRUE)
}

hierNet.cv <- function(fit, x, y, nfolds=10, folds=NULL, trace=0) {
  this.call <- match.call()
  stopifnot(class(fit) == "hierNet.path")
  if(fit$type=="gaussian"){errfun=function(y,yhat){(y-yhat)^2}} 
  if(fit$type=="logistic"){errfun=function(y,yhat){1*(y!=yhat)}} 
  n <- length(y)
  if(is.null(folds)) {
    folds <- split(sample(1:n), rep(1:nfolds, length = n))
  }
  else {
    stopifnot(class(folds)=="list")
    nfolds <- length(folds)
  } 
  lamlist=fit$lamlist

  # get whether fit was standardized based on fit$sx and fit$szz...
  if (is.null(fit$mx)) stop("hierNet object was not centered.  hierNet.cv has not been written for this (unusual) case.")
  stand.main <- !is.null(fit$sx)
  stand.int <- !is.null(fit$szz)
  
  n.lamlist <- length(lamlist)        ### Set up the data structures
  size <- double(n.lamlist)
  err2=matrix(NA,nrow=nfolds,ncol=length(lamlist))
  for(ii in 1:nfolds) {
    cat("Fold", ii, ":")
    if(fit$type=="gaussian"){
      a <- hierNet.path(x[-folds[[ii]],],y=y[-folds[[ii]]], 
                        lamlist=lamlist, delta=fit$delta, diagonal=fit$diagonal, strong=fit$strong, trace=trace,
                        stand.main=stand.main, stand.int=stand.int,
                        rho=fit$rho, niter=fit$niter, sym.eps=fit$sym.eps, # ADMM parameters (which will be NULL if strong=F)
                        step=fit$step, maxiter=fit$maxiter, backtrack=fit$backtrack, tol=fit$tol) # GG descent params
      
      yhatt=predict.hierNet(a,newx=x[folds[[ii]],])
    }
    if(fit$type=="logistic"){
      a <- hierNet.logistic.path(x[-folds[[ii]],],y=y[-folds[[ii]]], 
                                 lamlist=lamlist, delta=fit$delta, diagonal=fit$diagonal, strong=fit$strong,
                                 trace=trace, stand.main=stand.main, stand.int=stand.int,
                                 rho=fit$rho, niter=fit$niter, sym.eps=fit$sym.eps, # ADMM parameters (which will be NULL if strong=F)
                                 step=fit$step, maxiter=fit$maxiter, backtrack=fit$backtrack, tol=fit$tol) # GG descent params                                 
      yhatt=predict.hierNet.logistic(a,newx=x[folds[[ii]],])$yhat
    }
    
    temp=matrix(y[folds[[ii]]],nrow=length(folds[[ii]]),ncol=n.lamlist)
    err2[ii,]=colMeans(errfun(yhatt,temp))
    cat("\n")
  }
  errm=colMeans(err2)
  errse=sqrt(apply(err2,2,var)/nfolds)
  o=which.min(errm)
  lamhat=lamlist[o]
  oo=errm<= errm[o]+errse[o]
  lamhat.1se=lamlist[oo & lamlist>=lamhat][1]
  
  nonzero=colSums(fit$bp-fit$bn!=0) + apply(fit$th!=0, 3, function(a) sum(diag(a)) + sum((a+t(a)!=0)[upper.tri(a)]))
  obj <- list(lamlist=lamlist, cv.err=errm,cv.se=errse,lamhat=lamhat, lamhat.1se=lamhat.1se,
             nonzero=nonzero, folds=folds,
             call = this.call)
  class(obj) <- "hierNet.cv"
  obj
}

plot.hierNet.cv <- function(x, ...) {
  par(mar = c(5, 5, 5, 1))
  yrang=range(c(x$cv.err-x$cv.se,x$cv.err+x$cv.se))
  plot(log(x$lamlist), x$cv.err, xlab="log(lambda)",
       ylab = "Cross-validation Error", type="n",ylim=yrang)
  axis(3, at = log(x$lamlist), labels = paste(x$nonzero), srt = 90, adj = 0)
  mtext("Number of features", 3, 4, cex = 1.2)
  axis(2, at = c(0, 0.2, 0.4, 0.6, 0.8))
  error.bars(log(x$lamlist), x$cv.err - x$cv.se, x$cv.err + x$cv.se, width = 0.01, col = "darkgrey")
  points(log(x$lamlist), x$cv.err, col=2, pch=19)
  abline(v=log(x$lamhat), lty=3)
  abline(v=log(x$lamhat.1se), lty=3)
  invisible()
}

error.bars <-function(x, upper, lower, width = 0.02, ...) {
  xlim <- range(x)
  barw <- diff(xlim) * width
  segments(x, upper, x, lower, ...)
  segments(x - barw, upper, x + barw, upper, ...)
  segments(x - barw, lower, x + barw, lower, ...)
  range(upper, lower)
}

hierNet.varimp <- function(fit,x,y, ...) {
  # NOTE: uses 0.5 cutoff for logistic case
  lam=fit$lam
  if(fit$type=="gaussian"){errfun=function(y,yhat){(y-yhat)^2}}
  if(fit$type=="logistic"){
    errfun=function(y,yhat){
      term1=y*log(yhat);term1[yhat==0]=0
      term2=(1-y)*log(1-yhat);term2[yhat==1]=0
      val=-sum(term1+term2)
      return(val)
    }}
  yhat=predict(fit,x)
  rss=sum(errfun(y,yhat))
  varsum=fit$bp-fit$bn+rowSums(abs(fit$th))
  oo=which(abs(varsum)>1e-6)
  imp=rss2=rep(NA,ncol(x))
  for(j in oo){
    cat(j)
    fit0=fit;fit0$bp=fit$bp[-j];fit0$bn=fit$bn[-j];fit0$th=fit$th[-j,-j]
    if(fit$type=="gaussian"){ fit2=hierNet(x[,-j],y,lam,delta=fit$delta,diagonal=fit$diagonal,aa=fit0)}
    if(fit$type=="logistic"){ fit2=hierNet.logistic(x[,-j],y,lam,delta=fit$delta,diagonal=fit$diagonal,aa=fit0)}
    yhat2=predict(fit2,x[,-j])
    rss2[j]=sum(errfun(y,yhat2))
    imp[j]=(rss2[j]-rss)/rss2[j]
  }
  imp[-oo]=0
  res=cbind(1:ncol(x),round(imp,3))
  ooo=order(-imp)
  dimnames(res)=list(NULL,c("Predictor","Importance"))
  cat("",fill=T)
  return(res[ooo,])
}
pejovic/hierNet documentation built on May 25, 2019, 12:45 a.m.