BNRPxMCMC: Bayesian l1 Trend Filtering via Proximal MCMC

Documented in pbtf

#' Proximal Bayesian Trend Filtering
#' @param y normal observations (vector)
#' @param x predictor (vector); if null, will use 1:m by default
#' @param restrictions a collection of shape restrictions,
#' including 'increasing','decreasing','convex','concave'
#' @param lambda lambda parameter of Moreau envelope
#' @param k order of the desired piecewise polynomial; 1 linear, 2 quadratic, 3 cubic; k=0 or k>=4 not recommended
#' @param delta percentage of maximum step size to use, default is 1
#' @param N.burn number of MCMC iterations burned
#' @param N.sim number of MCMC samples returned
#' @param t number of theta sampling steps to carry out before sigma2, default is 5
#' @param MH_correct whether to use MH correction or not
#' @param alpha regularization parameter; if NULL, it is determined by cross validation heuristic
#' @param alpha_grid a grid of alpha candidates to select from; we recommend including 0
#' @param cv_rule 'min' or '1se'
#' @param K conduct K-fold-cross validation to find the optimal alpha
#' @param s the shape parameter for inverse gamma prior of sigma2
#' @param r the rate parameter for inverse gamma prior of sigma2
#' @examples
#' x=seq(0,4*pi,len=100)
#' y=x+sin(x)+rnorm(100)
#' out_pbtf <- pbtf(y,x,'increasing',k=2,alpha_grid=seq(0,10,len=1001))
#' plot(out_pbtf)
#' @export
pbtf <- function(y,x=NULL,restrictions=NULL,lambda=NULL,k=2,delta=1,N.burn=5e3,N.sim=1e4,t=5,
                   MH_correct=F,alpha=NULL,alpha_grid=NULL,cv_rule='0.2se',K=10,s=1e-3,r=1e-3){
  start_time <- Sys.time()
  if(is.null(x)){x <- 1:length(y)}
  m <- length(x)
  ord <- order(x)
  x_ordered <- x[ord]
  y_ordered <- y[ord]
  x_unique <- unique(x_ordered)
  n <- length(x_unique)
  y_mean <- rep(0,n)
  w <- rep(0,n)
  for(i in 1:length(x_unique)){
    w[i] <- length(x_ordered[x_ordered==x_unique[i]])
    y_mean[i] <- mean(y_ordered[x_ordered==x_unique[i]])
  }
  wimax <- max(w)
  D <- getD(k,n,x_unique)
  N <- N.burn + N.sim
  gurobimodel_prox <- gurobimodelinit(x_unique,rep(1,n),restrictions,k)

  #use cross-validation to find alpha
  if(is.null(alpha)|is.null(lambda)){
    if(!is.null(alpha)){
      stop('If alpha is given, lambda must be specified too!')
    }else{
      if(any(alpha_grid<0)){
        stop('Can\'t pass negative alpha values!')
      }
      if(is.null(alpha_grid)){
        stop('Either alpha or alpha_grid must be specified!')
      }
      foldid <- c(0,rep(seq(1,K,1),n-2)[seq(1,n-2,1)],0)
      cvall <- matrix(0,K,length(alpha_grid))
      for(i in 1:K){
        #the following cross validation code is modified from cv.trendfilter function in genlasso package
        cat(sprintf("Fold %i ... ",i),ifelse(i==K,'\n',''),sep='')
        otr <- which(foldid!=i)
        ntr <- length(otr)
        xtr <- x_unique[otr]
        ytr <- y_mean[otr]
        wtr <- w[otr]
        beta <- matrix(0,nrow=ntr,ncol=length(alpha_grid))
        gurobimodelcv <- gurobimodelinit(xtr,wtr,restrictions,k)
        for(j in 1:length(alpha_grid)){
          gurobimodelcv$obj <- c(-wtr*ytr,alpha_grid[j]*rep(1,ntr-k-1))
          result <- gurobi::gurobi(gurobimodelcv,params=list(OutputFlag=0))
          while(result$status!='OPTIMAL'){
            gurobimodelcv$obj <- c(-wtr*ytr,(alpha_grid[j]+rnorm(1,0,sd=1e-3))*rep(1,ntr-k-1))
            result <- gurobi::gurobi(gurobimodelcv,params=list(OutputFlag=0))
          }
          beta[,j] <- result$x[1:ntr]
        }
        ote <- which(foldid==i)
        xte <- x_unique[ote]
        wte <- w[ote]
        yte <- matrix(y_mean[ote],length(ote),length(alpha_grid))
        ilo <- which((seq(1,n,1)%in%(ote-1))[otr])
        ihi <- which((seq(1,n,1)%in%(ote+1))[otr])
        a <- (xte - xtr[ilo])/(xtr[ihi]-xtr[ilo])
        pred <- beta[ilo,]*(1-a) + beta[ihi,]*a
        cvall[i,] <- colMeans((yte-pred)^2*wte)
      }
      cverr <- colMeans(cvall)
      cvse <- apply(cvall,2,sd)/sqrt(K)
      i0 = which.min(cverr)
      alpha.min = alpha_grid[i0]
      alpha0.2se = max(alpha_grid[cverr<=cverr[i0]+0.2*cvse[i0]])
      alphastar <- ifelse(cv_rule=='min',alpha.min,alpha0.2se)
      gurobimodel_alphastar <- gurobimodelinit(x_unique,w,restrictions,k)
      gurobimodel_alphastar$obj <- c(-w*y_mean,alphastar*rep(1,n-k-1))
      result <- gurobi::gurobi(gurobimodel_alphastar,params=list(OutputFlag=0))
      while(result$status!='OPTIMAL'){
        gurobimodel_alphastar$obj <- c(-w*y_mean,(alphastar+rnorm(1,0,sd=1e-3))*rep(1,n-k-1))
        result <- gurobi::gurobi(gurobimodel_alphastar,params=list(OutputFlag=0))
      }
      fitted <- result$x[1:n]
      sigma2hat <- mean((y_ordered-rep(fitted,w))^2)
      alpha <- alphastar/sigma2hat
      cat("Optimal alpha: ",alpha,'\n',sep='')
      lambda <- sigma2hat*1e-2/wimax
      cat('lambda: ',lambda,'\n',sep='')
    }
  }

  #prepare the arrays for sample storage
  theta_samples <- matrix(0,nrow=N,ncol=n)
  sigma2_samples <- rep(0,N)

  #set initial values of theta,sigma2 and alpha
  theta <- theta_samples[1,] <- y_mean
  sigma2 <- sigma2_samples[1] <- ifelse(exists("sigma2hat"),sigma2hat,1)

  #prepare the gradient and proximal operator
  gradf <- function(theta,sigma2){w*(theta-y_mean)/sigma2}
  prox <- function(theta){
    gurobimodel_prox$obj <- c(-theta,lambda*alpha*rep(1,n-k-1))
    result <- gurobi::gurobi(gurobimodel_prox,params=list(OutputFlag=0))
    while(result$status!='OPTIMAL'){
      gurobimodel_prox$obj <- c(-theta,(lambda*alpha+rnorm(1,0,sd=1e-3))*rep(1,n-k-1))
      result <- gurobi::gurobi(gurobimodel_prox,params=list(OutputFlag=0))
    }
    return(result$x[1:n])
  }
  logpi <- function(theta,prox_proj_theta,sigma2){
    -norm(y_ordered-rep(theta,w),'2')^2/(2*sigma2)-alpha*Matrix::norm(D%*%prox_proj_theta,'1')-norm(theta-prox_proj_theta,'2')^2/(2*lambda)
  }

  pb <- progress::progress_bar$new(
    format = "[:bar] :percent eta: :eta",
    total = N-1, clear = T, width= 100)
  accept_number <- 0
  for(i in 1:(N-1)){
    if(!MH_correct){
      thetanew <- MYULA(theta,sigma2,wimax,lambda,gradf,prox,delta,t)
    }else{
      result <- MYULA(theta,sigma2,wimax,lambda,gradf,prox,delta,1,T,logpi)
      thetanew <- result$thetanew
      if(result$accept){
        accept_number <- accept_number+1
      }
    }
    theta <- theta_samples[i+1,] <- thetanew
    sigma2 <- sigma2_samples[i+1] <- invgamma::rinvgamma(1, m/2+s, norm(y_ordered-rep(theta,w),'2')^2/2+r)
    pb$tick()
    if(i==(N.burn-1)){
      middle_time <- Sys.time()
    }
  }
  if(MH_correct){
    cat(sprintf("%s%% of theta MCMC iterates accepted (%s total iterates)",100*accept_number/N,N))
  }
  end_time <- Sys.time()
  scpu <- end_time - middle_time
  tcpu <- end_time - start_time
  out <- list()
  out$x <- x
  out$y <- y
  out$x_unique <- x_unique
  out$lambda <- lambda
  out$alpha <- alpha
  out$restrictions <- restrictions
  out$k <- k
  out$scpu <- as.numeric(scpu,units='secs')
  out$tcpu <- as.numeric(tcpu,units='secs')
  out$theta_samples <- theta_samples[(N.burn+1):N,]
  out$sigma2_samples <- sigma2_samples[(N.burn+1):N]
  out$cverr <- cverr
  attr(out,"class") <- 'pbtf'
  return(out)
}

#' @export
predict.pbtf <- function(obj){apply(obj$theta_samples,2,median)}

#' @export
plot.pbtf <- function(obj,pt.pch=1,pt.cex=1,pt.col='gray70',band.col='lightblue',
                      trend.col='blue',trend.lwd=2,main='',xlab='',ylab='',...){
  x <- obj$x
  y <- obj$y
  x_unique <- obj$x_unique
  fitted <- apply(obj$theta_samples,2,median)
  lcl <- apply(obj$theta_samples,2,quantile,probs=0.025)
  ucl <- apply(obj$theta_samples,2,quantile,probs=0.975)
  plot(x,y,col=pt.col,cex=pt.cex,main=main,xlab=xlab,ylab=ylab,type='n',...)
  polygon(c(x_unique,rev(x_unique)),c(lcl,rev(ucl)),col=band.col,border=NA)
  points(x,y,col=pt.col,cex=pt.cex,pch=pt.pch)
  lines(x_unique,fitted,col=trend.col,lwd=trend.lwd)
}