WAPL: What the Package Does (One Line, Title Case)

Documented in predict.linearcl predict.qlearn predict.rbfcl wsvm

#' Qlearning_gglasso
#' This is the main function of Q-learning or penalized Q-learning
#' @param H: n by p matrix of features
#' @param A: treatment vector of length n, A_i = +1 or -1
#' @param R: outcome vector of length n
#' @param pA: propensity score, vector of length n
#' @param pentype: whether penalized Q-learning is used or not, default is "lasso"
#' @param group: group number, vector of length (2p+1), have to be consective, in each individual is one group then set group=seq(1: (2p+1))
#' @param loss: default is "ls" for least square loss
#' @import gglasso
#' @return  subject of class "qlearn"
#' @export
#' @examples
#' Qlearning_gglasso(H,A,R, pA)

Qlearning_gglasso <- function (H, A, R, pA, pentype = "lasso", group, loss="ls", m = 10) {
  gcinfo(F)
  n = length(A)
  X = cbind(H, A, diag(A) %*% H)
  colnames(X) <- c(paste("H",1:dim(H)[2], sep = ""),"A",paste("AH",1:dim(H)[2], sep = ""))

  if (pentype == "lasso") {
    cvfit <- cv.GLglmnetN(X, R, Yorig = R, A = A, pA = pA, nfolds = m,  group=group, loss = loss)
    co <- coef(cvfit, s=cvfit$lambda.maxVd)
  } else {
    co = coef(lm(R ~ X))
  }

  XX1 = cbind(rep(1, n), H, rep(1, n), diag(n) %*% H)
  XX2 = cbind(rep(1, n), H, rep(-1, n), -diag(n) %*% H)
  Q1 = XX1 %*% co
  Q2 = XX2 %*% co
  Q = apply(cbind(Q1, Q2), 1, max)
  Qsingle = list(co = co, Q = Q)
  class(Qsingle) = "qlearn"
  Qsingle
}


#' Cross validation for L1-Penalized Q-Leanring
#' Given group information, the variable selection can be L1-lasso or group-lasso
#' @param x: n by p matrix of features
#' @param y: weights vector of length n
#' @param Yorig: original outcome vector of length n
#' @param A: treatment vector of length n
#' @param pA: propensity score, vector of length n
#' @param pentype: whether penalized Q-learning is used or not, default is "lasso"
#' @param group: group number, vector of length (2p+1), have to be consective, in each individual is one group then set group=seq(1: (2p+1))
#' @param loss: default is "ls" for least square loss
#' @param pA: propensity score, vector of length n
#' @param nfolds: number of cross validation fold, should be an integer >3
#' @import gglasso
#' @return  subject of class "qlearn"
cv.GLglmnetN <- function (x, y, Yorig = NULL, A, group=c(rep(1:10, each=5), 11, rep(12:21, each=5)),
                          pA, nfolds = 10, loss="ls", ...) {
  gcinfo(F)
  if(is.null(Yorig)) Yorig <- y

  N = nrow(x)
  y = drop(y)

  GLglmnet.call = match.call(expand.dots = TRUE)
  GLglmnet.object = gglasso(x , y , group=group, loss=loss, ...)
  GLglmnet.call[[1]] = as.name("gglasso")

  foldid = sample(rep(seq(nfolds), length = N))
  if (nfolds < 3)
    stop("nfolds must be bigger than 3; nfolds=10 recommended")
  outlist = as.list(seq(nfolds))
  for (i in seq(nfolds)) {
    which = foldid == i
    if (is.matrix(y)) {
      y_sub = y[!which, ]} else {
        y_sub = y[!which]
      }
    outlist[[i]] = gglasso(x[!which, , drop = FALSE], y_sub , group=group, loss=loss)
  }

  lambda = GLglmnet.object$lambda
  Vd_cvstuff = do.call(vd_fun, list(outlist, lambda, x, Yorig, A, pA, foldid,group, loss))

  Vdraw <- Vd_cvstuff$Vdraw
  Vdm <- Vd_cvstuff$Vdm
  Vdsd <- Vd_cvstuff$Vdsd
  lambda.maxVd <- Vd_cvstuff$lambda.maxVd

  rm(x, y, outlist)
  out = list(lambda = lambda,  Vdraw = Vdraw, Vdm = Vdm, Vdsd = Vdsd,
             lambda.maxVd = lambda.maxVd, gglasso.fit = GLglmnet.object)

  class(out) = "cv.gglasso"
  out
}


#' ITR value estimated given a list of fitted model
#' Calculate the value function given each lambda
#' @param outlist: A list of fitted model, corresponding to lambda
#' @param lambda: lambda vector chosen automatically when whole data if fitted
#' @param x: feature matrix
#' @param y: outcome/residual
#' @param A: treatment observed
#' @param foldid: length n vector of cv group index, for Cross validation purpose purpose
#' @param group: group number, vector of length (2p+1), have to be consective, in each individual is one group then set group=seq(1: (2p+1))
#' @param loss: default is "ls" for least square loss
#' @import gglasso

vd_fun <- function (outlist, lambda, x, y, A,  foldid, group, loss) {
  y=drop(y)
  gcinfo(F)
  mlami = max(sapply(outlist, function(obj) min(obj$lambda)))
  which_lam = lambda >= mlami
  lckymat = matrix(NA, length(y), length(lambda))
  nfolds = max(foldid)
  nlams = double(nfolds)

  for (i in seq(nfolds)) {
    which = foldid == i
    fitobj = outlist[[i]]
    cont_cols <- which(substr(colnames(x),1,1) == "A")
    cont_coef <- coef(fitobj, s = lambda[which_lam])[(1+cont_cols),]
    #each column gives predicted contrast for one lambda val
    trt_rec <- A[which]
    # multiply by trt_rec to obtain sign of original predictors/2
    contr <- cbind(1, x[which, cont_cols[-1], drop = FALSE]*trt_rec)%*%cont_coef
    # each column gives predicted optimal treatment for one lambda val
    opt_trt <- sign(contr)
    # indicators of whether subject received estimated optimal treatment
    lcky <- as.matrix((trt_rec - opt_trt) == 0)*1
    nlami = sum(which_lam)
    # estimator of 1_{A=d}/p(A|X) for each lambda val
    lckymat[which, seq(nlami)] <- lcky
    nlams[i] = nlami
  }

  Vdraw <- NULL
  N <- length(y) - apply(is.na(lckymat), 2, sum)
  Vdm <- apply(lckymat*y/pA, 2, sum) / apply(lckymat/pA, 2, sum)
  Vdsd <- "not_computed"

  # implement Qian's multi-stage approach for selecting lambda
  maxVdm <- max(Vdm[!is.na(Vdm)])
  lambda.maxVdID <- which(Vdm == maxVdm)

  if(length(lambda.maxVdID) == 0) {
    Vdm <- rep(-10^30, length(lambda))
    Vdsd <- rep(0, length(lambda))
    Vdraw <- NA
    lambda.maxVd <- NA
  } else {
    if(length(lambda.maxVdID) == 1) {
      lambda.maxVd <- lambda[lambda.maxVdID]
    } else {
      red_lambda_set <- rev(sort(lambda[lambda.maxVdID]))
      red_lambda_fits <- gglasso(x = x, y = y, group=group, loss=loss, ...)
      lmbdaidx= sapply(red_lambda_set, function(j) which(red_lambda_fits$lambda==j))
      nzeros <- apply(red_lambda_fits$beta[,lmbdaidx] == 0, 2, sum)
      max_nzeros <- max(nzeros)
      lambda.maxnzerosID <- which(nzeros == max_nzeros)
      if(length(lambda.maxnzerosID) == 1) {
        lambda.maxVd <- red_lambda_set[lambda.maxnzerosID]
        #co=red_lambda_fits$beta[, which( red_lambda_set = lambda.maxVd)]
      } else {
        red_lambda_set2 <- red_lambda_set[lambda.maxnzerosID]
        cv.red_lambda_fits2 <- cv.gglasso(x = x, y = y, lambda = red_lambda_set2,
                                          pred.loss="L1", group=group,...)
        lambda.maxVd <-cv.red_lambda_fits2$lambda.min
        #co=coef(cv.red_lambda_fits2 , s=cv.red_lambda_fits2$lambda.min )
      }
    }
  }
  rm( fitobj, outlist, lambda)
  out = list(Vdm = Vdm, Vdsd = Vdsd, Vdraw = Vdraw, lambda.maxVd = lambda.maxVd)
  out
}


#' predict function for class "qlearn"
#' This is the prediction function for the model fitted using Qlearning_gglasso
#' @param qlrn: model returned from Qlearning_gglasso
#' @param X: feature matrix of the new subject
predict.qlearn <- function(qlrn, X) {
  gcinfo(F)
  nn <- dim(X)[1]
  XX1 = cbind(rep(1, nn), X, rep(1, nn), diag(nn) %*% X)
  XX2 = cbind(rep(1, nn), X, rep(-1, nn), -diag(nn) %*% X)
  Q1 = XX1 %*% qlrn$co
  Q2 = XX2 %*% qlrn$co
  pred <- as.numeric(Q1 - Q2)
  #w_hat <- abs(pred)
  z_hat <- I(pred > 0)*1
  opt_trt <- 2*z_hat - 1
  rm(XX1, XX2, Q1, Q2,z_hat)
  out <- list(pred = pred, opt_trt = opt_trt, coef.est = as.numeric(qlrn$co))
  return(out)
}

#' ROWL, a function for OWL and RWL.
#' This is used for benchmark models
#' @param H: n by P feature matrix
#' @param A: treatment, takes value -1 and +1 with size n
#' @param R2: outcome or residual vector with length n
#' @param pi: propensity score with length n
#' @param pentype: penalty in the residual estimation process, useful or RWL only
#' @param kernel: kernel used in SVM
#' @param residual: True or False, if True then RWL is deployed, if FALSE then OWL is deployed. Default is True.
#' @param sigma: hyper-parameter in SVM rbf kernel
#' @param clinear: hyper-parameter in SVM rbf kernel
#' @param m: number of fold for cross validation in parameter tunning
#' @param e: least tolerated error
#' @import e1071
#' @import kernlab
#' @export
#' @examples
#' ROWL()

ROWL<-function (H, A, R2, pi = rep(1, n), pentype = "lasso", kernel = "linear", residual=TRUE,
                sigma = c(0.03, 0.05, 0.07), clinear = 2^(-2:2), m = 4, e = 1e-05)
{
  npar = length(clinear)
  n = length(A)
  p = dim(H)[2]
  if (residual==TRUE & max(R2) != min(R2)) {
    if (pentype == "lasso") {
      cvfit = cv.glmnet(H, R2, nfolds = m)
      co = as.matrix(predict(cvfit, s = "lambda.min", type = "coeff"))
    }
    else if (pentype == "LSE") {
      co = coef(lm(R2 ~ H))
    }
    else stop(gettextf("'pentype' is the penalization type for the regression step of Olearning, the default is 'lasso',\nit can also be 'LSE' without penalization"))
    r = R2 - cbind(rep(1, n), H) %*% co
  }
  else r = R2
  rand = sample(m, n, replace = TRUE)
  r = r/pi
  if (kernel == "linear") {
    V = matrix(0, m, npar)
    for (i in 1:m) {
      this = (rand != i)
      X = H[this, ]
      Y = A[this]
      R = r[this]
      Xt = H[!this, ]
      Yt = A[!this]
      Rt = r[!this]
      for (j in 1:npar) {
        model = wsvm(X, Y, R, C = clinear[j], e = e)
        YP = predict(model, Xt)
        V[i, j] = sum(Rt * (YP == Yt))/sum(YP == Yt)
      }
    }
    mimi = colMeans(V)
    best = which.max(mimi)
    cbest = clinear[best]
    model = wsvm(H, A, r, C = cbest, e = e)
  }
  if (kernel == "rbf") {
    nsig = length(sigma)
    V = array(0, c(npar, nsig, m))
    for (i in 1:m) {
      this = (rand != i)
      X = H[this, ]
      Y = A[this]
      R = r[this]
      Xt = H[!this, ]
      Yt = A[!this]
      Rt = r[!this]
      for (j in 1:npar) {
        for (s in 1:nsig) {
          model = wsvm(X, Y, R, "rbf", sigma = sigma[s],
                       C = clinear[j], e = e)
          YP = predict(model, Xt)
          V[j, s, i] = sum(Rt * (YP == Yt))/sum(YP ==
                                                  Yt)
        }
      }
    }
    mimi = apply(V, c(1, 2), mean)
    best = which(mimi == max(mimi), arr.ind = TRUE)
    bestC = clinear[best[1]]
    bestSig = sigma[best[2]]
    print(bestC)
    print(bestSig)
    model = wsvm(H, A, r, "rbf", bestSig, C = bestC, e = e)
  }
  model
}


#' wsvm to solve the subject-weighted SVM
#' @param X: n by p feature matrix
#' @param A: label
#' @param wR: instance weight
#' @param kernel: propensity score with length n, can be either "rbf" or "linear", default is "linear"
#' @param sigma: hyper-parameter in SVM rbf kernel
#' @param C: hyper-parameter in SVM rbf kernel
#' @param e: least tolerated error
#' @return object of rbfcl or linearcf classs
#' wsvm()
wsvm<-function(X, A, wR, kernel='linear',sigma=0.05,C=1,e=0.00001){
  wAR=A*wR
  if (kernel=='linear'){
    K=X%*%t(X)
  }
  else if (kernel=='rbf'){
    rbf=rbfdot(sigma=sigma)
    K=kernelMatrix(rbf,X)
  } else stop(gettextf("Kernel function should be 'linear' or 'rbf'"))

  H=K*(wAR%*%t(wAR))
  n=length(A)
  solution=ipop(-abs(wR),H,t(A*wR),0,numeric(n),rep(C,n),0,maxiter=100)
  alpha=primal(solution)
  alpha1=alpha*wR*A
  if (kernel=='linear'){
    w=t(X)%*%alpha1 #parameter for linear
    fitted=X%*%w
    rm=sign(wR)*A-fitted
  } else if (kernel=='rbf'){
    #there is no coefficient estimates for gaussian kernel
    #but there is fitted value, first we compute the fitted value without adjusting for bias
    fitted=K%*%alpha1
    rm=sign(wR)*A-fitted
  }
  Imid =(alpha < C-e) & (alpha > e)
  rmid=rm[Imid==1];
  if (sum(Imid)>0){
    bias=mean(rmid)
  } else {
    Iup=((alpha<e)&(A==-sign(wR)))|((alpha>C-e)&(A==sign(wR)))
    Ilow=((alpha<e)&(A==sign(wR)))|((alpha>C-e)&(A==-sign(wR)))
    rup=rm[Iup]
    rlow=rm[Ilow]
    bias=(min(rup)+max(rlow))/2}
  fit=bias+fitted
  if (kernel=='linear') {
    model=list(alpha1=alpha1,bias=bias,fit=fit,beta=w)
    class(model)<-'linearcl'
  } else if (kernel=='rbf') {
    model=list(alpha1=alpha1,bias=bias,fit=fit,sigma=sigma,X=X)
    class(model)<-'rbfcl'}
  return (model)
}

#' predict.linearcl: predict labels for a given new instance when model is trained using linear kernel
#' @param object: wsvm output under linear kernel
#' @param x: n by p feature matrix of new subjects
#' @return output label with length n
#' predict.linearcl()
predict.linearcl<-function(object,x,...){
  predict=sign(object$bias+x%*%object$beta)
}

#' predict.rbfcl: predict labels for a given new instance when model is trained using linear kernel
#' @param object: wsvm output underl rbf kernel
#' @param x: n by p feature matrix of new subjects
#' @return output label with length n
predict.rbfcl<-function(object,x,...){
  rbf=rbfdot(sigma=object$sigma)
  n=dim(object$X)[1]
  if (is.matrix(x)) xm=dim(x)[1]
  else if (is.vector(x)) xm=1
  else stop('x must be vector or matrix')
  if (xm==1){ K <- apply(object$X,1,rbf,y=x)
  }else{   K<- matrix(0, xm, n)
  for (i in 1:xm) {K[i,]=apply(object$X,1,rbf,y=x[i,]) }}

  predict=sign(object$bias+K%*%object$alpha1)
}