R/StatModels.R

#' Sigmoid Map Function
#'
#' This function implements the sigmoid map function from data X to labels y
#' used for logistic regression in the form required by
#' \code{\link{EmpiricalRiskMinimizationDP.CMS}}.
#'
#' @param X Matrix of data.
#' @param coeff Vector or matrix of coefficients or weights.
#' @return Matrix of values of the sigmoid function corresponding to each row of
#'   X.
#' @examples
#'   X <- matrix(c(1,2,3,4,5,6),nrow=2)
#'   coeff <- c(0.5,-1,2)
#'   mapXy.sigmoid(X,coeff)
#'
#' @keywords internal
#'
#' @export
mapXy.sigmoid <- function(X, coeff) e1071::sigmoid(X%*%coeff)

#' Sigmoid Map Function Gradient
#'
#' This function implements the gradient of the sigmoid map function with
#' respect to coeff used for logistic regression in the form required by
#' \code{\link{EmpiricalRiskMinimizationDP.CMS}}.
#'
#' @param X Matrix of data.
#' @param coeff Vector or matrix of coefficients or weights.
#' @return Matrix of values of the gradient of the sigmoid function with respect
#'   to each value of coeff.
#' @examples
#'   X <- matrix(c(1,2,3,4,5,6),nrow=2)
#'   coeff <- c(0.5,-1,2)
#'   mapXy.gr.sigmoid(X,coeff)
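#'   # With the 2 x 3 example X above, the result is a 3 x 2 matrix whose
#'   # column j is the gradient of the prediction for row j of X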
#'
#' @keywords internal
#'
#' @export
mapXy.gr.sigmoid <- function(X, coeff) t(as.numeric(e1071::dsigmoid(X%*%coeff))*X)

#' Linear Map Function
#'
#' This function implements the linear map function from data X to labels y used
#' for linear SVM and linear regression in the form required by
#' \code{\link{EmpiricalRiskMinimizationDP.CMS}} and
#' \code{\link{EmpiricalRiskMinimizationDP.KST}}.
#'
#' @param X Matrix of data.
#' @param coeff Vector or matrix of coefficients or weights.
#' @return Matrix of values of the linear map function corresponding to each row
#'   of X.
#' @examples
#'   X <- matrix(c(1,2,3,4,5,6),nrow=2)
#'   coeff <- c(0.5,-1,2)
#'   mapXy.linear(X,coeff)
#'
#' @keywords internal
#'
#' @export
mapXy.linear <- function(X, coeff) X%*%coeff

#' Linear Map Function Gradient
#'
#' This function implements the gradient of the linear map function with respect
#' to coeff used for linear SVM and linear regression in the form required by
#' \code{\link{EmpiricalRiskMinimizationDP.CMS}} and
#' \code{\link{EmpiricalRiskMinimizationDP.KST}}.
#'
#' @param X Matrix of data.
#' @param coeff Vector or matrix of coefficients or weights.
#' @return Matrix of values of the gradient of the linear map function with
#'   respect to each value of coeff.
#' @examples
#'   X <- matrix(c(1,2,3,4,5,6),nrow=2)
#'   coeff <- c(0.5,-1,2)
#'   mapXy.gr.linear(X,coeff)
#'
#' @keywords internal
#'
#' @export
mapXy.gr.linear <- function(X, coeff) t(X)

#' Cross Entropy Loss Function
#'
#' This function implements cross entropy loss used for logistic regression in
#' the form required by \code{\link{EmpiricalRiskMinimizationDP.CMS}}.
#'
#' @param y.hat Vector or matrix of estimated labels.
#' @param y Vector or matrix of true labels.
#' @return Vector or matrix of the cross entropy loss for each element of y.hat
#'   and y.
#' @examples
#'   y.hat <- c(0.1, 0.88, 0.02)
#'   y <- c(0, 1, 0)
#'   loss.cross.entropy(y.hat,y)
#'
#' @keywords internal
#'
#' @export
loss.cross.entropy <- function(y.hat,y) -(y*log(y.hat) + (1-y)*log(1-y.hat))

#' Cross Entropy Loss Function Gradient
#'
#' This function implements cross entropy loss gradient with respect to y.hat
#' used for logistic regression in the form required by
#' \code{\link{EmpiricalRiskMinimizationDP.CMS}}.
#'
#' @param y.hat Vector or matrix of estimated labels.
#' @param y Vector or matrix of true labels.
#' @return Vector or matrix of the cross entropy loss gradient for each element
#'   of y.hat and y.
#' @examples
#'   y.hat <- c(0.1, 0.88, 0.02)
#'   y <- c(0, 1, 0)
#'   loss.gr.cross.entropy(y.hat,y)
#'
#' @keywords internal
#'
#' @export
loss.gr.cross.entropy <- function(y.hat,y) -y/y.hat + (1-y)/(1-y.hat)

#' Squared Error Loss Function
#'
#' This function implements squared error loss used for linear regression in
#' the form required by \code{\link{EmpiricalRiskMinimizationDP.KST}}.
#'
#' @param y.hat Vector or matrix of estimated labels.
#' @param y Vector or matrix of true labels.
#' @return Vector or matrix of the squared error loss for each element of y.hat
#'   and y.
#' @examples
#'   y.hat <- c(0.1, 0.88, 0.02)
#'   y <- c(0, 1, 0)
#'   loss.squared.error(y.hat,y)
#'
#' @keywords internal
#'
#' @export
loss.squared.error <- function(y.hat,y) (y.hat-y)^2/2

#' Squared Error Loss Function Gradient
#'
#' This function implements the squared error loss gradient with respect to
#' y.hat used for linear regression in the form required by
#' \code{\link{EmpiricalRiskMinimizationDP.KST}}.
#'
#' @param y.hat Vector or matrix of estimated values.
#' @param y Vector or matrix of true values.
#' @return Vector or matrix of the squared error loss gradient for each element
#'   of y.hat and y.
#' @examples
#'   y.hat <- c(0.1, 0.88, 0.02)
#'   y <- c(-0.1, 1, .2)
#'   loss.gr.squared.error(y.hat,y)
#'
#' @keywords internal
#'
#' @export
loss.gr.squared.error <- function(y.hat,y) y.hat-y

#' Generator for Huber Loss Function
#'
#' This function generates and returns the Huber loss function used for
#' privacy-preserving SVM at the specified value of h in the form required by
#' \code{\link{EmpiricalRiskMinimizationDP.CMS}}.
#'
#' @param h Positive real number for the Huber loss parameter. Lower values more
#'   closely approximate hinge loss. Higher values produce smoother Huber loss
#'   functions.
#' @return Huber loss function with parameter h in the form required by
#'   \code{\link{EmpiricalRiskMinimizationDP.CMS}}.
#' @examples
#'   h <- 0.5
#'   huber <- generate.loss.huber(h)
#'   y.hat <- c(-.5, 1.2, -0.9)
#'   y <- c(-1, 1, -1)
#'   huber(y.hat,y)
#'   huber(y.hat, y, w=c(0.1, 0.5, 1)) # Weights observation-level loss
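#'
#'   # As h shrinks toward 0, the Huber loss approaches hinge loss:
#'   huber.small <- generate.loss.huber(0.01)
#'   rbind(huber.small(y.hat, y), pmax(0, 1 - y.hat*y))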
#'
#' @keywords internal
#'
#' @export
generate.loss.huber <- function(h){
  # Weight vector w included for WeightedERMDP.CMS
  function(y.hat, y, w = NULL){
    if (is.null(w)) w <- rep(1, length(y))
    if (length(w)!=length(y)) stop("Weight vector w must be same length as label vector y.")
    z <- y.hat*y
    mask1 <- z>(1+h)
    mask2 <- abs(1-z)<=h
    mask3 <- z<(1-h)
    z[mask1] <- 0
    z[mask2] <- w[mask2]*(1+h-z[mask2])^2/(4*h)
    z[mask3] <- w[mask3]*(1-z[mask3])
    z
  }
}

#' Generator for Huber Loss Function Gradient
#'
#' This function generates and returns the Huber loss function gradient used for
#' privacy-preserving SVM at the specified value of h in the form required by
#' \code{\link{EmpiricalRiskMinimizationDP.CMS}}.
#'
#' @param h Positive real number for the Huber loss parameter. Lower values more
#'   closely approximate hinge loss. Higher values produce smoother Huber loss
#'   functions.
#' @return Huber loss function gradient with parameter h in the form required by
#'   \code{\link{EmpiricalRiskMinimizationDP.CMS}}.
#' @examples
#'   h <- 1
#'   huber <- generate.loss.gr.huber(h)
#'   y.hat <- c(-.5, 1.2, -0.9)
#'   y <- c(-1, 1, -1)
#'   huber(y.hat,y)
#'   huber(y.hat, y, w=c(0.1, 0.5, 1)) # Weights observation-level loss gradient
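#'
#'   # The gradient should match a finite-difference quotient of the loss
#'   # (illustrative check):
#'   loss.huber <- generate.loss.huber(h)
#'   delta <- 1e-6
#'   (loss.huber(y.hat + delta, y) - loss.huber(y.hat, y))/delta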
#'
#' @keywords internal
#'
#' @export
generate.loss.gr.huber <- function(h){
  function(y.hat, y, w = NULL){
    # Weight vector w included for WeightedERMDP.CMS
    if (is.null(w)) w <- rep(1, length(y))
    if (length(w)!=length(y)) stop("Weight vector w must be same length as label vector y.")
    z <- y.hat*y
    mask1 <- z>(1+h)
    mask2 <- abs(1-z)<=h
    mask3 <- z<(1-h)
    z[mask1] <- 0
    z[mask2] <- -w[mask2]*y[mask2]*(1+h-z[mask2])/(2*h)
    z[mask3] <- -w[mask3]*y[mask3]
    z
  }
}

#' l2 Regularizer
#'
#' This function implements the l2 regularizer in the form required by
#' \code{\link{EmpiricalRiskMinimizationDP.CMS}} and
#' \code{\link{EmpiricalRiskMinimizationDP.KST}}.
#'
#' @param coeff Vector or matrix of coefficients or weights.
#' @return Regularizer value at the given coefficient vector.
#' @examples
#'   coeff <- c(0.5,-1,2)
#'   regularizer.l2(coeff)
#'
#' @keywords internal
#'
#' @export
regularizer.l2 <- function(coeff) coeff%*%coeff/2

#' l2 Regularizer Gradient
#'
#' This function implements the l2 regularizer gradient in the form required by
#' \code{\link{EmpiricalRiskMinimizationDP.CMS}} and
#' \code{\link{EmpiricalRiskMinimizationDP.KST}}.
#'
#' @param coeff Vector or matrix of coefficients or weights.
#' @return Regularizer gradient value at the given coefficient vector.
#' @examples
#'   coeff <- c(0.5,-1,2)
#'   regularizer.gr.l2(coeff)
#'
#' @keywords internal
#'
#' @export
regularizer.gr.l2 <- function(coeff) coeff

#' Privacy-preserving Hyperparameter Tuning for Binary Classification Models
#'
#' This function implements the privacy-preserving hyperparameter tuning
#' function for binary classification \insertCite{chaudhuri2011}{DPpack} using
#' the exponential mechanism. It accepts a list of models with various chosen
#' hyperparameters, a dataset X with corresponding labels y, upper and lower
#' bounds on the columns of X, and a boolean indicating whether to add bias in
#' the construction of each of the models. The data are split into m+1 equal
#' groups, where m is the number of models being compared. One group is set
#' aside as the validation group, and each of the remaining m groups is used
#' to train one of the m given models. The number of errors on the validation
#' set
#' is counted for each model and used as the utility values in the exponential
#' mechanism (\code{\link{ExponentialMechanism}}) to select a tuned model in a
#' privacy-preserving way.
#'
#' @param models Vector of binary classification model objects, each initialized
#'   with a different combination of hyperparameter values from the search space
#'   for tuning. Each model should be initialized with the same epsilon privacy
#'   parameter value eps. The tuned model satisfies eps-level differential
#'   privacy.
#' @param X Dataframe of data to be used in tuning the model. Note it is assumed
#'   the data rows and corresponding labels are randomly shuffled.
#' @param y Vector or matrix of true labels for each row of X.
#' @param upper.bounds Numeric vector giving upper bounds on the values in each
#'   column of X. Should be of length ncol(X). The values are assumed to be in
#'   the same order as the corresponding columns of X. Any value in the columns
#'   of X larger than the corresponding upper bound is clipped at the bound.
#' @param lower.bounds Numeric vector giving lower bounds on the values in each
#'   column of X. Should be of length ncol(X). The values are assumed to be in
#'   the same order as the corresponding columns of X. Any value in the columns
#'   of X smaller than the corresponding lower bound is clipped at the bound.
#' @param add.bias Boolean indicating whether to add a bias term to X. Defaults
#'   to FALSE.
#' @param weights Numeric vector of observation weights of the same length as
#'   \code{y}.
#' @param weights.upper.bound Numeric value representing the global or public
#'   upper bound on the weights.
#' @return Single model object selected from the input list models with tuned
#'   parameters.
#' @examples
#' # Build train dataset X and y, and test dataset Xtest and ytest
#' N <- 200
#' K <- 2
#' X <- data.frame()
#' y <- data.frame()
#' for (j in (1:K)){
#'   t <- seq(-.25,.25,length.out = N)
#'   if (j==1) m <- stats::rnorm(N,-.2,.1)
#'   if (j==2) m <- stats::rnorm(N, .2,.1)
#'   Xtemp <- data.frame(x1 = 3*t , x2 = m - t)
#'   ytemp <- data.frame(matrix(j-1, N, 1))
#'   X <- rbind(X, Xtemp)
#'   y <- rbind(y, ytemp)
#' }
#' Xtest <- X[seq(1,(N*K),10),]
#' ytest <- y[seq(1,(N*K),10),,drop=FALSE]
#' X <- X[-seq(1,(N*K),10),]
#' y <- y[-seq(1,(N*K),10),,drop=FALSE]
#' y <- as.matrix(y)
#' weights <- rep(1, nrow(y)) # Uniform weighting
#' weights[nrow(y)] <- 0.5 # half weight for last observation
#' wub <- 1 # Public upper bound for weights
#'
#' # Grid of possible gamma values for tuning the SVM model
#' grid.search <- c(100, 1, .0001)
#'
#' # Construct objects for SVM parameter tuning
#' eps <- 1 # Privacy budget should be the same for all models
#' svmdp1 <- svmDP$new("l2", eps, grid.search[1], perturbation.method='output')
#' svmdp2 <- svmDP$new("l2", eps, grid.search[2], perturbation.method='output')
#' svmdp3 <- svmDP$new("l2", eps, grid.search[3], perturbation.method='output')
#' models <- c(svmdp1, svmdp2, svmdp3)
#'
#' # Tune using data and bounds for X based on its construction
#' upper.bounds <- c( 1, 1)
#' lower.bounds <- c(-1,-1)
#' tuned.model <- tune_classification_model(models, X, y, upper.bounds,
#'                                          lower.bounds, weights=weights,
#'                                          weights.upper.bound=wub)
#' tuned.model$gamma # Gives resulting selected hyperparameter
#'
#' # tuned.model result can be used the same as a trained svmDP model
#' # Predict new data points
#' predicted.y <- tuned.model$predict(Xtest)
#' n.errors <- sum(predicted.y!=ytest)
#'
#' @references \insertRef{chaudhuri2011}{DPpack}
#'
#' @export
tune_classification_model <- function(models, X, y, upper.bounds, lower.bounds,
                                      add.bias=FALSE, weights=NULL,
                                      weights.upper.bound=NULL){
  # Get number of models and observations
  m <- length(models)
  n <- length(y)
  # Split data into m+1 groups
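  # Row j is assigned to group ((j-1) %% (m+1)) + 1: group m+1 is held out
  # for validation, and group i (for i in 1:m) is used to train model i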
  validateX <- X[seq(m+1,n,m+1),,drop=FALSE]
  validatey <- y[seq(m+1,n,m+1)]
  z <- numeric(m)
  for (i in 1:m){
    subX <- X[seq(i,n,m+1),,drop=FALSE]
    suby <- y[seq(i,n,m+1)]
    if (!is.null(weights)) subWeights <- weights[seq(i,n,m+1)]
    if (is.null(weights)){
      models[[i]]$fit(subX,suby,upper.bounds,lower.bounds,add.bias)
    } else{
      models[[i]]$fit(subX,suby,upper.bounds,lower.bounds,add.bias,
                      weights=subWeights, weights.upper.bound=weights.upper.bound)
    }
    validatey.hat <- models[[i]]$predict(validateX,add.bias,raw.value=FALSE)
    z[i] <- sum(validatey!=validatey.hat)
  }
  res <- ExponentialMechanism(-z, models[[1]]$eps, 1, candidates=models)
  res[[1]]
}

#' Privacy-preserving Empirical Risk Minimization for Binary Classification
#'
#' @description This class implements differentially private empirical risk
#'   minimization \insertCite{chaudhuri2011}{DPpack}. Either the output or the
#'   objective perturbation method can be used. It is intended to be a framework
#'   for building more specific models via inheritance. See
#'   \code{\link{LogisticRegressionDP}} for an example of this type of
#'   structure.
#'
#' @details To use this class for empirical risk minimization, first use the
#'   \code{new} method to construct an object of this class with the desired
#'   function values and hyperparameters. After constructing the object, the
#'   \code{fit} method can be applied with a provided dataset and data bounds to
#'   fit the model.  In fitting, the model stores a vector of coefficients
#'   \code{coeff} which satisfy differential privacy. These can be released
#'   directly, or used in conjunction with the \code{predict} method to
#'   privately predict the outcomes of new datapoints.
#'
#'   Note that in order to guarantee differential privacy for empirical risk
#'   minimization, certain constraints must be satisfied for the values used to
#'   construct the object, as well as for the data used to fit. These conditions
#'   depend on the chosen perturbation method. Specifically, the provided loss
#'   function must be convex and differentiable with respect to \code{y.hat},
#'   and the absolute value of the first derivative of the loss function must be
#'   at most 1. If objective perturbation is chosen, the loss function must also
#'   be doubly differentiable and the absolute value of the second derivative of
#'   the loss function must be bounded above by a constant c for all possible
#'   values of \code{y.hat} and \code{y}, where \code{y.hat} is the predicted
#'   label and \code{y} is the true label. The regularizer must be 1-strongly
#'   convex and differentiable. It also must be doubly differentiable if
#'   objective perturbation is chosen. Finally, it is assumed that if x
#'   represents a single row of the dataset X, then the l2-norm of x is at most
#'   1 for all x. Note that because of this, a bias term cannot be included
#'   without appropriate scaling/preprocessing of the dataset. To ensure
#'   privacy, the add.bias argument in the \code{fit} and \code{predict} methods
#'   should only be utilized in subclasses within this package where appropriate
#'   preprocessing is implemented, not in this class.
#'
#' @references \insertRef{chaudhuri2011}{DPpack}
#'
#' @examples
#' # Build train dataset X and y, and test dataset Xtest and ytest
#' N <- 200
#' K <- 2
#' X <- data.frame()
#' y <- data.frame()
#' for (j in (1:K)){
#'   t <- seq(-.25, .25, length.out = N)
#'   if (j==1) m <- stats::rnorm(N,-.2, .1)
#'   if (j==2) m <- stats::rnorm(N, .2, .1)
#'   Xtemp <- data.frame(x1 = 3*t , x2 = m - t)
#'   ytemp <- data.frame(matrix(j-1, N, 1))
#'   X <- rbind(X, Xtemp)
#'   y <- rbind(y, ytemp)
#' }
#' Xtest <- X[seq(1,(N*K),10),]
#' ytest <- y[seq(1,(N*K),10),,drop=FALSE]
#' X <- X[-seq(1,(N*K),10),]
#' y <- y[-seq(1,(N*K),10),,drop=FALSE]
#'
#' # Construct object for logistic regression
#' mapXy <- function(X, coeff) e1071::sigmoid(X%*%coeff)
#' # Cross entropy loss
#' loss <- function(y.hat,y) -(y*log(y.hat) + (1-y)*log(1-y.hat))
#' regularizer <- 'l2' # Alternatively, function(coeff) coeff%*%coeff/2
#' eps <- 1
#' gamma <- 1
#' perturbation.method <- 'objective'
#' c <- 1/4 # Required value for logistic regression
#' mapXy.gr <- function(X, coeff) as.numeric(e1071::dsigmoid(X%*%coeff))*t(X)
#' loss.gr <- function(y.hat, y) -y/y.hat + (1-y)/(1-y.hat)
#' regularizer.gr <- function(coeff) coeff
#' ermdp <- EmpiricalRiskMinimizationDP.CMS$new(mapXy, loss, regularizer, eps,
#'                                              gamma, perturbation.method, c,
#'                                              mapXy.gr, loss.gr,
#'                                              regularizer.gr)
#'
#' # Fit with data
#' # Bounds for X based on construction
#' upper.bounds <- c( 1, 1)
#' lower.bounds <- c(-1,-1)
#' ermdp$fit(X, y, upper.bounds, lower.bounds) # No bias term
#' ermdp$coeff # Gets private coefficients
#'
#' # Predict new data points
#' predicted.y <- ermdp$predict(Xtest)
#' n.errors <- sum(round(predicted.y)!=ytest)
#'
#' @keywords internal
#'
#' @export
EmpiricalRiskMinimizationDP.CMS <- R6::R6Class("EmpiricalRiskMinimizationDP.CMS",
  public=list(
  #' @field mapXy Map function of the form \code{mapXy(X, coeff)} mapping input
  #'   data matrix \code{X} and coefficient vector or matrix \code{coeff} to
  #'   output labels \code{y}.
  mapXy = NULL,
  #' @field mapXy.gr Function representing the gradient of the map function with
  #'   respect to the values in \code{coeff} and of the form \code{mapXy.gr(X,
  #'   coeff)}, where \code{X} is a matrix and \code{coeff} is a matrix or
  #'   numeric vector.
  mapXy.gr = NULL,
  #' @field loss Loss function of the form \code{loss(y.hat, y)}, where
  #'   \code{y.hat} and \code{y} are matrices.
  loss = NULL,
  #' @field loss.gr Function representing the gradient of the loss function with
  #'   respect to \code{y.hat} and of the form \code{loss.gr(y.hat, y)}, where
  #'   \code{y.hat} and \code{y} are matrices.
  loss.gr = NULL,
  #' @field regularizer Regularization function of the form
  #'   \code{regularizer(coeff)}, where \code{coeff} is a vector or matrix.
  regularizer = NULL,
  #' @field regularizer.gr Function representing the gradient of the
  #'   regularization function with respect to \code{coeff} and of the form
  #'   \code{regularizer.gr(coeff)}.
  regularizer.gr = NULL,
  #' @field gamma Nonnegative real number representing the regularization
  #'   constant.
  gamma = NULL,
  #' @field eps Positive real number defining the epsilon privacy budget. If set
  #'   to Inf, runs algorithm without differential privacy.
  eps = NULL,
  #' @field perturbation.method String indicating whether to use the 'output' or
  #'   the 'objective' perturbation methods \insertCite{chaudhuri2011}{DPpack}.
  perturbation.method = NULL,
  #' @field c Positive real number denoting the upper bound on the absolute
  #'   value of the second derivative of the loss function, as required to
  #'   ensure differential privacy for the objective perturbation method.
  c = NULL,
  #' @field coeff Numeric vector of coefficients for the model.
  coeff = NULL,
  #' @field kernel Value only used in child class \code{\link{svmDP}}. String
  #'   indicating which kernel to use for SVM. Must be one of {'linear',
  #'   'Gaussian'}. If 'linear' (default), linear SVM is used. If 'Gaussian',
  #'   the sampling function corresponding to the Gaussian (radial) kernel
  #'   approximation is used.
  kernel = NULL,
  #' @field D Value only used in child class \code{\link{svmDP}}. Nonnegative
  #'   integer indicating the dimensionality of the transform space
  #'   approximating the kernel. Higher values of \code{D} provide better kernel
  #'   approximations at a cost of computational efficiency.
  D = NULL,
  #' @field sampling Value only used in child class \code{\link{svmDP}}.
  #'   Sampling function of the form \code{sampling(d)}, where \code{d} is the
  #'   input dimension, returning a (\code{d}+1)-dimensional vector of samples
  #'   corresponding to the Fourier transform of the kernel to be approximated.
  sampling=NULL,
  #' @field phi Value only used in child class \code{\link{svmDP}}. Function of
  #'   the form \code{phi(x, theta)}, where \code{x} is an individual row of the
  #'   original dataset, and theta is a (\code{d}+1)-dimensional vector sampled
  #'   from the Fourier transform of the kernel to be approximated, where
  #'   \code{d} is the dimension of \code{x}. The function returns a numeric
  #'   scalar corresponding to the pre-filtered value at the given row with the
  #'   given sampled vector.
  phi=NULL,
  #' @field kernel.param Value only used in child class \code{\link{svmDP}}.
  #'   Positive real number corresponding to the Gaussian kernel parameter.
  kernel.param=NULL,
  #' @field prefilter Value only used in child class \code{\link{svmDP}}. Matrix
  #'   of pre-filter values used in converting data into transform space.
  prefilter=NULL,
  #' @description Create a new \code{EmpiricalRiskMinimizationDP.CMS} object.
  #' @param mapXy Map function of the form \code{mapXy(X, coeff)} mapping input
  #'   data matrix \code{X} and coefficient vector or matrix \code{coeff} to
  #'   output labels \code{y}. Should return a column matrix of predicted labels
  #'   for each row of \code{X}. See \code{\link{mapXy.sigmoid}} for an example.
  #' @param loss Loss function of the form \code{loss(y.hat, y)}, where
  #'   \code{y.hat} and \code{y} are matrices. Should be defined such that it
  #'   returns a matrix of loss values for each element of \code{y.hat} and
  #'   \code{y}. See \code{\link{loss.cross.entropy}} for an example. It must be
  #'   convex and differentiable, and the absolute value of the first derivative
  #'   of the loss function must be at most 1. Additionally, if the objective
  #'   perturbation method is chosen, it must be doubly differentiable and the
  #'   absolute value of the second derivative of the loss function must be
  #'   bounded above by a constant c for all possible values of \code{y.hat} and
  #'   \code{y}.
  #' @param regularizer String or regularization function. If a string, must be
  #'   'l2', indicating to use l2 regularization. If a function, must have form
  #'   \code{regularizer(coeff)}, where \code{coeff} is a vector or matrix, and
  #'   return the value of the regularizer at \code{coeff}. See
  #'   \code{\link{regularizer.l2}} for an example. Additionally, in order to
  #'   ensure differential privacy, the function must be 1-strongly convex and
  #'   differentiable. If the objective perturbation method is chosen, it must
  #'   also be doubly differentiable.
  #' @param eps Positive real number defining the epsilon privacy budget. If set
  #'   to Inf, runs algorithm without differential privacy.
  #' @param gamma Nonnegative real number representing the regularization
  #'   constant.
  #' @param perturbation.method String indicating whether to use the 'output' or
  #'   the 'objective' perturbation methods \insertCite{chaudhuri2011}{DPpack}.
  #'   Defaults to 'objective'.
  #' @param c Positive real number denoting the upper bound on the absolute
  #'   value of the second derivative of the loss function, as required to
  #'   ensure differential privacy for the objective perturbation method. This
  #'   input is unnecessary if perturbation.method is 'output', but is required
  #'   if perturbation.method is 'objective'. Defaults to NULL.
  #' @param mapXy.gr Optional function representing the gradient of the map
  #'   function with respect to the values in \code{coeff}. If given, must be of
  #'   the form \code{mapXy.gr(X, coeff)}, where \code{X} is a matrix and
  #'   \code{coeff} is a matrix or numeric vector. Should be defined such that
  #'   the ith row of the output represents the gradient with respect to the ith
  #'   coefficient. See \code{\link{mapXy.gr.sigmoid}} for an example. If not
  #'   given, non-gradient based optimization methods are used to compute the
  #'   coefficient values in fitting the model.
  #' @param loss.gr Optional function representing the gradient of the loss
  #'   function with respect to \code{y.hat} and of the form
  #'   \code{loss.gr(y.hat, y)}, where \code{y.hat} and \code{y} are matrices.
  #'   Should be defined such that the ith row of the output represents the
  #'   gradient of the loss function at the ith set of input values. See
  #'   \code{\link{loss.gr.cross.entropy}} for an example. If not given,
  #'   non-gradient based optimization methods are used to compute the
  #'   coefficient values in fitting the model.
  #' @param regularizer.gr Optional function representing the gradient of the
  #'   regularization function with respect to \code{coeff} and of the form
  #'   \code{regularizer.gr(coeff)}. Should return a vector. See
  #'   \code{\link{regularizer.gr.l2}} for an example. If \code{regularizer} is
  #'   given as a string, this value is ignored. If not given and
  #'   \code{regularizer} is a function, non-gradient based optimization methods
  #'   are used to compute the coefficient values in fitting the model.
  #'
  #' @return A new \code{EmpiricalRiskMinimizationDP.CMS} object.
  initialize = function(mapXy, loss, regularizer, eps, gamma,
                        perturbation.method = 'objective', c = NULL,
                        mapXy.gr = NULL, loss.gr = NULL, regularizer.gr = NULL){
    self$mapXy <- mapXy
    self$mapXy.gr <- mapXy.gr
    self$loss <- loss
    self$loss.gr <- loss.gr
    if (is.character(regularizer)){
      if (regularizer == 'l2') {
        self$regularizer <- regularizer.l2
        self$regularizer.gr <- regularizer.gr.l2
      }
      else stop("regularizer must be one of {'l2'}, or a function.")
    }
    else {
      self$regularizer <- regularizer
      self$regularizer.gr <- regularizer.gr
    }
    self$eps <- eps
    self$gamma <- gamma
    if (perturbation.method != 'output' & perturbation.method != 'objective'){
      stop("perturbation.method must be one of 'output' or 'objective'.")
    }
    self$perturbation.method <- perturbation.method
    if (perturbation.method == 'objective' & is.null(c)){
      stop("c must be given for 'objective' perturbation.method.")
    }
    self$c <- c
  },
  #' @description Fit the differentially private empirical risk minimization
  #'   model. This method runs either the output perturbation or the objective
  #'   perturbation algorithm \insertCite{chaudhuri2011}{DPpack}, depending on
  #'   the value of perturbation.method used to construct the object, to
  #'   generate an objective function. A numerical optimization method is then
  #'   run to find optimal coefficients for fitting the model given the training
  #'   data and hyperparameters. The built-in \code{\link{optim}} function using
  #'   the "BFGS" optimization method is used. If \code{mapXy.gr},
  #'   \code{loss.gr}, and \code{regularizer.gr} are all given in the
  #'   construction of the object, the gradient of the objective function is
  #'   utilized by \code{optim} as well. Otherwise, non-gradient based
  #'   optimization methods are used. The resulting privacy-preserving
  #'   coefficients are stored in \code{coeff}.
  #' @param X Dataframe of data to be fit.
  #' @param y Vector or matrix of true labels for each row of \code{X}.
  #' @param upper.bounds Numeric vector of length \code{ncol(X)} giving upper
  #'   bounds on the values in each column of X. The \code{ncol(X)} values are
  #'   assumed to be in the same order as the corresponding columns of \code{X}.
  #'   Any value in the columns of \code{X} larger than the corresponding upper
  #'   bound is clipped at the bound.
  #' @param lower.bounds Numeric vector of length \code{ncol(X)} giving lower
  #'   bounds on the values in each column of \code{X}. The \code{ncol(X)}
  #'   values are assumed to be in the same order as the corresponding columns
  #'   of \code{X}. Any value in the columns of \code{X} smaller than the
  #'   corresponding lower bound is clipped at the bound.
  #' @param add.bias Boolean indicating whether to add a bias term to \code{X}.
  #'   Defaults to FALSE.
  fit = function(X, y, upper.bounds, lower.bounds, add.bias=FALSE){
    preprocess <- private$preprocess_data(X,y,upper.bounds,lower.bounds,
                                            add.bias)
    X <- preprocess$X
    y <- preprocess$y

    n <- length(y)
    d <- ncol(X)
    if (!is.infinite(self$eps) & self$perturbation.method=='objective'){
      # eps.prime <- self$eps - log(1 + 2*self$c/(n*self$gamma) +
      #                               self$c^2/(n^2*self$gamma^2))
      # NOTE: Gamma from Chaudhuri's paper is gamma/n in this implementation
      eps.prime <- self$eps - log(1 + 2*self$c/self$gamma +
                                    self$c^2/self$gamma^2)
      if (eps.prime > 0) {
        Delta <- 0
      }
      else {
        # Delta <- self$c/(n*(exp(self$eps/4) - 1)) - self$gamma
        # NOTE: Gamma from Chaudhuri's paper is gamma/n in this implementation
        Delta <- self$c/(n*(exp(self$eps/4) - 1)) - self$gamma/n
        eps.prime <- self$eps/2
      }
      beta <- eps.prime/2
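      # Draw noise vector b with density proportional to exp(-beta*||b||):
      # a Gamma(d, rate=beta) magnitude times a uniformly random direction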
      norm.b <- rgamma(1, d, rate=beta)
      direction.b <- stats::rnorm(d);
      direction.b <- direction.b/sqrt(sum(direction.b^2))
      b <- norm.b * direction.b
    } else {
      Delta <- 0
      b <- numeric(d)
    }

    tmp.coeff <- private$optimize_coeff(X, y, Delta, b)
    if (self$perturbation.method=='output'){
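      # Output perturbation: add noise with density proportional to
      # exp(-beta*||b||) to the non-private optimized coefficients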
      # beta <- n*self$gamma*self$eps/2
      # NOTE: Gamma from Chaudhuri's paper is gamma/n in this implementation
      beta <- self$gamma*self$eps/2
      norm.b <- rgamma(1, d, rate=beta)
      direction.b <- stats::rnorm(d)
      direction.b <- direction.b/sqrt(sum(direction.b^2))
      b <- norm.b * direction.b
      tmp.coeff <- tmp.coeff + b
    }
    self$coeff <- private$postprocess_coeff(tmp.coeff, preprocess)
    invisible(self)
  },
  #' @description Predict label(s) for given \code{X} using the fitted
  #'   coefficients.
  #' @param X Dataframe of data on which to make predictions. Must be of same
  #'   form as \code{X} used to fit coefficients.
  #' @param add.bias Boolean indicating whether to add a bias term to \code{X}.
  #'   Defaults to FALSE. If add.bias was set to TRUE when fitting the
  #'   coefficients, add.bias should be set to TRUE for predictions.
  #'
  #' @return Matrix of predicted labels corresponding to each row of \code{X}.
  predict = function(X, add.bias=FALSE){
    if (add.bias){
      X <- dplyr::mutate(X, bias=1)
      X <- X[, c(ncol(X), 1:(ncol(X)-1))]
    }
    self$mapXy(as.matrix(X), self$coeff)
  }
), private = list(
  # description Preprocess input data and bounds to ensure they meet the
  #   assumptions necessary for fitting the model. If desired, a bias term can
  #   also be added.
  # param X Dataframe of data to be fit. Will be converted to a matrix.
  # param y Vector or matrix of true labels for each row of X. Will be
  #   converted to a matrix.
  # param upper.bounds Numeric vector of length ncol(X) giving upper bounds on
  #   the values in each column of X. The ncol(X) values are assumed to be in
  #   the same order as the corresponding columns of X. Any value in the
  #   columns of X larger than the corresponding upper bound is clipped at the
  #   bound.
  # param lower.bounds Numeric vector of length ncol(X) giving lower bounds on
  #   the values in each column of X. The ncol(X) values are assumed to be in
  #   the same order as the corresponding columns of X. Any value in the
  #   columns of X smaller than the corresponding lower bound is clipped at
  #   the bound.
  # param add.bias Boolean indicating whether to add a bias term to X.
  # param weights Numeric vector of observation weights of the same length as
  #   \code{y}.
  # param weights.upper.bound Numeric value representing the global or public
  #   upper bound on the weights.
  #
  # return A list of preprocessed values for X, y, upper.bounds, and
  #   lower.bounds for use in the privacy-preserving empirical risk
  #   minimization algorithm.
  preprocess_data = function(X, y, upper.bounds, lower.bounds, add.bias,
                             weights=NULL, weights.upper.bound=NULL){
    # Make sure values are correct
    if (length(upper.bounds)!=ncol(X)) {
      stop("Length of upper.bounds must be equal to the number of columns of X.");
    }
    if (length(lower.bounds)!=ncol(X)) {
      stop("Length of lower.bounds must be equal to the number of columns of X.");
    }

    # Add bias if needed (highly recommended to not do this due to unwanted
    #       regularization of bias term)
    if (add.bias){
      X <- dplyr::mutate(X, bias=1)
      X <- X[, c(ncol(X), 1:(ncol(X)-1))]
      upper.bounds <- c(1,upper.bounds)
      lower.bounds <- c(1,lower.bounds)
    }

    # Make matrices for multiplication purposes
    X <- as.matrix(X)
    y <- as.matrix(y)

    # Clip based on provided bounds
    for (i in 1:length(upper.bounds)){
      X[X[,i]>upper.bounds[i],i] <- upper.bounds[i]
      X[X[,i]<lower.bounds[i],i] <- lower.bounds[i]
    }

    list(X=X, y=y, upper.bounds=upper.bounds, lower.bounds=lower.bounds)
  },
  # description Postprocess coefficients obtained by fitting the model using
  #   differential privacy to ensure they match original inputs X and y.
  #   Effectively undoes the processing done in preprocess_data.
  # param coeff Vector of coefficients obtained by fitting the model.
  # param preprocess List of values returned by preprocess_data and used to
  #   process coeff.
  # return The processed coefficient vector.
  postprocess_coeff = function(coeff, preprocess){
    coeff
  },
  # description Run numerical optimization method to find optimal coefficients
  #   for fitting model given training data and hyperparameters. This function
  #   builds the objective function based on the training data, and runs the
  #   built-in optim function using the "BFGS" optimization method. If mapXy.gr,
  #   loss.gr, and regularizer.gr are all given in the construction of the
  #   object, the gradient of the objective function is utilized by optim as
  #   well. Otherwise, non-gradient based methods are used.
  # param X Matrix of data to be fit.
  # param y Vector or matrix of true labels for each row of X.
  # param Delta Nonnegative real number set by the differentially private
  #   algorithm and required for constructing the objective function.
  # param b Numeric perturbation vector randomly drawn by the differentially
  #   private algorithm and required for constructing the objective function.
  # return Vector of fitted coefficients.
  optimize_coeff=function(X, y, Delta, b){
    n <- length(y)
    d <- ncol(X)

    # Get objective function
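    # Perturbed objective: J(par) = sum(loss)/n + gamma*regularizer(par)/n
    #                               + t(b)%*%par/n + Delta*||par||^2/2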
    objective <- function(par, X, y, Delta, b){
      # NOTE: Gamma from Chaudhuri's paper is gamma/n in this implementation
      as.numeric(sum(self$loss(self$mapXy(X,par),y))/n +
                   self$gamma*self$regularizer(par)/n + t(b)%*%par/n +
                   Delta*par%*%par/2)
    }

    # Get gradient function
    if (!is.null(self$mapXy.gr) && !is.null(self$loss.gr) &&
        !is.null(self$regularizer.gr)) {
      objective.gr <- function(par, X, y, Delta, b){
        # NOTE: Gamma from Chaudhuri's paper is gamma/n in this implementation
        as.numeric(self$mapXy.gr(X,par)%*%self$loss.gr(self$mapXy(X,par),y)/n +
                     self$gamma*self$regularizer.gr(par)/n + b/n + Delta*par)
      }
    }
    else objective.gr <- NULL

    # Run optimization
    coeff0 <- numeric(ncol(X))
    opt.res <- optim(coeff0, fn=objective, gr=objective.gr, method="BFGS",
                     X=X, y=y, Delta=Delta, b=b)
    opt.res$par
  }
))

#' Privacy-preserving Weighted Empirical Risk Minimization
#'
#' @description This class implements differentially private empirical risk
#'   minimization in the case where weighted observation-level losses are
#'   desired (such as weighted SVM \insertCite{Yang2005}{DPpack}). Currently,
#'   only the output perturbation method is implemented.
#'
#' @details To use this class for weighted empirical risk minimization, first
#'   use the \code{new} method to construct an object of this class with the
#'   desired function values and hyperparameters. After constructing the object,
#'   the \code{fit} method can be applied with a provided dataset, data bounds,
#'   weights, and weight bounds to fit the model. In fitting, the model stores a
#'   vector of coefficients \code{coeff} which satisfy differential privacy.
#'   These can be released directly, or used in conjunction with the
#'   \code{predict} method to privately predict the outcomes of new datapoints.
#'
#'   Note that in order to guarantee differential privacy for weighted empirical
#'   risk minimization, certain constraints must be satisfied for the values
#'   used to construct the object, as well as for the data used to fit. These
#'   conditions depend on the chosen perturbation method, though currently only
#'   output perturbation is implemented. Specifically, the provided loss
#'   function must be convex and differentiable with respect to \code{y.hat},
#'   and the absolute value of the first derivative of the loss function must be
#'   at most 1. If objective perturbation is chosen (not currently implemented),
#'   the loss function must also be doubly differentiable and the absolute value
#'   of the second derivative of the loss function must be bounded above by a
#'   constant c for all possible values of \code{y.hat} and \code{y}, where
#'   \code{y.hat} is the predicted label and \code{y} is the true label. The
#'   regularizer must be 1-strongly convex and differentiable. It also must be
#'   doubly differentiable if objective perturbation is chosen. For the data x,
#'   it is assumed that if x represents a single row of the dataset X, then the
#'   l2-norm of x is at most 1 for all x. Note that because of this, a bias term
#'   cannot be included without appropriate scaling/preprocessing of the
#'   dataset. To ensure privacy, the add.bias argument in the \code{fit} and
#'   \code{predict} methods should only be utilized in subclasses within this
#'   package where appropriate preprocessing is implemented, not in this class.
#'   Finally, if weights are provided, they should be nonnegative, of the same
#'   length as y, and be upper bounded by a global or public bound which must
#'   also be provided.
#'
#' @references \insertRef{chaudhuri2011}{DPpack}
#'
#'   \insertRef{Yang2005}{DPpack}
#'
#' @examples
#' # Build train dataset X and y, and test dataset Xtest and ytest
#' N <- 200
#' K <- 2
#' X <- data.frame()
#' y <- data.frame()
#' for (j in (1:K)){
#'   t <- seq(-.25, .25, length.out = N)
#'   if (j==1) m <- stats::rnorm(N,-.2, .1)
#'   if (j==2) m <- stats::rnorm(N, .2, .1)
#'   Xtemp <- data.frame(x1 = 3*t , x2 = m - t)
#'   ytemp <- data.frame(matrix(j-1, N, 1))
#'   X <- rbind(X, Xtemp)
#'   y <- rbind(y, ytemp)
#' }
#' Xtest <- X[seq(1,(N*K),10),]
#' ytest <- y[seq(1,(N*K),10),,drop=FALSE]
#' X <- X[-seq(1,(N*K),10),]
#' y <- y[-seq(1,(N*K),10),,drop=FALSE]
#'
#' # Construct object for weighted linear SVM
#' mapXy <- function(X, coeff) X%*%coeff
#' # Huber loss from DPpack
#' huber.h <- 0.5
#' loss <- generate.loss.huber(huber.h)
#' regularizer <- 'l2' # Alternatively, function(coeff) coeff%*%coeff/2
#' eps <- 1
#' gamma <- 1
#' perturbation.method <- 'output'
#' c <- 1/(2*huber.h) # Required value for SVM
#' mapXy.gr <- function(X, coeff) t(X)
#' loss.gr <- generate.loss.gr.huber(huber.h)
#' regularizer.gr <- function(coeff) coeff
#' wermdp <- WeightedERMDP.CMS$new(mapXy, loss, regularizer, eps,
#'                                 gamma, perturbation.method, c,
#'                                 mapXy.gr, loss.gr,
#'                                 regularizer.gr)
#'
#' # Fit with data
#' # Bounds for X based on construction
#' upper.bounds <- c( 1, 1)
#' lower.bounds <- c(-1,-1)
#' weights <- rep(1, nrow(y)) # Uniform weighting
#' weights[nrow(y)] <- 0.5 # half weight for last observation
#' wub <- 1 # Public upper bound for weights
#' wermdp$fit(X, y, upper.bounds, lower.bounds, weights=weights,
#'            weights.upper.bound=wub)
#' wermdp$coeff # Gets private coefficients
#'
#' # Predict new data points
#' predicted.y <- wermdp$predict(Xtest)
#' n.errors <- sum(round(predicted.y)!=ytest)
#'
#' @keywords internal
#'
#' @export
WeightedERMDP.CMS <- R6::R6Class("WeightedERMDP.CMS",
  inherit=EmpiricalRiskMinimizationDP.CMS,
  public=list(
  #' @description Create a new \code{WeightedERMDP.CMS} object.
  #' @param mapXy Map function of the form \code{mapXy(X, coeff)} mapping input
  #'   data matrix \code{X} and coefficient vector or matrix \code{coeff} to
  #'   output labels \code{y}. Should return a column matrix of predicted labels
  #'   for each row of \code{X}. See \code{\link{mapXy.sigmoid}} for an example.
  #' @param loss Loss function of the form \code{loss(y.hat, y, w)}, where
  #'   \code{y.hat} and \code{y} are matrices and \code{w} is a matrix or vector
  #'   of weights of the same length as \code{y}. Should be defined such that it
  #'   returns a matrix of weighted loss values for each element of \code{y.hat}
  #'   and \code{y}. If \code{w} is not given, the function should operate as if
  #'   uniform weights were given. See \code{\link{generate.loss.huber}} for an
  #'   example. It must be convex and differentiable, and the absolute value of
  #'   the first derivative of the loss function must be at most 1.
  #'   Additionally, if the objective perturbation method is chosen, it must be
  #'   doubly differentiable and the absolute value of the second derivative of
  #'   the loss function must be bounded above by a constant c for all possible
  #'   values of \code{y.hat} and \code{y}.
  #' @param regularizer String or regularization function. If a string, must be
  #'   'l2', indicating to use l2 regularization. If a function, must have form
  #'   \code{regularizer(coeff)}, where \code{coeff} is a vector or matrix, and
  #'   return the value of the regularizer at \code{coeff}. See
  #'   \code{\link{regularizer.l2}} for an example. Additionally, in order to
  #'   ensure differential privacy, the function must be 1-strongly convex and
  #'   differentiable. If the objective perturbation method is chosen, it must
  #'   also be doubly differentiable.
  #' @param eps Positive real number defining the epsilon privacy budget. If set
  #'   to Inf, runs algorithm without differential privacy.
  #' @param gamma Nonnegative real number representing the regularization
  #'   constant.
  #' @param perturbation.method String indicating whether to use the 'output' or
  #'   the 'objective' perturbation methods \insertCite{chaudhuri2011}{DPpack}.
  #'   Defaults to 'objective'. Currently, only the output perturbation method
  #'   is supported.
  #' @param c Positive real number denoting the upper bound on the absolute
  #'   value of the second derivative of the loss function, as required to
  #'   ensure differential privacy for the objective perturbation method. This
  #'   input is unnecessary if perturbation.method is 'output', but is required
  #'   if perturbation.method is 'objective'. Defaults to NULL.
  #' @param mapXy.gr Optional function representing the gradient of the map
  #'   function with respect to the values in \code{coeff}. If given, must be of
  #'   the form \code{mapXy.gr(X, coeff)}, where \code{X} is a matrix and
  #'   \code{coeff} is a matrix or numeric vector. Should be defined such that
  #'   the ith row of the output represents the gradient with respect to the ith
  #'   coefficient. See \code{\link{mapXy.gr.sigmoid}} for an example. If not
  #'   given, non-gradient based optimization methods are used to compute the
  #'   coefficient values in fitting the model.
  #' @param loss.gr Optional function representing the gradient of the loss
  #'   function with respect to \code{y.hat} and of the form
  #'   \code{loss.gr(y.hat, y, w)}, where \code{y.hat} and \code{y} are matrices
  #'   and \code{w} is a matrix or vector of weights. Should be defined such
  #'   that the ith row of the output represents the gradient of the (possibly
  #'   weighted) loss function at the ith set of input values. See
  #'   \code{\link{generate.loss.gr.huber}} for an example. If not given,
  #'   non-gradient based optimization methods are used to compute the
  #'   coefficient values in fitting the model.
  #' @param regularizer.gr Optional function representing the gradient of the
  #'   regularization function with respect to \code{coeff} and of the form
  #'   \code{regularizer.gr(coeff)}. Should return a vector. See
  #'   \code{\link{regularizer.gr.l2}} for an example. If \code{regularizer} is
  #'   given as a string, this value is ignored. If not given and
  #'   \code{regularizer} is a function, non-gradient based optimization methods
  #'   are used to compute the coefficient values in fitting the model.
  #'
  #' @return A new \code{WeightedERMDP.CMS} object.
  initialize = function(mapXy, loss, regularizer, eps, gamma,
                        perturbation.method = 'objective', c = NULL,
                        mapXy.gr = NULL, loss.gr = NULL, regularizer.gr = NULL){
    super$initialize(mapXy, loss, regularizer, eps, gamma, perturbation.method,
                     c, mapXy.gr, loss.gr, regularizer.gr)
  },
  #' @description Fit the differentially private weighted empirical risk
  #'   minimization model. This method runs either the output perturbation or
  #'   the objective perturbation algorithm \insertCite{chaudhuri2011}{DPpack}
  #'   (only output is currently implemented), depending on the value of
  #'   perturbation.method used to construct the object, to generate an
  #'   objective function. A numerical optimization method is then run to find
  #'   optimal coefficients for fitting the model given the training data,
  #'   weights, and hyperparameters. The built-in \code{\link{optim}} function
  #'   using the "BFGS" optimization method is used. If \code{mapXy.gr},
  #'   \code{loss.gr}, and \code{regularizer.gr} are all given in the
  #'   construction of the object, the gradient of the objective function is
  #'   utilized by \code{optim} as well. Otherwise, non-gradient based
  #'   optimization methods are used. The resulting privacy-preserving
  #'   coefficients are stored in \code{coeff}.
  #' @param X Dataframe of data to be fit.
  #' @param y Vector or matrix of true labels for each row of \code{X}.
  #' @param upper.bounds Numeric vector of length \code{ncol(X)} giving upper
  #'   bounds on the values in each column of X. The \code{ncol(X)} values are
  #'   assumed to be in the same order as the corresponding columns of \code{X}.
  #'   Any value in the columns of \code{X} larger than the corresponding upper
  #'   bound is clipped at the bound.
  #' @param lower.bounds Numeric vector of length \code{ncol(X)} giving lower
  #'   bounds on the values in each column of \code{X}. The \code{ncol(X)}
  #'   values are assumed to be in the same order as the corresponding columns
  #'   of \code{X}. Any value in the columns of \code{X} smaller than the
  #'   corresponding lower bound is clipped at the bound.
  #' @param add.bias Boolean indicating whether to add a bias term to \code{X}.
  #'   Defaults to FALSE.
  #' @param weights Numeric vector of observation weights of the same length as
  #'   \code{y}.
  #' @param weights.upper.bound Numeric value representing the global or public
  #'   upper bound on the weights.
  fit = function(X, y, upper.bounds, lower.bounds, add.bias=FALSE, weights=NULL,
                 weights.upper.bound=NULL){
    if (!is.null(weights) & self$perturbation.method!="output"){
      stop(paste("Currently, weighting ERM samples only tested for SVM with output ",
                 "perturbation and should not be used with objective perturbation."))
    }
    preprocess <- private$preprocess_data(X,y,upper.bounds,lower.bounds,
                                          add.bias,weights,weights.upper.bound)
    X <- preprocess$X
    y <- preprocess$y
    weights <- preprocess$weights
    if (is.null(weights)) weights.upper.bound <- 1

    n <- length(y)
    d <- ncol(X)
    if (!is.infinite(self$eps) & self$perturbation.method=='objective'){
      # eps.prime <- self$eps - log(1 + 2*self$c/(n*self$gamma) +
      #                               self$c^2/(n^2*self$gamma^2))
      # NOTE: Gamma from Chaudhuri's paper is gamma/n in this implementation
      eps.prime <- self$eps - log(1 + 2*self$c/self$gamma +
                                    self$c^2/self$gamma^2)
      if (eps.prime > 0) {
        Delta <- 0
      }
      else {
        # Delta <- self$c/(n*(exp(self$eps/4) - 1)) - self$gamma
        # NOTE: Gamma from Chaudhuri's paper is gamma/n in this implementation
        Delta <- self$c/(n*(exp(self$eps/4) - 1)) - self$gamma/n
        eps.prime <- self$eps/2
      }
      beta <- eps.prime/2
      norm.b <- rgamma(1, d, rate=beta)
      direction.b <- stats::rnorm(d);
      direction.b <- direction.b/sqrt(sum(direction.b^2))
      b <- norm.b * direction.b
    } else {
      Delta <- 0
      b <- numeric(d)
    }

    tmp.coeff <- private$optimize_coeff(X, y, Delta, b, weights)
    if (self$perturbation.method=='output'){
      # beta <- n*self$gamma*self$eps/(2*weights.upper.bound)
      # NOTE: Gamma from Chaudhuri's paper is gamma/n in this implementation
      beta <- self$gamma*self$eps/(2*weights.upper.bound)
      norm.b <- rgamma(1, d, rate=beta)
      direction.b <- stats::rnorm(d)
      direction.b <- direction.b/sqrt(sum(direction.b^2))
      b <- norm.b * direction.b
      tmp.coeff <- tmp.coeff + b
    }
    self$coeff <- private$postprocess_coeff(tmp.coeff, preprocess)
    invisible(self)
  },
  #' @description Predict label(s) for given \code{X} using the fitted
  #'   coefficients.
  #' @param X Dataframe of data on which to make predictions. Must be of same
  #'   form as \code{X} used to fit coefficients.
  #' @param add.bias Boolean indicating whether to add a bias term to \code{X}.
  #'   Defaults to FALSE. If add.bias was set to TRUE when fitting the
  #'   coefficients, add.bias should be set to TRUE for predictions.
  #'
  #' @return Matrix of predicted labels corresponding to each row of \code{X}.
  predict = function(X, add.bias=FALSE){
    super$predict(X, add.bias)
  }
), private=list(
  # description Preprocess input data and bounds to ensure they meet the
  #   assumptions necessary for fitting the model. If desired, a bias term can
  #   also be added.
  # param X Dataframe of data to be fit. Will be converted to a matrix.
  # param y Vector or matrix of true labels for each row of X. Will be
  #   converted to a matrix.
  # param upper.bounds Numeric vector of length ncol(X) giving upper bounds on
  #   the values in each column of X. The ncol(X) values are assumed to be in
  #   the same order as the corresponding columns of X. Any value in the
  #   columns of X larger than the corresponding upper bound is clipped at the
  #   bound.
  # param lower.bounds Numeric vector of length ncol(X) giving lower bounds on
  #   the values in each column of X. The ncol(X) values are assumed to be in
  #   the same order as the corresponding columns of X. Any value in the
  #   columns of X smaller than the corresponding lower bound is clipped at
  #   the bound.
  # param add.bias Boolean indicating whether to add a bias term to X.
  # param weights Numeric vector of observation weights of the same length as
  #   \code{y}.
  # param weights.upper.bound Numeric value representing the global or public
  #   upper bound on the weights.
  # return A list of preprocessed values for X, y, upper.bounds, and
  #   lower.bounds for use in the privacy-preserving empirical risk
  #   minimization algorithm.
  preprocess_data = function(X, y, upper.bounds, lower.bounds, add.bias,
                             weights=NULL, weights.upper.bound=NULL){
    res <- super$preprocess_data(X, y, upper.bounds, lower.bounds, add.bias)

    # Handle weights
    if (!is.null(weights)){
      if (is.null(weights.upper.bound)){
        stop("Upper bound on weights must be given if weights vector given.")
      }

      if (any(weights<0)) stop("Weights must be nonnegative.")

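      # Clip weights at the public upper bound; the output-perturbation noise
      # scale in fit() depends on weights.upper.bound, so unclipped weights
      # would invalidate the privacy guarantee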
      weights[weights>weights.upper.bound] <- weights.upper.bound
    }
    list(X=res$X, y=res$y, upper.bounds=res$upper.bounds,
         lower.bounds=res$lower.bounds, weights=weights)
  },
  # description Postprocess coefficients obtained by fitting the model using
  #   differential privacy to ensure they match original inputs X and y.
  #   Effectively undoes the processing done in preprocess_data.
  # param coeff Vector of coefficients obtained by fitting the model.
  # param preprocess List of values returned by preprocess_data and used to
  #   process coeff.
  # return The processed coefficient vector.
  postprocess_coeff = function(coeff, preprocess){
    super$postprocess_coeff(coeff, preprocess)
  },
  # description Run numerical optimization method to find optimal coefficients
  #   for fitting model given training data, observation weights, and
  #   hyperparameters. This function builds the objective function based on the
  #   training data and observation weights, and runs the built-in optim function
  #   using the "BFGS" optimization method. If mapXy.gr, loss.gr, and
  #   regularizer.gr are all given in the construction of the object, the gradient
  #   of the objective function is utilized by optim as well. Otherwise,
  #   non-gradient based methods are used.
  # param X Matrix of data to be fit.
  # param y Vector or matrix of true labels for each row of X.
  # param Delta Nonnegative real number set by the differentially private
  #   algorithm and required for constructing the objective function.
  # param b Numeric perturbation vector randomly drawn by the differentially
  #   private algorithm and required for constructing the objective function.
  # param weights Numeric vector of observation weights of the same length as
  #   \code{y}.
  # return Vector of fitted coefficients.
  optimize_coeff=function(X, y, Delta, b, weights){
    n <- length(y)
    d <- ncol(X)

    # Get objective function
    objective <- function(par, X, y, Delta, b, weights){
      # NOTE: Gamma from Chaudhuri's paper is gamma/n in this implementation
      as.numeric(sum(self$loss(self$mapXy(X,par),y, weights))/n +
                   self$gamma*self$regularizer(par)/n + t(b)%*%par/n +
                   Delta*par%*%par/2)
    }

    # Get gradient function
    if (!is.null(self$mapXy.gr) && !is.null(self$loss.gr) &&
        !is.null(self$regularizer.gr)) {
      # NOTE: Gamma from Chaudhuri's paper is gamma/n in this implementation
      objective.gr <- function(par, X, y, Delta, b, weights){
        as.numeric(self$mapXy.gr(X,par)%*%
                     self$loss.gr(self$mapXy(X,par), y, weights)/n +
                     self$gamma*self$regularizer.gr(par)/n + b/n + Delta*par)
      }
    }
    else objective.gr <- NULL

    # Run optimization
    coeff0 <- numeric(ncol(X))
    opt.res <- optim(coeff0, fn=objective, gr=objective.gr, method="BFGS",
                     X=X, y=y, Delta=Delta, b=b, weights=weights)
    opt.res$par
  }
))
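
# The standalone sketch below (not part of the package API) illustrates the
# perturbed objective assembled by optimize_coeff above: average weighted
# loss, scaled regularizer, linear perturbation term b, and quadratic Delta
# term, minimized via optim() with "BFGS". The squared-error loss, l2
# regularizer, and all numeric values are illustrative assumptions.
if (FALSE) {
  set.seed(1)
  n <- 100; d <- 3
  X <- matrix(stats::rnorm(n*d), nrow=n)
  y <- X%*%c(1, -1, .5) + stats::rnorm(n, sd=.1)
  weights <- stats::runif(n, .5, 1)  # hypothetical observation weights
  gamma <- 1                         # regularization constant
  Delta <- 0.1                       # set by the DP algorithm
  b <- stats::rnorm(d)               # perturbation vector drawn by the DP algorithm
  objective <- function(par){
    as.numeric(sum(weights*(X%*%par - y)^2/2)/n +  # weighted loss
                 gamma*(par%*%par/2)/n +           # l2 regularizer
                 t(b)%*%par/n +                    # linear perturbation
                 Delta*par%*%par/2)                # quadratic perturbation
  }
  optim(numeric(d), fn=objective, method="BFGS")$par
}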

#' Privacy-preserving Logistic Regression
#'
#' @description This class implements differentially private logistic regression
#'   \insertCite{chaudhuri2011}{DPpack}. Either the output or the objective
#'   perturbation method can be used.
#'
#' @details To use this class for logistic regression, first use the \code{new}
#'   method to construct an object of this class with the desired function
#'   values and hyperparameters. After constructing the object, the \code{fit}
#'   method can be applied with a provided dataset and data bounds to fit the
#'   model. In fitting, the model stores a vector of coefficients \code{coeff}
#'   which satisfy differential privacy. These can be released directly, or used
#'   in conjunction with the \code{predict} method to privately predict the
#'   outcomes of new datapoints.
#'
#'   Note that in order to guarantee differential privacy for logistic
#'   regression, certain constraints must be satisfied for the values used to
#'   construct the object, as well as for the data used to fit. These conditions
#'   depend on the chosen perturbation method. The regularizer must be
#'   1-strongly convex and differentiable. It also must be doubly differentiable
#'   if objective perturbation is chosen. Additionally, it is assumed that if x
#'   represents a single row of the dataset X, then the l2-norm of x is at most
#'   1 for all x. In order to ensure this constraint is satisfied, the dataset
#'   is preprocessed and scaled, and the resulting coefficients are
#'   postprocessed and un-scaled so that the stored coefficients correspond to
#'   the original data. Due to this constraint on x, it is best to avoid using a
#'   bias term in the model whenever possible. If a bias term must be used, the
#'   issue can be partially circumvented by adding a constant column to X before
#'   fitting the model, which will be scaled along with the rest of X. The
#'   \code{fit} method contains functionality to add a column of constant 1s to
#'   X before scaling, if desired.
#'
#' @references \insertRef{chaudhuri2011}{DPpack}
#'
#'   \insertRef{Chaudhuri2009}{DPpack}
#'
#' @examples
#' # Build train dataset X and y, and test dataset Xtest and ytest
#' N <- 200
#' K <- 2
#' X <- data.frame()
#' y <- data.frame()
#' for (j in (1:K)){
#'   t <- seq(-.25, .25, length.out = N)
#'   if (j==1) m <- stats::rnorm(N,-.2, .1)
#'   if (j==2) m <- stats::rnorm(N, .2, .1)
#'   Xtemp <- data.frame(x1 = 3*t , x2 = m - t)
#'   ytemp <- data.frame(matrix(j-1, N, 1))
#'   X <- rbind(X, Xtemp)
#'   y <- rbind(y, ytemp)
#' }
#' Xtest <- X[seq(1,(N*K),10),]
#' ytest <- y[seq(1,(N*K),10),,drop=FALSE]
#' X <- X[-seq(1,(N*K),10),]
#' y <- y[-seq(1,(N*K),10),,drop=FALSE]
#'
#' # Construct object for logistic regression
#' regularizer <- 'l2' # Alternatively, function(coeff) coeff%*%coeff/2
#' eps <- 1
#' gamma <- 1
#' lrdp <- LogisticRegressionDP$new(regularizer, eps, gamma)
#'
#' # Fit with data
#' # Bounds for X based on construction
#' upper.bounds <- c( 1, 1)
#' lower.bounds <- c(-1,-1)
#' lrdp$fit(X, y, upper.bounds, lower.bounds) # No bias term
#' lrdp$coeff # Gets private coefficients
#'
#' # Predict new data points
#' predicted.y <- lrdp$predict(Xtest)
#' n.errors <- sum(predicted.y!=ytest)
#'
#' @export
LogisticRegressionDP <- R6::R6Class("LogisticRegressionDP",
  inherit=EmpiricalRiskMinimizationDP.CMS,
  public=list(
  #' @description Create a new \code{LogisticRegressionDP} object.
  #' @param regularizer String or regularization function. If a string, must be
  #'   'l2', indicating to use l2 regularization. If a function, must have form
  #'   \code{regularizer(coeff)}, where \code{coeff} is a vector or matrix, and
  #'   return the value of the regularizer at \code{coeff}. See
  #'   \code{\link{regularizer.l2}} for an example. Additionally, in order to
  #'   ensure differential privacy, the function must be 1-strongly convex and
  #'   doubly differentiable.
  #' @param eps Positive real number defining the epsilon privacy budget. If set
  #'   to Inf, runs algorithm without differential privacy.
  #' @param gamma Nonnegative real number representing the regularization
  #'   constant.
  #' @param perturbation.method String indicating whether to use the 'output' or
  #'   the 'objective' perturbation methods \insertCite{chaudhuri2011}{DPpack}.
  #'   Defaults to 'objective'.
  #' @param regularizer.gr Optional function representing the gradient of the
  #'   regularization function with respect to \code{coeff} and of the form
  #'   \code{regularizer.gr(coeff)}. Should return a vector. See
  #'   \code{\link{regularizer.gr.l2}} for an example. If \code{regularizer} is
  #'   given as a string, this value is ignored. If not given and
#'   \code{regularizer} is a function, a finite-difference approximation to
#'   the gradient is used when fitting the model.
  #'
  #' @return A new \code{LogisticRegressionDP} object.
  initialize = function(regularizer, eps, gamma,
                        perturbation.method = 'objective',
                        regularizer.gr = NULL){
    super$initialize(mapXy.sigmoid, loss.cross.entropy, regularizer, eps,
                    gamma, perturbation.method, 1/4, mapXy.gr.sigmoid,
                    loss.gr.cross.entropy, regularizer.gr)
  },
  #' @description Fit the differentially private logistic regression model. This
  #'   method runs either the output perturbation or the objective perturbation
  #'   algorithm \insertCite{chaudhuri2011}{DPpack}, depending on the value of
  #'   perturbation.method used to construct the object, to generate an
  #'   objective function. A numerical optimization method is then run to find
  #'   optimal coefficients for fitting the model given the training data and
  #'   hyperparameters. The built-in \code{\link{optim}} function using the
  #'   "BFGS" optimization method is used. If \code{regularizer} is given as
  #'   'l2' or if \code{regularizer.gr} is given in the construction of the
  #'   object, the gradient of the objective function is utilized by
#'   \code{optim} as well. Otherwise, a finite-difference approximation to the
#'   gradient is used. The resulting privacy-preserving coefficients are stored
#'   in \code{coeff}.
  #' @param X Dataframe of data to be fit.
  #' @param y Vector or matrix of true labels for each row of \code{X}.
  #' @param upper.bounds Numeric vector of length \code{ncol(X)} giving upper
  #'   bounds on the values in each column of X. The \code{ncol(X)} values are
  #'   assumed to be in the same order as the corresponding columns of \code{X}.
  #'   Any value in the columns of \code{X} larger than the corresponding upper
  #'   bound is clipped at the bound.
  #' @param lower.bounds Numeric vector of length \code{ncol(X)} giving lower
  #'   bounds on the values in each column of \code{X}. The \code{ncol(X)}
  #'   values are assumed to be in the same order as the corresponding columns
#'   of \code{X}. Any value in the columns of \code{X} smaller than the
#'   corresponding lower bound is clipped at the bound.
  #' @param add.bias Boolean indicating whether to add a bias term to \code{X}.
  #'   Defaults to FALSE.
  fit = function(X, y, upper.bounds, lower.bounds, add.bias=FALSE){
    super$fit(X,y,upper.bounds,lower.bounds,add.bias)
  },
  #' @description Predict label(s) for given \code{X} using the fitted
  #'   coefficients.
  #' @param X Dataframe of data on which to make predictions. Must be of same
  #'   form as \code{X} used to fit coefficients.
  #' @param add.bias Boolean indicating whether to add a bias term to \code{X}.
  #'   Defaults to FALSE. If add.bias was set to TRUE when fitting the
  #'   coefficients, add.bias should be set to TRUE for predictions.
  #' @param raw.value Boolean indicating whether to return the raw predicted
  #'   value or the rounded class label. If FALSE (default), outputs the
  #'   predicted labels 0 or 1. If TRUE, returns the raw score from the logistic
  #'   regression.
  #'
  #' @return Matrix of predicted labels or scores corresponding to each row of
  #'   \code{X}.
  predict = function(X, add.bias=FALSE, raw.value=FALSE){
      if (raw.value) super$predict(X, add.bias)
      else round(super$predict(X, add.bias))
    }
), private=list(
  # description Preprocess input data and bounds to ensure they meet the
  #   assumptions necessary for fitting the model. If desired, a bias term can
  #   also be added.
  # param X Dataframe of data to be fit. Will be converted to a matrix.
  # param y Vector or matrix of true labels for each row of X. Will be
  #   converted to a matrix.
  # param upper.bounds Numeric vector of length ncol(X) giving upper bounds on
  #   the values in each column of X. The ncol(X) values are assumed to be in
  #   the same order as the corresponding columns of X. Any value in the
  #   columns of X larger than the corresponding upper bound is clipped at the
  #   bound.
  # param lower.bounds Numeric vector of length ncol(X) giving lower bounds on
  #   the values in each column of X. The ncol(X) values are assumed to be in
  #   the same order as the corresponding columns of X. Any value in the
  #   columns of X smaller than the corresponding lower bound is clipped at
  #   the bound.
  # param add.bias Boolean indicating whether to add a bias term to X.
  # return A list of preprocessed values for X, y, upper.bounds, and
  #   lower.bounds for use in the privacy-preserving logistic regression
  #   algorithm.
  preprocess_data = function(X, y, upper.bounds, lower.bounds, add.bias){
    res <- super$preprocess_data(X, y, upper.bounds, lower.bounds, add.bias)
    X <- res$X
    y <- res$y
    upper.bounds <- res$upper.bounds
    lower.bounds <- res$lower.bounds

    # Process X
    p <- length(lower.bounds)
    tmp <- matrix(c(lower.bounds,upper.bounds),ncol=2)
    scale1 <- apply(abs(tmp),1,max)
    scale2 <- sqrt(p)
    X.norm <- t(t(X)/scale1)/scale2

    # Process y.
    # Labels must be 0 and 1
    if (any((y!=1) & (y!=0)))
      stop("y must be a numeric vector of binary labels 0 and 1.")

    list(X=X.norm, y=y, scale1=scale1, scale2=scale2)
  },
  # description Postprocess coefficients obtained by fitting the model using
  #   differential privacy to ensure they match original inputs X and y.
  #   Effectively undoes the processing done in preprocess_data.
  # param coeff Vector of coefficients obtained by fitting the model.
  # param preprocess List of values returned by preprocess_data and used to
  #   process coeff.
  # return The processed coefficient vector.
  postprocess_coeff = function(coeff, preprocess){
    coeff/(preprocess$scale1*preprocess$scale2)
  },
  # description Run numerical optimization method to find optimal coefficients
  #   for fitting model given training data and hyperparameters. This function
  #   builds the objective function based on the training data, and runs the
  #   built-in optim function using the "BFGS" optimization method. If mapXy.gr,
  #   loss.gr, and regularizer.gr are all given in the construction of the
  #   object, the gradient of the objective function is utilized by optim as
  #   well. Otherwise, optim falls back on a finite-difference approximation
  #   to the gradient.
  # param X Matrix of data to be fit.
  # param y Vector or matrix of true labels for each row of X.
  # param Delta Nonnegative real number set by the differentially private
  #   algorithm and required for constructing the objective function.
  # param b Numeric perturbation vector randomly drawn by the differentially
  #   private algorithm and required for constructing the objective function.
  # return Vector of fitted coefficients.
  optimize_coeff = function(X, y, Delta, b){
    # Repeated implementation to avoid numerical gradient issues
    n <- length(y)
    d <- ncol(X)

    # Get objective function
    objective <- function(par, X, y, gamma, Delta, b){
      # NOTE: Gamma from Chaudhuri's paper is gamma/n in this implementation
      as.numeric(sum(self$loss(self$mapXy(X,par),y))/n +
                   gamma*self$regularizer(par)/n + t(b)%*%par/n +
                   Delta*par%*%par/2)
    }

    # Get gradient function
    if (!is.null(self$mapXy.gr) && !is.null(self$loss.gr) &&
        !is.null(self$regularizer.gr)) {
      # NOTE: Gamma from Chaudhuri's paper is gamma/n in this implementation
      objective.gr <- function(par, X, y, gamma, Delta, b){
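        # The gradient of cross entropy composed with the sigmoid map
        # simplifies analytically to t(X)%*%(y.hat - y); this closed form
        # avoids the numerical issues noted above.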
        as.numeric(t(X)%*%(self$mapXy(X, par)-y)/n +
                     gamma*self$regularizer.gr(par)/n + b/n + Delta*par)
      }
    }
    else objective.gr <- NULL

    # Run optimization
    coeff0 <- numeric(ncol(X))
    opt.res <- optim(coeff0, fn=objective, gr=objective.gr, method="BFGS",
                     X=X, y=y, gamma=self$gamma, Delta=Delta, b=b)
    opt.res$par
  }
))
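
# A quick standalone check (illustrative data and bounds assumed) of the
# scaling performed in preprocess_data above: dividing each column by its
# largest absolute bound and the whole matrix by sqrt(p) ensures every row of
# the scaled data has l2-norm at most 1, the condition required for the
# privacy guarantee. postprocess_coeff divides by the same factors so the
# stored coefficients correspond to the original data.
if (FALSE) {
  X <- matrix(stats::runif(20, -3, 3), ncol=2)
  lower.bounds <- c(-3, -3); upper.bounds <- c(3, 3)
  p <- length(lower.bounds)
  scale1 <- apply(abs(matrix(c(lower.bounds, upper.bounds), ncol=2)), 1, max)
  scale2 <- sqrt(p)
  X.norm <- t(t(X)/scale1)/scale2
  max(sqrt(rowSums(X.norm^2)))  # always <= 1
}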

#' Generator for Sampling Distribution Function for Gaussian Kernel
#'
#' This function generates and returns a sampling function corresponding to the
#' Fourier transform of a Gaussian kernel with given parameter
#' \insertCite{chaudhuri2011}{DPpack} of form needed for \code{\link{svmDP}}.
#'
#' @param kernel.param Positive real number for the Gaussian (radial) kernel
#'   parameter.
#' @return Sampling function for the Gaussian kernel of form required by
#'   \code{\link{svmDP}}.
#' @examples
#'   kernel.param <- 0.1
#'   sample <- generate.sampling(kernel.param)
#'   d <- 5
#'   sample(d)
#'
#' @keywords internal
#'
#' @references \insertRef{chaudhuri2011}{DPpack}
#'
#' @export
generate.sampling <- function(kernel.param){
  function(d){
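    # omega is drawn from the Fourier transform of the Gaussian kernel
    # exp(-kernel.param*||x-x'||^2), i.e., independent N(0, 2*kernel.param)
    # coordinates; phi is a uniform random phase on [-pi, pi].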
    omega <- stats::rnorm(d,sd=sqrt(2*kernel.param))
    phi <- stats::runif(1,-pi,pi)
    c(omega,phi)
  }
}

#' Transform Function for Gaussian Kernel Approximation
#'
#' This function maps an input data row x with a given prefilter to an output
#' value in such a way as to approximate the Gaussian kernel
#' \insertCite{chaudhuri2011}{DPpack}.
#'
#' @param x Vector or matrix corresponding to one row of the dataset X.
#' @param theta Randomly sampled prefilter vector of length n+1, where n is the
#'   length of x.
#' @return Mapped value corresponding to one element of the transformed space.
#' @examples
#'   x <- c(1,2,3)
#'   theta <- c(0.1, 1.1, -0.8, 3)
#'   phi.gaussian(x, theta)
#'
#' @keywords internal
#'
#' @references \insertRef{chaudhuri2011}{DPpack}
#'
#' @export
phi.gaussian <- function(x, theta){
  d <- length(x)
  cos(x%*%theta[1:d] + theta[d+1])
}

#' Privacy-preserving Support Vector Machine
#'
#' @description This class implements differentially private support vector
#'   machine (SVM) \insertCite{chaudhuri2011}{DPpack}. It can be either weighted
#'   \insertCite{Yang2005}{DPpack} or unweighted. Either the output or the
#'   objective perturbation method can be used for unweighted SVM, though only
#'   the output perturbation method is currently supported for weighted SVM.
#'
#' @details To use this class for SVM, first use the \code{new} method to
#'   construct an object of this class with the desired function values and
#'   hyperparameters, including a choice of the desired kernel. After
#'   constructing the object, the \code{fit} method can be applied to fit the
#'   model with a provided dataset, data bounds, and optional observation
#'   weights and weight upper bound. In fitting, the model stores a vector of
#'   coefficients \code{coeff} which satisfy differential privacy. Additionally,
#'   if a nonlinear kernel is chosen, the model stores a mapping function from
#'   the input data X to a higher dimensional embedding V in the form of a
#'   method \code{XtoV} as required \insertCite{chaudhuri2011}{DPpack}. These
#'   can be released directly, or used in conjunction with the \code{predict}
#'   method to privately predict the label of new datapoints. Note that the
#'   mapping function \code{XtoV} is based on an approximation method via
#'   Fourier transforms \insertCite{Rahimi2007,Rahimi2008}{DPpack}.
#'
#'   Note that in order to guarantee differential privacy for the SVM model,
#'   certain constraints must be satisfied for the values used to construct the
#'   object, as well as for the data used to fit. These conditions depend on the
#'   chosen perturbation method. First, the loss function is assumed to be
#'   differentiable (and doubly differentiable if the objective perturbation
#'   method is used). The hinge loss, which is typically used for SVM, is not
#'   differentiable at 1. Thus, to satisfy this constraint, this class utilizes
#'   the Huber loss, a smooth approximation to the hinge loss
#'   \insertCite{Chapelle2007}{DPpack}. The level of approximation to the hinge
#'   loss is determined by a user-specified constant, h, which defaults to 0.5,
#'   a typical value. Additionally, the regularizer must be 1-strongly convex
#'   and differentiable. It also must be doubly differentiable if objective
#'   perturbation is chosen. If weighted SVM is desired, the provided weights
#'   must be nonnegative and bounded above by a global or public value, which
#'   must also be provided.
#'
#'   Finally, it is assumed that if x represents a single row of the dataset X,
#'   then the l2-norm of x is at most 1 for all x. In order to ensure this
#'   constraint is satisfied, the dataset is preprocessed and scaled, and the
#'   resulting coefficients are postprocessed and un-scaled so that the stored
#'   coefficients correspond to the original data. Due to this constraint on x,
#'   it is best to avoid using a bias term in the model whenever possible. If a
#'   bias term must be used, the issue can be partially circumvented by adding a
#'   constant column to X before fitting the model, which will be scaled along
#'   with the rest of X. The \code{fit} method contains functionality to add a
#'   column of constant 1s to X before scaling, if desired.
#'
#' @references \insertRef{chaudhuri2011}{DPpack}
#'
#'   \insertRef{Yang2005}{DPpack}
#'
#'   \insertRef{Chapelle2007}{DPpack}
#'
#'   \insertRef{Rahimi2007}{DPpack}
#'
#'   \insertRef{Rahimi2008}{DPpack}
#'
#' @examples
#' # Build train dataset X and y, and test dataset Xtest and ytest
#' N <- 400
#' X <- data.frame()
#' y <- data.frame()
#' for (i in (1:N)){
#'   Xtemp <- data.frame(x1 = stats::rnorm(1,sd=.28) , x2 = stats::rnorm(1,sd=.28))
#'   if (sum(Xtemp^2)<.15) ytemp <- data.frame(y=0)
#'   else ytemp <- data.frame(y=1)
#'   X <- rbind(X, Xtemp)
#'   y <- rbind(y, ytemp)
#' }
#' Xtest <- X[seq(1,N,10),]
#' ytest <- y[seq(1,N,10),,drop=FALSE]
#' X <- X[-seq(1,N,10),]
#' y <- y[-seq(1,N,10),,drop=FALSE]
#'
#' # Construct object for SVM
#' regularizer <- 'l2' # Alternatively, function(coeff) coeff%*%coeff/2
#' eps <- 1
#' gamma <- 1
#' perturbation.method <- 'output'
#' kernel <- 'Gaussian'
#' D <- 20
#' svmdp <- svmDP$new(regularizer, eps, gamma, perturbation.method,
#'                    kernel=kernel, D=D)
#'
#' # Fit with data
#' # Bounds for X based on construction
#' upper.bounds <- c( 1, 1)
#' lower.bounds <- c(-1,-1)
#' weights <- rep(1, nrow(y)) # Uniform weighting
#' weights[nrow(y)] <- 0.5 # Half weight for last observation
#' wub <- 1 # Public upper bound for weights
#' svmdp$fit(X, y, upper.bounds, lower.bounds, weights=weights,
#'           weights.upper.bound=wub) # No bias term
#'
#' # Predict new data points
#' predicted.y <- svmdp$predict(Xtest)
#' n.errors <- sum(predicted.y!=ytest)
#'
#' @export
svmDP <- R6::R6Class("svmDP",
  inherit=WeightedERMDP.CMS,
  public=list(
  #' @description Create a new \code{svmDP} object.
  #' @param regularizer String or regularization function. If a string, must be
  #'   'l2', indicating to use l2 regularization. If a function, must have form
  #'   \code{regularizer(coeff)}, where \code{coeff} is a vector or matrix, and
  #'   return the value of the regularizer at \code{coeff}. See
  #'   \code{\link{regularizer.l2}} for an example. Additionally, in order to
  #'   ensure differential privacy, the function must be 1-strongly convex and
  #'   doubly differentiable.
  #' @param eps Positive real number defining the epsilon privacy budget. If set
  #'   to Inf, runs algorithm without differential privacy.
  #' @param gamma Nonnegative real number representing the regularization
  #'   constant.
  #' @param perturbation.method String indicating whether to use the 'output' or
  #'   the 'objective' perturbation methods \insertCite{chaudhuri2011}{DPpack}.
  #'   Defaults to 'objective'.
  #' @param kernel String indicating which kernel to use for SVM. Must be one of
  #'   {'linear', 'Gaussian'}. If 'linear' (default), linear SVM is used. If
#'   'Gaussian', uses the sampling function corresponding to the Gaussian
  #'   (radial) kernel approximation.
  #' @param D Nonnegative integer indicating the dimensionality of the transform
  #'   space approximating the kernel if a nonlinear kernel is used. Higher
  #'   values of D provide better kernel approximations at a cost of
  #'   computational efficiency. This value must be specified if a nonlinear
  #'   kernel is used.
  #' @param kernel.param Positive real number corresponding to the Gaussian
  #'   kernel parameter. Defaults to 1/p, where p is the number of predictors.
  #' @param regularizer.gr Optional function representing the gradient of the
  #'   regularization function with respect to \code{coeff} and of the form
  #'   \code{regularizer.gr(coeff)}. Should return a vector. See
  #'   \code{\link{regularizer.gr.l2}} for an example. If \code{regularizer} is
  #'   given as a string, this value is ignored. If not given and
#'   \code{regularizer} is a function, a finite-difference approximation to
#'   the gradient is used when fitting the model.
  #' @param huber.h Positive real number indicating the degree to which the
  #'   Huber loss approximates the hinge loss. Defaults to 0.5
  #'   \insertCite{Chapelle2007}{DPpack}.
  #'
  #' @return A new svmDP object.
  initialize = function(regularizer, eps, gamma,
                        perturbation.method = 'objective', kernel='linear',
                        D=NULL, kernel.param=NULL, regularizer.gr=NULL,
                        huber.h=0.5){
    super$initialize(mapXy.linear, generate.loss.huber(huber.h), regularizer, eps,
                     gamma, perturbation.method, 1/(2*huber.h), mapXy.gr.linear,
                     generate.loss.gr.huber(huber.h), regularizer.gr)
    self$kernel <- kernel
    if (kernel=="Gaussian"){
      self$phi <- phi.gaussian
      self$kernel.param <- kernel.param
      if (is.null(D)) stop("D must be specified for nonlinear kernel.")
      self$D <- D
    } else if (kernel!="linear")
      stop("kernel must be one of {'linear', 'Gaussian'}.")
  },
  #' @description Fit the differentially private SVM model. This method runs
  #'   either the output perturbation or the objective perturbation algorithm
  #'   \insertCite{chaudhuri2011}{DPpack}, depending on the value of
  #'   perturbation.method used to construct the object, to generate an
  #'   objective function. A numerical optimization method is then run to find
  #'   optimal coefficients for fitting the model given the training data,
  #'   weights, and hyperparameters. The built-in \code{\link{optim}} function
  #'   using the "BFGS" optimization method is used. If \code{regularizer} is
  #'   given as 'l2' or if \code{regularizer.gr} is given in the construction of
  #'   the object, the gradient of the objective function is utilized by
#'   \code{optim} as well. Otherwise, a finite-difference approximation to the
#'   gradient is used. The resulting privacy-preserving coefficients are stored
#'   in \code{coeff}.
  #' @param X Dataframe of data to be fit.
  #' @param y Vector or matrix of true labels for each row of \code{X}.
  #' @param upper.bounds Numeric vector of length \code{ncol(X)} giving upper
  #'   bounds on the values in each column of X. The \code{ncol(X)} values are
  #'   assumed to be in the same order as the corresponding columns of \code{X}.
  #'   Any value in the columns of \code{X} larger than the corresponding upper
  #'   bound is clipped at the bound.
  #' @param lower.bounds Numeric vector of length \code{ncol(X)} giving lower
  #'   bounds on the values in each column of \code{X}. The \code{ncol(X)}
  #'   values are assumed to be in the same order as the corresponding columns
#'   of \code{X}. Any value in the columns of \code{X} smaller than the
#'   corresponding lower bound is clipped at the bound.
  #' @param add.bias Boolean indicating whether to add a bias term to \code{X}.
  #'   Defaults to FALSE.
  #' @param weights Numeric vector of observation weights of the same length as
  #'   \code{y}. If not given, no observation weighting is performed.
  #' @param weights.upper.bound Numeric value representing the global or public
  #'   upper bound on the weights. Required if weights are given.
  fit = function(X, y, upper.bounds, lower.bounds, add.bias=FALSE, weights=NULL,
                 weights.upper.bound=NULL){
    if (!is.null(weights) && self$perturbation.method!="output"){
      stop("Weighted SVM is only implemented for output perturbation.")
    }
    if (self$kernel=='Gaussian'){
      if (is.null(self$kernel.param)) self$kernel.param <- 1/ncol(X)
      self$sampling <- generate.sampling(self$kernel.param)
    }
    super$fit(X, y, upper.bounds, lower.bounds, add.bias, weights,
              weights.upper.bound)
  },
  #' @description Convert input data X into transformed data V. Uses sampled
  #'   pre-filter values and a mapping function based on the chosen kernel to
  #'   produce D-dimensional data V on which to train the model or predict
  #'   future values. This method is only used if the kernel is nonlinear. See
  #'   \insertCite{chaudhuri2011;textual}{DPpack} for more details.
  #' @param X Matrix corresponding to the original dataset.
  #' @return Matrix V of size n by D representing the transformed dataset, where
  #'   n is the number of rows of X, and D is the provided transformed space
  #'   dimension.
  XtoV = function(X){
    # Get V (higher dimensional X)
    V <- matrix(NaN, nrow=nrow(X), ncol=self$D)
    for (i in 1:nrow(X)){
      for (j in 1:self$D){
        V[i,j] <- sqrt(1/self$D)*self$phi(X[i,],self$prefilter[j,])
      }
    }
    V
  },
  #' @description Predict label(s) for given \code{X} using the fitted
  #'   coefficients.
  #' @param X Dataframe of data on which to make predictions. Must be of same
  #'   form as \code{X} used to fit coefficients.
  #' @param add.bias Boolean indicating whether to add a bias term to \code{X}.
  #'   Defaults to FALSE. If add.bias was set to TRUE when fitting the
  #'   coefficients, add.bias should be set to TRUE for predictions.
  #' @param raw.value Boolean indicating whether to return the raw predicted
  #'   value or the rounded class label. If FALSE (default), outputs the
  #'   predicted labels 0 or 1. If TRUE, returns the raw score from the SVM
  #'   model.
  #'
  #' @return Matrix of predicted labels or scores corresponding to each row of
  #'   \code{X}.
  predict = function(X, add.bias=FALSE, raw.value=FALSE){
    if (self$kernel!="linear"){
      if (add.bias){
        X <- dplyr::mutate(X, bias=1)
        X <- X[, c(ncol(X), 1:(ncol(X)-1))]
      }
      V <- self$XtoV(as.matrix(X))
      if (raw.value) super$predict(V, FALSE)
      else {
        vals <- sign(super$predict(V, FALSE))
        vals[vals==-1] <- 0
        vals
      }
    } else{
      if (raw.value) super$predict(X, add.bias)
      else {
        vals <- sign(super$predict(X, add.bias))
        vals[vals==-1] <- 0
        vals
      }
    }
  }
), private=list(
  # description Preprocess input data and bounds to ensure they meet the
  #   assumptions necessary for fitting the model. If desired, a bias term can
  #   also be added.
  # param X Dataframe of data to be fit. Will be converted to a matrix.
  # param y Vector or matrix of true labels for each row of X. Will be
  #   converted to a matrix.
  # param upper.bounds Numeric vector of length ncol(X) giving upper bounds on
  #   the values in each column of X. The ncol(X) values are assumed to be in
  #   the same order as the corresponding columns of X. Any value in the
  #   columns of X larger than the corresponding upper bound is clipped at the
  #   bound.
  # param lower.bounds Numeric vector of length ncol(X) giving lower bounds on
  #   the values in each column of X. The ncol(X) values are assumed to be in
  #   the same order as the corresponding columns of X. Any value in the
  #   columns of X smaller than the corresponding lower bound is clipped at
  #   the bound.
  # param add.bias Boolean indicating whether to add a bias term to X.
  # param weights Numeric vector of observation weights of the same length as
  #   \code{y}.
  # param weights.upper.bound Numeric value representing the global or public
  #   upper bound on the weights.
  # return A list of preprocessed values for X, y, upper.bounds, and
  #   lower.bounds for use in the privacy-preserving SVM algorithm.
  preprocess_data = function(X, y, upper.bounds, lower.bounds, add.bias,
                             weights=NULL, weights.upper.bound=NULL){
    if (self$kernel!="linear"){
      # Add bias if needed (highly recommended to not do this due to unwanted
      #       regularization of bias term)
      if (add.bias){
        X <- dplyr::mutate(X, bias=1)
        X <- X[, c(ncol(X), 1:(ncol(X)-1))]
        upper.bounds <- c(1,upper.bounds)
        lower.bounds <- c(1,lower.bounds)
      }

      # Make matrices for multiplication purposes
      X <- as.matrix(X,nrow=length(y))
      y <- as.matrix(y,ncol=1)

      # Get prefilter
      d <- ncol(X)
      prefilter <- matrix(NaN, nrow=self$D, ncol=(d+1))
      for (i in 1:self$D){
        prefilter[i,] <- self$sampling(d)
      }
      self$prefilter <- prefilter

      V <- self$XtoV(X)

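      # Each entry of V is sqrt(1/D)*cos(.), so it lies in
      # [-sqrt(1/D), sqrt(1/D)]; use these as the new data bounds.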
      lbs <- c(numeric(ncol(V)) - sqrt(1/self$D))
      ubs <- c(numeric(ncol(V)) + sqrt(1/self$D))
      res <- super$preprocess_data(V, y, ubs, lbs, add.bias=FALSE,
                                   weights=weights,
                                   weights.upper.bound=weights.upper.bound)
    } else res <- super$preprocess_data(X, y, upper.bounds, lower.bounds,
                                        add.bias, weights, weights.upper.bound)

    X <- res$X
    y <- res$y
    upper.bounds <- res$upper.bounds
    lower.bounds <- res$lower.bounds

    # Process X
    p <- length(lower.bounds)
    tmp <- matrix(c(lower.bounds,upper.bounds),ncol=2)
    scale1 <- apply(abs(tmp),1,max)
    scale2 <- sqrt(p)
    X.norm <- t(t(X)/scale1)/scale2

    # Process y.
    # Labels must be 0 and 1
    if (any((y!=1) & (y!=0)))
      stop("y must be a numeric vector of binary labels 0 and 1.")

    # Convert to 1 and -1 for SVM algorithm
    y[y==0] <- -1

    list(X=X.norm, y=y, scale1=scale1, scale2=scale2, weights=res$weights)
  },
  # description Postprocess coefficients obtained by fitting the model using
  #   differential privacy to ensure they match original inputs X and y.
  #   Effectively undoes the processing done in preprocess_data.
  # param coeff Vector of coefficients obtained by fitting the model.
  # param preprocess List of values returned by preprocess_data and used to
  #   process coeff.
  # return The processed coefficient vector.
  postprocess_coeff = function(coeff, preprocess){
    coeff/(preprocess$scale1*preprocess$scale2)
  }
))
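
# Illustrative sanity check of the random Fourier feature map behind XtoV
# above (arbitrary points and kernel.param assumed): with the sqrt(1/D)
# scaling used by XtoV, the inner product of two transformed rows
# approximates half the Gaussian kernel value
# exp(-kernel.param*||x1-x2||^2) as D grows (Rahimi and Recht).
if (FALSE) {
  set.seed(1)
  kernel.param <- 0.5
  D <- 10000
  sample <- generate.sampling(kernel.param)
  x1 <- c(.2, -.1); x2 <- c(-.3, .4)
  prefilter <- t(sapply(1:D, function(i) sample(length(x1))))
  v1 <- sapply(1:D, function(j) sqrt(1/D)*phi.gaussian(x1, prefilter[j,]))
  v2 <- sapply(1:D, function(j) sqrt(1/D)*phi.gaussian(x2, prefilter[j,]))
  sum(v1*v2)                           # approximately k(x1,x2)/2
  exp(-kernel.param*sum((x1-x2)^2))/2  # target value
}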

#' Privacy-preserving Empirical Risk Minimization for Regression
#'
#' @description This class implements differentially private empirical risk
#'   minimization using the objective perturbation technique
#'   \insertCite{Kifer2012}{DPpack}. It is intended to be a framework for
#'   building more specific models via inheritance. See
#'   \code{\link{LinearRegressionDP}} for an example of this type of structure.
#'
#' @details To use this class for empirical risk minimization, first use the
#'   \code{new} method to construct an object of this class with the desired
#'   function values and hyperparameters. After constructing the object, the
#'   \code{fit} method can be applied with a provided dataset and data bounds
#'   to fit the model. In fitting, the model stores a vector of coefficients
#'   \code{coeff} which satisfy differential privacy. These can be released
#'   directly, or used in conjunction with the \code{predict} method to
#'   privately predict the outcomes of new datapoints.
#'
#'   Note that in order to guarantee differential privacy for the empirical
#'   risk minimization model, certain constraints must be satisfied for the
#'   values used to construct the object, as well as for the data used to fit.
#'   Specifically, the following constraints must be met. Let \eqn{l} represent
#'   the loss function for an individual dataset row x and output value y, and
#'   let \eqn{L} represent the average loss over all rows and output values.
#'   First, \eqn{L} must be convex with a continuous Hessian. Second, the
#'   l2-norm of the gradient of \eqn{l} must be bounded above by some constant
#'   zeta for all possible input values in the domain. Third, for all possible
#'   inputs to \eqn{l}, the Hessian of \eqn{l} must be of rank at most one and
#'   its eigenvalues must be bounded above by some constant lambda. Fourth, the
#'   regularizer must be convex. Finally, the provided domain of \eqn{l} must
#'   be a closed convex subset of the set of all real-valued vectors of
#'   dimension p, where p is the number of columns of X. Note that because of
#'   this, a bias term cannot be included without appropriate
#'   scaling/preprocessing of the dataset. To ensure privacy, the add.bias
#'   argument in the \code{fit} and \code{predict} methods should only be
#'   utilized in subclasses within this package where appropriate preprocessing
#'   is implemented, not in this class.
#'
#' @keywords internal
#'
#' @references \insertRef{Kifer2012}{DPpack}
#'
#' @examples
#' # Build example dataset
#' n <- 500
#' X <- data.frame(X=seq(-1,1,length.out = n))
#' true.theta <- c(-.3,.5) # First element is bias term
#' p <- length(true.theta)
#' y <- true.theta[1] + as.matrix(X)%*%true.theta[2:p] + stats::rnorm(n=n,sd=.1)
#'
#' # Construct object for linear regression
#' mapXy <- function(X, coeff) X%*%coeff
#' loss <- function(y.hat, y) (y.hat-y)^2/2
#' regularizer <- 'l2' # Alternatively, function(coeff) coeff%*%coeff/2
#' eps <- 1
#' delta <- 1
#' domain <- list("constraints"=function(coeff) coeff%*%coeff-length(coeff),
#'   "jacobian"=function(coeff) 2*coeff)
#' # Set p to be the number of predictors desired including intercept term (length of coeff)
#' zeta <- 2*p^(3/2) # Proper bound for linear regression
#' lambda <- p # Proper bound for linear regression
#' gamma <- 1
#' mapXy.gr <- function(X, coeff) t(X)
#' loss.gr <- function(y.hat, y) y.hat-y
#' regularizer.gr <- function(coeff) coeff
#'
#' ermdp <- EmpiricalRiskMinimizationDP.KST$new(mapXy, loss, 'l2', eps, delta,
#'                                              domain, zeta, lambda,
#'                                              gamma, mapXy.gr, loss.gr,
#'                                              regularizer.gr)
#'
#' # Fit with data
#' # We must assume y is a matrix with values between -p and p (-2 and 2
#' #   for this example)
#' upper.bounds <- c(1, 2) # Bounds for X and y
#' lower.bounds <- c(-1,-2) # Bounds for X and y
#' ermdp$fit(X, y, upper.bounds, lower.bounds, add.bias=TRUE)
#' ermdp$coeff # Gets private coefficients
#'
#' # Predict new data points
#' # Build a test dataset
#' Xtest <- data.frame(X=c(-.5, -.25, .1, .4))
#' predicted.y <- ermdp$predict(Xtest, add.bias=TRUE)
#'
#' @export
EmpiricalRiskMinimizationDP.KST <- R6::R6Class("EmpiricalRiskMinimizationDP.KST",
  public=list(
  #' @field mapXy Map function of the form \code{mapXy(X, coeff)} mapping input
  #'   data matrix \code{X} and coefficient vector or matrix \code{coeff} to
  #'   output labels \code{y}.
  mapXy = NULL,
  #' @field mapXy.gr Function representing the gradient of the map function with
  #'   respect to the values in \code{coeff} and of the form \code{mapXy.gr(X,
  #'   coeff)}, where \code{X} is a matrix and \code{coeff} is a matrix or
  #'   numeric vector.
  mapXy.gr = NULL,
  #' @field loss Loss function of the form \code{loss(y.hat, y)}, where
  #'   \code{y.hat} and \code{y} are matrices.
  loss = NULL,
  #' @field loss.gr Function representing the gradient of the loss function with
  #'   respect to \code{y.hat} and of the form \code{loss.gr(y.hat, y)}, where
  #'   \code{y.hat} and \code{y} are matrices.
  loss.gr = NULL,
  #' @field regularizer Regularization function of the form
  #'   \code{regularizer(coeff)}, where \code{coeff} is a vector or matrix.
  regularizer = NULL,
  #' @field regularizer.gr Function representing the gradient of the
  #'   regularization function with respect to \code{coeff} and of the form
  #'   \code{regularizer.gr(coeff)}.
  regularizer.gr = NULL,
  #' @field gamma Nonnegative real number representing the regularization
  #'   constant.
  gamma = NULL,
  #' @field eps Positive real number defining the epsilon privacy budget. If set
  #'   to Inf, runs algorithm without differential privacy.
  eps = NULL,
  #' @field delta Nonnegative real number defining the delta privacy parameter.
  #'   If 0, reduces to pure eps-DP.
  delta = NULL,
  #' @field domain List of constraint and jacobian functions representing the
  #'   constraints on the search space for the objective perturbation algorithm
  #'   used in \insertCite{Kifer2012;textual}{DPpack}.
  domain = NULL,
  #' @field zeta Positive real number denoting the upper bound on the l2-norm
  #'   value of the gradient of the loss function, as required to ensure
  #'   differential privacy.
  zeta = NULL,
  #' @field lambda Positive real number corresponding to the upper bound of the
#'   eigenvalues of the Hessian of the loss function for all possible inputs.
  lambda = NULL,
  #' @field coeff Numeric vector of coefficients for the model.
  coeff = NULL,
  #' @description Create a new \code{EmpiricalRiskMinimizationDP.KST} object.
  #' @param mapXy Map function of the form \code{mapXy(X, coeff)} mapping input
  #'   data matrix \code{X} and coefficient vector or matrix \code{coeff} to
  #'   output labels \code{y}. Should return a column matrix of predicted labels
  #'   for each row of \code{X}. See \code{\link{mapXy.linear}} for an example.
  #' @param loss Loss function of the form \code{loss(y.hat, y)}, where
  #'   \code{y.hat} and \code{y} are matrices. Should be defined such that it
  #'   returns a matrix of loss values for each element of \code{y.hat} and
  #'   \code{y}. See \code{\link{loss.squared.error}} for an example. This
  #'   function must be convex and the l2-norm of its gradient must be bounded
  #'   above by zeta for some constant zeta for all possible inputs within the
  #'   given domain. Additionally, for all possible inputs within the given
  #'   domain, the Hessian of the loss function must be of rank at most one and
#'   its eigenvalues must be bounded above by some constant lambda.
  #' @param regularizer String or regularization function. If a string, must be
  #'   'l2', indicating to use l2 regularization. If a function, must have form
  #'   \code{regularizer(coeff)}, where \code{coeff} is a vector or matrix, and
  #'   return the value of the regularizer at \code{coeff}. See
  #'   \code{\link{regularizer.l2}} for an example. Additionally, in order to
  #'   ensure differential privacy, the function must be convex.
  #' @param eps Positive real number defining the epsilon privacy budget. If set
  #'   to Inf, runs algorithm without differential privacy.
  #' @param delta Nonnegative real number defining the delta privacy parameter.
  #'   If 0, reduces to pure eps-DP.
  #' @param domain List of functions representing the constraints on the search
  #'   space for the objective perturbation algorithm. Must contain two
  #'   functions, labeled "constraints" and "jacobian", respectively. The
  #'   "constraints" function accepts a vector of coefficients from the search
  #'   space and returns a value such that the value is nonpositive if and only
  #'   if the input coefficient vector is within the constrained search space.
  #'   The "jacobian" function also accepts a vector of coefficients and returns
  #'   the Jacobian of the constraint function. For example, in linear
  #'   regression, the square of the l2-norm of the coefficient vector
  #'   \eqn{\theta} is assumed to be bounded above by p, where p is the length
  #'   of \eqn{\theta} \insertCite{Kifer2012}{DPpack}. So, domain could be
  #'   defined as `domain <- list("constraints"=function(coeff)
  #'   coeff%*%coeff-length(coeff), "jacobian"=function(coeff) 2*coeff)`.
  #' @param zeta Positive real number denoting the upper bound on the l2-norm
  #'   value of the gradient of the loss function, as required to ensure
  #'   differential privacy.
  #' @param lambda Positive real number corresponding to the upper bound of the
#'   eigenvalues of the Hessian of the loss function for all possible inputs.
  #' @param gamma Nonnegative real number representing the regularization
  #'   constant.
  #' @param mapXy.gr Optional function representing the gradient of the map
  #'   function with respect to the values in \code{coeff}. If given, must be of
  #'   the form \code{mapXy.gr(X, coeff)}, where \code{X} is a matrix and
  #'   \code{coeff} is a matrix or numeric vector. Should be defined such that
  #'   the ith row of the output represents the gradient with respect to the ith
  #'   coefficient. See \code{\link{mapXy.gr.linear}} for an example. If not
  #'   given, non-gradient based optimization methods are used to compute the
  #'   coefficient values in fitting the model.
  #' @param loss.gr Optional function representing the gradient of the loss
  #'   function with respect to \code{y.hat} and of the form
  #'   \code{loss.gr(y.hat, y)}, where \code{y.hat} and \code{y} are matrices.
  #'   Should be defined such that the ith row of the output represents the
  #'   gradient of the loss function at the ith set of input values. See
  #'   \code{\link{loss.gr.squared.error}} for an example. If not given,
  #'   non-gradient based optimization methods are used to compute the
  #'   coefficient values in fitting the model.
  #' @param regularizer.gr Optional function representing the gradient of the
  #'   regularization function with respect to \code{coeff} and of the form
  #'   \code{regularizer.gr(coeff)}. Should return a vector. See
  #'   \code{\link{regularizer.gr.l2}} for an example. If \code{regularizer} is
  #'   given as a string, this value is ignored. If not given and
  #'   \code{regularizer} is a function, non-gradient based optimization methods
  #'   are used to compute the coefficient values in fitting the model.
  #'
  #' @return A new EmpiricalRiskMinimizationDP.KST object.
  initialize = function(mapXy, loss, regularizer, eps, delta, domain, zeta, lambda,
                        gamma, mapXy.gr=NULL, loss.gr=NULL, regularizer.gr=NULL){
    self$mapXy <- mapXy # Must be of form mapXy(X, coeff)
    self$mapXy.gr <- mapXy.gr
    self$loss <- loss # Must be of form loss(y.hat,y)
    self$loss.gr <- loss.gr
    if (is.character(regularizer)){
      if (regularizer == 'l2') {
        self$regularizer <- regularizer.l2
        self$regularizer.gr <- regularizer.gr.l2
      }
      else stop("regularizer must be one of {'l2'}, or a function.")
    }
    else {
      self$regularizer <- regularizer # Must be of form regularizer(coeff)
      self$regularizer.gr <- regularizer.gr
    }
    self$eps <- eps
    self$delta <- delta
    self$domain <- domain # of form list("constraints"=g(x) (<=0), "jacobian"=g'(x))
    self$zeta <- zeta
    self$lambda <- lambda
    self$gamma <- gamma
  },
  #' @description Fit the differentially private empirical risk minimization
  #'   model. The function runs the objective perturbation algorithm
  #'   \insertCite{Kifer2012}{DPpack} to generate an objective function. A
  #'   numerical optimization method is then run to find optimal coefficients
  #'   for fitting the model given the training data and hyperparameters. The
  #'   \code{\link{nloptr}} function is used. If mapXy.gr, loss.gr, and
  #'   regularizer.gr are all given in the construction of the object, the
  #'   gradient of the objective function and the Jacobian of the constraint
  #'   function are utilized for the algorithm, and the NLOPT_LD_MMA method is
  #'   used. If one or more of these gradient functions are not given, the
  #'   NLOPT_LN_COBYLA method is used. The resulting privacy-preserving
  #'   coefficients are stored in coeff.
  #' @param X Dataframe of data to be fit.
  #' @param y Vector or matrix of true values for each row of \code{X}.
  #' @param upper.bounds Numeric vector of length \code{ncol(X)+1} giving upper
  #'   bounds on the values in each column of \code{X} and the values of
  #'   \code{y}. The last value in the vector is assumed to be the upper bound
  #'   on \code{y}, while the first \code{ncol(X)} values are assumed to be in
  #'   the same order as the corresponding columns of \code{X}. Any value in the
  #'   columns of \code{X} and in \code{y} larger than the corresponding upper
  #'   bound is clipped at the bound.
  #' @param lower.bounds Numeric vector of length \code{ncol(X)+1} giving lower
  #'   bounds on the values in each column of \code{X} and the values of
  #'   \code{y}. The last value in the vector is assumed to be the lower bound
  #'   on \code{y}, while the first \code{ncol(X)} values are assumed to be in
  #'   the same order as the corresponding columns of \code{X}. Any value in the
#'   columns of \code{X} and in \code{y} smaller than the corresponding lower
  #'   bound is clipped at the bound.
  #' @param add.bias Boolean indicating whether to add a bias term to \code{X}.
  #'   Defaults to FALSE.
  fit = function(X, y, upper.bounds, lower.bounds, add.bias=FALSE){
    # Assumptions:
    # If assumptions are not met in child class, implement
    #     preprocess/postprocess data functions so they are
    preprocess <- private$preprocess_data(X,y,upper.bounds,lower.bounds,add.bias)
    X <- preprocess$X
    y <- preprocess$y

    n <- length(y)
    p <- ncol(X)
    if (is.null(self$eps) || is.infinite(self$eps)) Delta <- 0
    else Delta <- 2*self$lambda/self$eps
    if (self$delta==0){
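      # Pure eps-DP case: draw b with density proportional to
      # exp(-eps*||b||/(2*zeta)) by pairing a Gamma-distributed l2-norm with
      # a uniformly random direction.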
      norm.b <- stats::rgamma(1, p, rate=self$eps/(2*self$zeta))
      direction.b <- stats::rnorm(p)
      direction.b <- direction.b/sqrt(sum(direction.b^2))
      b <- norm.b*direction.b
    } else{
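      # (eps, delta)-DP case: draw b from a spherical Gaussian whose
      # variance scales with zeta^2 and the privacy parameters.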
      mu <- numeric(p)
      Sigma <- ((self$zeta^2*(8*log(2/self$delta)+4*self$eps))/
                  (self$eps^2))*diag(p)
      b <- MASS::mvrnorm(n=1,mu,Sigma)
    }
    if (is.null(self$eps) || is.infinite(self$eps)) b <- numeric(p)
    b <- as.matrix(b)

    tmp.coeff <- private$optimize_coeff(X, y, Delta, b)
    self$coeff <- private$postprocess_coeff(tmp.coeff, preprocess)
    invisible(self)
  },
  #' @description Predict y values for given X using the fitted coefficients.
  #' @param X Dataframe of data on which to make predictions. Must be of same
  #'   form as \code{X} used to fit coefficients.
  #' @param add.bias Boolean indicating whether to add a bias term to \code{X}.
  #'   Defaults to FALSE. If add.bias was set to TRUE when fitting the
  #'   coefficients, add.bias should be set to TRUE for predictions.
  #'
  #' @return Matrix of predicted y values corresponding to each row of X.
  predict = function(X, add.bias=FALSE){
    if (add.bias){
      X <- dplyr::mutate(X, bias=1)
      X <- X[, c(ncol(X), 1:(ncol(X)-1))]
    }
    self$mapXy(as.matrix(X), self$coeff)
  }
), private=list(
  # description Preprocess input data and bounds to ensure they meet the
  #   assumptions necessary for fitting the model. If desired, a bias term can
  #   also be added.
  # param X Dataframe of data to be fit. Will be converted to a matrix.
  # param y Vector or matrix of true values for each row of X. Will be
  #   converted to a matrix.
  # param upper.bounds Numeric vector of length ncol(X)+1 giving upper bounds on
  #   the values in each column of X and the values in y. The last value in the
  #   vector is assumed to be the upper bound on y, while the first ncol(X)
  #   values are assumed to be in the same order as the corresponding columns of
  #   X. Any value in the columns of X and y larger than the corresponding
  #   upper bound is clipped at the bound.
  # param lower.bounds Numeric vector of length ncol(X)+1 giving lower bounds on
  #   the values in each column of X and the values in y. The last value in the
  #   vector is assumed to be the lower bound on y, while the first ncol(X)
  #   values are assumed to be in the same order as the corresponding columns of
  #   X. Any value in the columns of X and y smaller than the corresponding
  #   lower bound is clipped at the bound.
  # param add.bias Boolean indicating whether to add a bias term to X.
  # return A list of preprocessed values for X, y, upper.bounds, and
  #   lower.bounds for use in the privacy-preserving empirical risk
  #   minimization algorithm.
  preprocess_data = function(X, y, upper.bounds, lower.bounds, add.bias){
    # Make sure values are correct
    if (length(upper.bounds)!=(ncol(X)+1)){
      stop("Length of upper.bounds must be equal to the number of columns of X plus 1 (for y).")
    }
    if (length(lower.bounds)!=(ncol(X)+1)){
      stop("Length of lower.bounds must be equal to the number of columns of X plus 1 (for y).")
    }

    # Add bias if needed
    if (add.bias){
      X <- dplyr::mutate(X, bias=1)
      X <- X[, c(ncol(X), 1:(ncol(X)-1))]
      upper.bounds <- c(1,upper.bounds)
      lower.bounds <- c(1,lower.bounds)
    }

    # Make matrices for multiplication purposes
    X <- as.matrix(X,nrow=length(y))
    y <- as.matrix(y,ncol=1)

    # Clip based on provided bounds
    for (i in 1:(length(upper.bounds)-1)){
      X[X[,i]>upper.bounds[i],i] <- upper.bounds[i]
      X[X[,i]<lower.bounds[i],i] <- lower.bounds[i]
    }
    y[y>upper.bounds[length(upper.bounds)]] <- upper.bounds[length(upper.bounds)]
    y[y<lower.bounds[length(lower.bounds)]] <- lower.bounds[length(lower.bounds)]
    list(X=X, y=y, upper.bounds=upper.bounds, lower.bounds=lower.bounds)
  },
  # description Postprocess coefficients obtained by fitting the model using
  #   differential privacy to ensure they match original inputs X and y.
  #   Effectively undoes the processing done in preprocess_data.
  # param coeff Vector of coefficients obtained by fitting the model.
  # param preprocess List of values returned by preprocess_data and used to
  #   process coeff.
  # return The processed coefficient vector.
  postprocess_coeff = function(coeff, preprocess){
    coeff
  },
  # description Run numerical optimization method to find optimal coefficients
  #   for fitting model given training data and hyperparameters. This function
  #   builds the objective function based on the training data and the provided
  #   search space domain, and runs the constrained optimization algorithm
  #   nloptr from the nloptr package. If mapXy.gr, loss.gr, and regularizer.gr are
  #   all given in the construction of the object, the gradient of the objective
  #   function and the Jacobian of the constraint function are utilized for the
  #   algorithm, and the NLOPT_LD_MMA method is used. If one or more of these
  #   gradient functions are not given, the NLOPT_LN_COBYLA method is used.
  # param X Matrix of data to be fit.
  # param y Vector or matrix of true labels for each row of X.
  # param Delta Nonnegative real number set by the differentially private
  #   algorithm and required for constructing the objective function.
  # param b Numeric perturbation vector randomly drawn by the differentially
  #   private algorithm and required for constructing the objective function.
  # return Vector of fitted coefficients.
  optimize_coeff=function(X, y, Delta, b){
    n <- length(y)
    p <- ncol(X)

    # Get objective function
    objective <- function(par, X, y, Delta, b){
      as.numeric(sum(self$loss(self$mapXy(X,par),y))/n +
                   self$gamma*self$regularizer(par)/n + Delta*par%*%par/(2*n) +
                   t(b)%*%par/n)
    }

    g <- function(par, X, y, Delta, b) self$domain$constraints(par) # For new opt method

    # Get gradient function
    if (!is.null(self$mapXy.gr) && !is.null(self$loss.gr) &&
        !is.null(self$regularizer.gr)) {
      objective.gr <- function(par, X, y, Delta, b){
        as.numeric(self$mapXy.gr(X,par)%*%self$loss.gr(self$mapXy(X,par),y)/n +
                     self$gamma*self$regularizer.gr(par)/n + b/n + Delta*par/n)
      }
      alg <- "NLOPT_LD_MMA" # For new opt method
      g_jac <- function(par, X, y, Delta, b) self$domain$jacobian(par) # For new opt method
    }
    else {
      objective.gr <- NULL
      alg <- "NLOPT_LN_COBYLA" # For new opt method
      g_jac <- NULL # For new opt method
    }

    # Run optimization
    coeff0 <- numeric(ncol(X))

    # For old opt method
    # opt.res <- optim(coeff0, fn=objective, gr=objective.gr, method="CG",
    #                  X=X, y=y, Delta=Delta, b=b)
    # opt.res$par

    # For new opt method
    opts <- list("algorithm"=alg, xtol_rel = 1e-4)
    opt.res <- nloptr::nloptr(x0=coeff0, eval_f=objective,
                              eval_grad_f=objective.gr, eval_g_ineq=g,
                              eval_jac_g_ineq=g_jac, opts=opts,
                              X=X, y=y, Delta=Delta, b=b)
    opt.res$solution
  }
))
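
# Minimal standalone sketch (toy objective and values assumed) of the
# constrained optimization pattern used in optimize_coeff above: nloptr with
# NLOPT_LD_MMA minimizes a smooth objective subject to the l2-ball domain
# constraint coeff%*%coeff - length(coeff) <= 0, the same domain used by
# LinearRegressionDP below.
if (FALSE) {
  objective <- function(par) sum((par - c(2, 2))^2)
  objective.gr <- function(par) 2*(par - c(2, 2))
  g <- function(par) as.numeric(par%*%par - length(par))  # <= 0 inside domain
  g_jac <- function(par) 2*par
  opt.res <- nloptr::nloptr(x0=c(0, 0), eval_f=objective,
                            eval_grad_f=objective.gr, eval_g_ineq=g,
                            eval_jac_g_ineq=g_jac,
                            opts=list("algorithm"="NLOPT_LD_MMA", xtol_rel=1e-6))
  opt.res$solution  # lies on the boundary near c(1, 1) since ||c(2,2)||^2 > 2
}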

#' Privacy-preserving Linear Regression
#'
#' @description This class implements differentially private linear regression
#'   using the objective perturbation technique \insertCite{Kifer2012}{DPpack}.
#'
#' @details To use this class for linear regression, first use the \code{new}
#'   method to construct an object of this class with the desired function
#'   values and hyperparameters. After constructing the object, the \code{fit}
#'   method can be applied with a provided dataset and data bounds to fit the
#'   model. In fitting, the model stores a vector of coefficients \code{coeff}
#'   which satisfy differential privacy. These can be released directly, or used
#'   in conjunction with the \code{predict} method to privately predict the
#'   outcomes of new datapoints.
#'
#'   Note that in order to guarantee differential privacy for linear regression,
#'   certain constraints must be satisfied for the values used to construct the
#'   object, as well as for the data used to fit. The regularizer must be
#'   convex. Additionally, it is assumed that if x represents a single row of
#'   the dataset X, then the l2-norm of x is at most p for all x, where p is the
#'   number of predictors (including any possible intercept term). In order to
#'   ensure this constraint is satisfied, the dataset is preprocessed and
#'   scaled, and the resulting coefficients are postprocessed and un-scaled so
#'   that the stored coefficients correspond to the original data. Due to this
#'   constraint on x, it is best to avoid using an intercept term in the model
#'   whenever possible. If an intercept term must be used, the issue can be
#'   partially circumvented by adding a constant column to X before fitting the
#'   model, which will be scaled along with the rest of X. The \code{fit} method
#'   contains functionality to add a column of constant 1s to X before scaling,
#'   if desired.
#'
#' @references \insertRef{Kifer2012}{DPpack}
#'
#' @examples
#' # Build example dataset
#' n <- 500
#' X <- data.frame(X=seq(-1,1,length.out = n))
#' true.theta <- c(-.3,.5) # First element is bias term
#' p <- length(true.theta)
#' y <- true.theta[1] + as.matrix(X)%*%true.theta[2:p] + stats::rnorm(n=n,sd=.1)
#'
#' # Construct object for linear regression
#' regularizer <- 'l2' # Alternatively, function(coeff) coeff%*%coeff/2
#' eps <- 1
#' delta <- 0 # Indicates to use pure eps-DP
#' gamma <- 1
#' regularizer.gr <- function(coeff) coeff
#'
#' lrdp <- LinearRegressionDP$new('l2', eps, delta, gamma, regularizer.gr)
#'
#' # Fit with data
#' # The values of y are assumed to lie between -p and p (-2 and 2 for this
#' #   example)
#' upper.bounds <- c(1, 2) # Bounds for X and y
#' lower.bounds <- c(-1,-2) # Bounds for X and y
#' lrdp$fit(X, y, upper.bounds, lower.bounds, add.bias=TRUE)
#' lrdp$coeff # Gets private coefficients
#'
#' # Predict new data points
#' # Build a test dataset
#' Xtest <- data.frame(X=c(-.5, -.25, .1, .4))
#' predicted.y <- lrdp$predict(Xtest, add.bias=TRUE)
#'
#' @export
LinearRegressionDP <- R6::R6Class("LinearRegressionDP",
  inherit=EmpiricalRiskMinimizationDP.KST,
  public=list(
  #' @description Create a new LinearRegressionDP object.
  #' @param regularizer String or regularization function. If a string, must be
  #'   'l2', indicating to use l2 regularization. If a function, must have form
  #'   \code{regularizer(coeff)}, where \code{coeff} is a vector or matrix, and
  #'   return the value of the regularizer at \code{coeff}. See
  #'   \code{\link{regularizer.l2}} for an example. Additionally, in order to
  #'   ensure differential privacy, the function must be convex.
  #' @param eps Positive real number defining the epsilon privacy budget. If set
  #'   to Inf, runs algorithm without differential privacy.
  #' @param delta Nonnegative real number defining the delta privacy parameter.
  #'   If 0, reduces to pure eps-DP.
  #' @param gamma Nonnegative real number representing the regularization
  #'   constant.
  #' @param regularizer.gr Optional function representing the gradient of the
  #'   regularization function with respect to \code{coeff} and of the form
  #'   \code{regularizer.gr(coeff)}. Should return a vector. See
  #'   \code{\link{regularizer.gr.l2}} for an example. If \code{regularizer} is
  #'   given as a string, this value is ignored. If not given and
  #'   \code{regularizer} is a function, non-gradient based optimization methods
  #'   are used to compute the coefficient values in fitting the model.
  #'
  #' @return A new LinearRegressionDP object.
  initialize = function(regularizer, eps, delta, gamma, regularizer.gr=NULL){
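      # Domain constraint: coeff%*%coeff - length(coeff) <= 0, i.e. the l2
      #   ball ||coeff||^2 <= p, with its Jacobian 2*coeff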
      domain.linear <- list("constraints"=function(coeff) coeff%*%coeff-length(coeff),
                            "jacobian"=function(coeff) 2*coeff)
      super$initialize(mapXy.linear, loss.squared.error, regularizer, eps, delta,
                       domain.linear, NULL, NULL, gamma,
                       mapXy.gr=mapXy.gr.linear, loss.gr=loss.gr.squared.error,
                       regularizer.gr=regularizer.gr)
    },
  #' @description Fit the differentially private linear regression model. The
  #'   function runs the objective perturbation algorithm
  #'   \insertCite{Kifer2012}{DPpack} to generate an objective function. A
  #'   numerical optimization method is then run to find optimal coefficients
  #'   for fitting the model given the training data and hyperparameters. The
  #'   \code{\link{nloptr}} function is used. If \code{regularizer} is given as
  #'   'l2' or if \code{regularizer.gr} is given in the construction of the
  #'   object, the gradient of the objective function and the Jacobian of the
  #'   constraint function are utilized for the algorithm, and the NLOPT_LD_MMA
  #'   method is used. If this is not the case, the NLOPT_LN_COBYLA method is
  #'   used. The resulting privacy-preserving coefficients are stored in coeff.
  #' @param X Dataframe of data to be fit.
  #' @param y Vector or matrix of true values for each row of \code{X}.
  #' @param upper.bounds Numeric vector of length \code{ncol(X)+1} giving upper
  #'   bounds on the values in each column of \code{X} and the values of
  #'   \code{y}. The last value in the vector is assumed to be the upper bound
  #'   on \code{y}, while the first \code{ncol(X)} values are assumed to be in
  #'   the same order as the corresponding columns of \code{X}. Any value in the
  #'   columns of \code{X} and in \code{y} larger than the corresponding upper
  #'   bound is clipped at the bound.
  #' @param lower.bounds Numeric vector of length \code{ncol(X)+1} giving lower
  #'   bounds on the values in each column of \code{X} and the values of
  #'   \code{y}. The last value in the vector is assumed to be the lower bound
  #'   on \code{y}, while the first \code{ncol(X)} values are assumed to be in
  #'   the same order as the corresponding columns of \code{X}. Any value in the
#'   columns of \code{X} and in \code{y} smaller than the corresponding lower
  #'   bound is clipped at the bound.
  #' @param add.bias Boolean indicating whether to add a bias term to \code{X}.
  #'   Defaults to FALSE.
  fit = function(X, y, upper.bounds, lower.bounds, add.bias=FALSE){
    super$fit(X,y,upper.bounds,lower.bounds,add.bias)
  }
), private=list(
  # description Preprocess input data and bounds to ensure they meet the
  #   assumptions necessary for fitting the model. If desired, a bias term can
  #   also be added.
  # param X Dataframe of data to be fit. Will be converted to a matrix.
  # param y Vector or matrix of true values for each row of X. Will be
  #   converted to a matrix.
  # param upper.bounds Numeric vector of length ncol(X)+1 giving upper bounds on
  #   the values in each column of X and the values in y. The last value in the
  #   vector is assumed to be the upper bound on y, while the first ncol(X)
  #   values are assumed to be in the same order as the corresponding columns of
  #   X. Any value in the columns of X and y larger than the corresponding
  #   upper bound is clipped at the bound.
  # param lower.bounds Numeric vector of length ncol(X)+1 giving lower bounds on
  #   the values in each column of X and the values in y. The last value in the
  #   vector is assumed to be the lower bound on y, while the first ncol(X)
  #   values are assumed to be in the same order as the corresponding columns of
  #   X. Any value in the columns of X and y smaller than the corresponding
  #   lower bound is clipped at the bound.
  # param add.bias Boolean indicating whether to add a bias term to X.
  # return A list of preprocessed values for X, y, upper.bounds, and
  #   lower.bounds for use in the privacy-preserving empirical risk
  #   minimization algorithm.
  preprocess_data=function(X, y, upper.bounds, lower.bounds, add.bias){
    res <- super$preprocess_data(X,y,upper.bounds,lower.bounds, add.bias)
    X <- res$X
    y <- res$y
    upper.bounds <- res$upper.bounds
    lower.bounds <- res$lower.bounds

    # Set necessary values for linear regression
    p <- ncol(X)
    self$zeta <- 2*p^(3/2)
    self$lambda <- p

    # Process X
    lb <- lower.bounds[1:(length(lower.bounds)-1)]
    ub <- upper.bounds[1:(length(upper.bounds)-1)]
    tmp <- matrix(c(lb,ub),ncol=2)
    scale.X <- apply(abs(tmp),1,max)
    # Each row must have l2-norm at most sqrt(p); scaling each column by its
    #   bound (above) already achieves this
    X.norm <- t(t(X)/scale.X)

    # Process y
    lb.y <- lower.bounds[length(lower.bounds)]
    ub.y <- upper.bounds[length(upper.bounds)]
    if (add.bias) {
      shift.y <- lb.y + (ub.y - lb.y)/2 # Subtracting this centers at 0
      scale.y <- (ub.y-lb.y)/(2*p) # Dividing by this scales to max size p in abs()
    }
    else {
      # Cannot have shift term if no bias term to re-absorb it in post-processing
      shift.y <- 0
      # Dividing by this scales to max size p in abs()
      scale.y <- max(abs(c(lb.y, ub.y)))/p
    }

    y.norm <- (y-shift.y)/scale.y

    list(X=X.norm,y=y.norm, scale.X=scale.X, shift.y=shift.y, scale.y=scale.y)
  },
  # description Postprocess coefficients obtained by fitting the model using
  #   differential privacy to ensure they match original inputs X and y.
  #   Effectively undoes the processing done in preprocess_data.
  # param coeff Vector of coefficients obtained by fitting the model.
  # param preprocess List of values returned by preprocess_data and used to
  #   process coeff.
  # return The processed coefficient vector.
  postprocess_coeff=function(coeff, preprocess){
    # Undo X.norm
    coeff <- coeff/preprocess$scale.X

    # Undo y.norm
    coeff <- coeff*preprocess$scale.y
    coeff[1] <- coeff[1] + preprocess$shift.y # Assumes coeff[1] is bias term
    coeff
  }
))
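
# A minimal sketch (assumed values, not package code) of how the scaling done
# in preprocess_data is undone by postprocess_coeff: coefficients fit on the
# scaled data recover the coefficients on the original scale.
#
# set.seed(1)
# X <- cbind(1, matrix(runif(20, -1, 1), ncol=2)) # bias column + 2 predictors
# theta <- c(.5, -.3, .8)
# y <- X %*% theta
# scale.X <- c(1, 2, 3); shift.y <- 0.5; scale.y <- 1.25 # assumed values
# X.norm <- t(t(X)/scale.X)             # scale columns, as in preprocess_data
# y.norm <- (y - shift.y)/scale.y       # center and scale y
# coeff <- qr.solve(X.norm, y.norm)     # (non-private) fit on scaled data
# coeff <- coeff/scale.X*scale.y        # undo scaling, as in postprocess_coeff
# coeff[1] <- coeff[1] + shift.y        # re-absorb shift into the bias term
# all.equal(as.numeric(coeff), theta)   # TRUE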

#' Privacy-preserving Hyperparameter Tuning for Linear Regression Models
#'
#' This function implements the privacy-preserving hyperparameter tuning
#' function for linear regression \insertCite{Kifer2012}{DPpack} using the
#' exponential mechanism. It accepts a list of models with various chosen
#' hyperparameters, a dataset X with corresponding values y, upper and lower
#' bounds on the columns of X and the values of y, and a boolean indicating
#' whether to add bias in the construction of each of the models. The data are
#' split into m+1 equal groups, where m is the number of models being compared.
#' One group is set aside as the validation group, and each of the other m
#' groups is used to train one of the given m models. The negative of the sum
#' of the squared error for each model on the validation set is used as the
#' utility values in the exponential mechanism
#' (\code{\link{ExponentialMechanism}}) to select a tuned model in a
#' privacy-preserving way.
#'
#' @param models Vector of linear regression model objects, each initialized
#'   with a different combination of hyperparameter values from the search space
#'   for tuning. Each model should be initialized with the same epsilon privacy
#'   parameter value eps. The tuned model satisfies eps-level differential
#'   privacy.
#' @param X Dataframe of data to be used in tuning the model. Note it is assumed
#'   the data rows and corresponding labels are randomly shuffled.
#' @param y Vector or matrix of true values for each row of X.
#' @param upper.bounds Numeric vector giving upper bounds on the values in each
#'   column of X and the values in y. Should be length ncol(X)+1. The first
#'   ncol(X) values are assumed to be in the same order as the corresponding
#'   columns of X, while the last value in the vector is assumed to be the upper
#'   bound on y. Any value in the columns of X and y larger than the
#'   corresponding upper bound is clipped at the bound.
#' @param lower.bounds Numeric vector giving lower bounds on the values in each
#'   column of X and the values in y. Should be length ncol(X)+1. The first
#'   ncol(X) values are assumed to be in the same order as the corresponding
#'   columns of X, while the last value in the vector is assumed to be the lower
#'   bound on y. Any value in the columns of X and y smaller than the
#'   corresponding lower bound is clipped at the bound.
#' @param add.bias Boolean indicating whether to add a bias term to X. Defaults
#'   to FALSE.
#' @return Single model object selected from the input list models with tuned
#'   parameters.
#' @examples
#' # Build example dataset
#' n <- 500
#' X <- data.frame(X=seq(-1,1,length.out = n))
#' true.theta <- c(-.3,.5) # First element is bias term
#' p <- length(true.theta)
#' y <- true.theta[1] + as.matrix(X)%*%true.theta[2:p] + stats::rnorm(n=n,sd=.1)
#'
#' # Grid of possible gamma values for tuning linear regression model
#' grid.search <- c(100, 1, .0001)
#'
#' # Construct objects for linear regression parameter tuning
#' # Privacy budget should be the same for all models
#' eps <- 1
#' delta <- 0.01
#' linrdp1 <- LinearRegressionDP$new("l2", eps, delta, grid.search[1])
#' linrdp2 <- LinearRegressionDP$new("l2", eps, delta, grid.search[2])
#' linrdp3 <- LinearRegressionDP$new("l2", eps, delta, grid.search[3])
#' models <- c(linrdp1, linrdp2, linrdp3)
#'
#' # Tune using data and bounds for X and y based on their construction
#' upper.bounds <- c( 1, 2) # Bounds for X and y
#' lower.bounds <- c(-1,-2) # Bounds for X and y
#' tuned.model <- tune_linear_regression_model(models, X, y, upper.bounds,
#'                                             lower.bounds, add.bias=TRUE)
#' tuned.model$gamma # Gives resulting selected hyperparameter
#'
#' # The tuned.model result can be used like a trained LinearRegressionDP model
#' tuned.model$coeff # Gives coefficients for tuned model
#'
#' # Build a test dataset for prediction
#' Xtest <- data.frame(X=c(-.5, -.25, .1, .4))
#' predicted.y <- tuned.model$predict(Xtest, add.bias=TRUE)
#'
#' @references \insertRef{Kifer2012}{DPpack}
#'
#' @export
tune_linear_regression_model <- function(models, X, y, upper.bounds,
                                         lower.bounds, add.bias=FALSE){
  # Determine number of models and number of data points
  m <- length(models)
  n <- length(y)
  # Split data into m+1 groups
  validateX <- X[seq(m+1,n,m+1),,drop=FALSE]
  validatey <- y[seq(m+1,n,m+1)]
  z <- numeric(m)
  for (i in 1:m){
    subX <- X[seq(i,n,m+1),,drop=FALSE]
    suby <- y[seq(i,n,m+1)]
    models[[i]]$fit(subX,suby,upper.bounds,lower.bounds,add.bias)
    validatey.hat <- models[[i]]$predict(validateX,add.bias)
    z[i] <- sum((validatey - validatey.hat)^2)
  }
  # Bounds for y give global sensitivity for exponential mechanism in linear
  #    regression case. Sensitivity can be computed as square of difference
  #    between upper and lower bounds.
  ub.y <- upper.bounds[length(upper.bounds)]
  lb.y <- lower.bounds[length(lower.bounds)]
  res <- ExponentialMechanism(-z, models[[1]]$eps, (ub.y-lb.y)^2,
                              candidates=models)
  res[[1]]
}
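
# Illustration (not package code) of the interleaved (m+1)-way split used by
# tune_linear_regression_model: with m = 3 models and n = 8 rows, rows are
# assigned round-robin and group m+1 is held out for validation.
#
# m <- 3; n <- 8
# lapply(1:(m+1), function(i) seq(i, n, m+1))
# # [[1]] 1 5  <- rows used to train models[[1]]
# # [[2]] 2 6  <- rows used to train models[[2]]
# # [[3]] 3 7  <- rows used to train models[[3]]
# # [[4]] 4 8  <- validation rows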
