# R/workHorse.R In penplaqr/penplaqr:

#' Check function.
#'
#' \code{check} computes the "check" loss function used in quantile regression.
#'
#' @param tau a numeric-valued scalar between 0 and 1.
#' @param x a numeric-valued scalar or vector.
#'
#' @return a numeric or vector of numerics of length \code{length(x)}.
check <- function(tau, x){

  # rho_tau(u) = u * (tau - I(u < 0)): positive residuals are weighted by tau,
  # negative residuals by (tau - 1).
  pos_part <- pmax(x, 0)
  neg_part <- pmin(x, 0)

  tau * pos_part + (tau - 1) * neg_part
}

#' High-dimensional BIC.
#'
#' \code{QBIC} computes the high-dimensional BIC described in Sherwood and Wang (2016).
#'
#' @param tau a numeric scalar between 0 and 1; the quantile of interest.
#' @param resids a vector of residuals from the candidate model.
#' @param p_linear the number of candidate linear covariates.
#' @param degfree the degrees of freedom of the candidate model.
#'
#' @return a numeric scalar; smaller values indicate a preferred model.
#'
#' @export
QBIC <- function(tau, resids, p_linear, degfree){

  n <- length(resids)

  # Log of the summed check loss, plus a complexity penalty that grows with
  # the model's degrees of freedom and shrinks with the sample size.
  log(sum(check(tau, resids))) + degfree*log(p_linear)*log(log(n))/(2*n)

}

#' Augmented Data.
#'
#' \code{augment_data} creates an augmented dataset for use in the weighted
#' unpenalized quantile regression as part of the algorithm described in
#' Sherwood and Wang (2016) for solving the penalized quantile regression problem.
#'
#' @param n_linvars the number of predictors modeled with linear terms
#' @param n_nonlinvars the number of predictors modeled non-linearly
#'
#' @return a data frame with \code{2 * n_linvars} rows and
#' \code{1 + n_linvars + n_nonlinvars} columns.
augment_data <- function(n_linvars, n_nonlinvars){

  n_cols <- 1 + n_linvars + n_nonlinvars

  # Two pseudo-observations per linear covariate: a +1 and a -1 indicator in
  # that covariate's column (columns 2..n_linvars+1), zeros everywhere else
  # (including column 1, which holds the response).
  plus_block <- matrix(0, nrow = n_linvars, ncol = n_cols)
  minus_block <- matrix(0, nrow = n_linvars, ncol = n_cols)

  plus_block[, 2:(n_linvars + 1)] <- diag(1, n_linvars, n_linvars)
  minus_block[, 2:(n_linvars + 1)] <- diag(-1, n_linvars, n_linvars)

  as.data.frame(rbind(plus_block, minus_block))
}

#' Penalty weights.
#'
#' \code{penalty_weights} calculates the weights associated with the
#' observations in the augmented dataset used in fitting the penalized partially
#' linear quantile regression.
#' @param beta_vec a numeric vector containing the current estimates of the
#' regression parameters from the linear portion of the model.
#' @param n the number of observations in the data set.
#' @param penalty_type a character string indicating which penalty function
#' to use; one of "LASSO", "SCAD", or "MCP". Ignored if \code{penalty_deriv}
#' is supplied.
#' @param penalty_deriv a user-specified function for use in creating the weights.
#' @param lambda a scalar corresponding to the tuning parameter.
#' @param a a scalar for use in the SCAD and MCP penalty functions. Defaults
#' to the conventional values (3.7 for SCAD, 3 for MCP) when NULL.
#' @param ... (optional) additional arguments to be passed to the user-supplied penalty derivative function
#'
#' @details The weights are based on the derivative of the penalty function
#' used in defining the penalized regression problem. There are two ways for specifying
#' the derivative of the penalty function. First, the user can provide the name of the penalty
#' function through the argument penalty_type. Based off this specification, the derivative is then
#' computed using built-in functions. The current options for the penalty function are
#' "LASSO", "SCAD" and "MCP". Alternatively, the user can provide their own
#' penalty derivative function. This function should take as an argument a numeric vector
#' and return a numeric vector of the same length.
#'
#' @return a vector of length (n + 2*length(beta_vec)) containing the weights associated
#' with the observations in the augmented data set.
penalty_weights <- function(beta_vec, n, penalty_type = NULL, penalty_deriv = NULL, lambda, a = NULL, ...){

  if(is.null(penalty_type) && is.null(penalty_deriv))
    stop("You must either specify a penalty function or provide a penalty derivative")

  deriv_vals <- NULL

  # Only validate/use penalty_type when it was actually supplied; the original
  # code evaluated NULL %in% c(...) here, which errored whenever only a
  # user-supplied derivative function was given.
  if(!is.null(penalty_type)){

    if(!(penalty_type %in% c("LASSO", "SCAD", "MCP")))
      stop("The penalty function you specified is not supported")

    # LASSO penalty derivative is constant at lambda for nonnegative input.
    if(penalty_type == "LASSO") deriv_vals <- rep(lambda, length(beta_vec))

    if(penalty_type == "SCAD"){
      if(is.null(a)) a <- 3.7  # conventional SCAD tuning constant
      deriv_vals <- rqPen::scad_deriv(beta_vec, lambda, a)
    }

    if(penalty_type == "MCP"){
      if(is.null(a)) a <- 3    # conventional MCP tuning constant
      deriv_vals <- rqPen::mcp_deriv(beta_vec, lambda, a)
    }
  }

  # A user-supplied derivative function takes precedence over penalty_type.
  if(!is.null(penalty_deriv)) deriv_vals <- penalty_deriv(beta_vec, lambda, ...)

  # Weight 1 for each real observation, then one derivative value for each of
  # the two pseudo-observations per linear coefficient.
  c(rep(1, n), deriv_vals, deriv_vals)

}

#' Partially linear penalized quantile regression
#'
#' \code{penplaqr} fits a partially linear penalized quantile regression.
#' @param formula a formula object containing the response variable on the LHS and the variables to be modeled with
#' linear terms on the RHS.
#' @param nonlinVars a formula object with an empty LHS and the variables to be modeled non-linearly on the RHS.
#' @param tau a scalar indicating the desired quantile.
#' @param lambda a scalar indicating the value of the tuning parameter.
#' @param penalty a character string indicating which penalty should be used in fitting the model.
#' @param penalty_deriv an optional user-specified function for computing the derivative of the penalty function.
#' @param a additional tuning parameter for certain penalty functions.
#' @param epsilon scalar value for determining when convergence has been reached.
#' @param data a data frame
#' @param subset,na.action,model,contrasts passed on to \code{quantreg::rq}.
#' @param method fitting method passed on to \code{quantreg::rq}; one of "br" or "fn".
#' @param init_beta optional numeric vector of starting values for the linear coefficients.
#' @param splinesettings optional list (one element per nonlinear covariate) of
#' \code{bs()} settings (\code{df}, \code{knots}, \code{degree}, \code{Boundary.knots}).
#' @param ... additional arguments passed on to \code{quantreg::rq}.
#' @return the fitted model, an object of class \code{"rq"}.
#' @import quantreg
#' @import splines
#' @export
penplaqr <- function(formula, nonlinVars = NULL, tau = .5, lambda = NULL, penalty = "SCAD", penalty_deriv = NULL, a = NULL, data = NULL, subset,
                     na.action, method = c("br", "fn"), model = TRUE, contrasts = NULL, init_beta = NULL,
                     splinesettings = NULL, epsilon = 1e-6, ...){

  if(length(tau) > 1) stop("tau must be a number (not a vector) strictly between 0 and 1.")
  # inherits() is the robust class test; class() may return a vector, which
  # breaks a scalar != comparison.
  if(!is.null(nonlinVars) && !inherits(nonlinVars, "formula")){
    stop("nonlinVars must be of class \"formula\" or NULL. NULL by default.\n")
  }
  # rq expects a single method string, not the full default vector.
  method <- match.arg(method)

  penplaqrcall <- match.call()
  rqcall <- penplaqrcall
  nonlinvars <- NULL
  linvars <- attr(terms(formula, data = data), "term.labels")

  # creating spline terms for non-linear portion of the model
  if(is.null(nonlinVars)){
    int <- ifelse(attr(terms(formula, data = data), "intercept") == 1, "1", "0")
    rqcall$formula <- update(formula, paste(c("~", linvars, int), collapse = "+"))
  } else {
    nonlinvars <- attr(terms(nonlinVars, data = data), "term.labels")
    nonlinvars <- nonlinvars[!(nonlinvars %in% all.vars(formula))]
    linvars <- linvars[!(linvars %in% nonlinvars)]
    nonlinvarsbs <- rep(NA, length(nonlinvars))
    if(length(splinesettings) != length(nonlinvarsbs) && !is.null(splinesettings))
      stop("splinesettings must either be NULL or a list with length = number of nonlinear covariates \n (see details ?plaqr)")

    # Build a bs() spline term (as text) for each nonlinear covariate,
    # honoring any user-supplied spline settings.
    bslist <- vector("list", length(nonlinvars))
    for(i in seq_along(nonlinvars)){
      bslist[[i]] <- paste("bs(", nonlinvars[i], sep = "")

      if(!is.null(splinesettings[[i]]$df))
        bslist[[i]] <- paste(bslist[[i]], ", df=", splinesettings[[i]]$df, sep = "")
      if(!is.null(splinesettings[[i]]$knots))
        bslist[[i]] <- paste(bslist[[i]], ", knots=c(",
                             toString(splinesettings[[i]]$knots), ")", sep = "")
      if(!is.null(splinesettings[[i]]$degree))
        bslist[[i]] <- paste(bslist[[i]], ", degree=", splinesettings[[i]]$degree, sep = "")
      if(!is.null(splinesettings[[i]]$Boundary.knots))
        bslist[[i]] <- paste(bslist[[i]], ", Boundary.knots=c(",
                             toString(splinesettings[[i]]$Boundary.knots), ")", sep = "")

      bslist[[i]] <- paste(bslist[[i]], ")", sep = "")
    }

    nonlinvarsbs <- do.call(c, bslist)
    rqcall$formula <- update(formula, paste(c("~", "1", linvars, nonlinvarsbs),
                                            collapse = "+"))
  }

  # creating augmented dataset
  linear_terms <- attr(terms(formula, data = data), "term.labels")

  # terms(NULL) errors, so treat a missing nonlinear formula (the default)
  # as an empty set of nonlinear terms.
  if(is.null(nonlinVars)){
    nonlinear_terms <- character(0)
  } else {
    nonlinear_terms <- attr(terms(nonlinVars, data = data), "term.labels")
  }

  response_var <- all.vars(update(formula, . ~ 1))

  # removing variables included in non-linear terms from set of linear terms
  linear_terms <- setdiff(linear_terms, nonlinear_terms)

  temp_dat <- data[, c(response_var, linear_terms, nonlinear_terms)]

  extra_observations <- augment_data(length(linear_terms), length(nonlinear_terms))
  names(extra_observations) <- names(temp_dat)
  augmented_data <- rbind(temp_dat, extra_observations)

  # calculating weights at initial value for beta
  if(is.null(init_beta)) init_beta <- rep(0, length(linear_terms))

  init_weights <- penalty_weights(abs(init_beta), n = nrow(data),
                                  penalty_type = penalty, penalty_deriv = penalty_deriv,
                                  lambda = lambda, a = a)

  # Redirect the captured call to quantreg::rq; only element [[1]] (the
  # function) is replaced. Drop the arguments rq does not accept, and splice
  # in the augmented data and current weights.
  rqcall[[1]] <- as.name("rq")
  rqcall$nonlinVars <- rqcall$splinesettings <- rqcall$lambda <- rqcall$penalty <- rqcall$epsilon <- NULL
  rqcall$penalty_deriv <- rqcall$init_beta <- rqcall$a <- NULL
  rqcall$data <- augmented_data
  rqcall$method <- method
  rqcall$weights <- init_weights

  # running first iteration
  first_fit <- eval.parent(rqcall)
  class(first_fit) <- "rq"

  current_beta <- coef(first_fit)[linear_terms]
  prev_beta <- init_beta
  iter <- 1

  # Iteratively reweighted fitting: refresh the penalty weights from the
  # latest linear coefficients and refit until they stop moving.
  while(sum((current_beta - prev_beta)^2) > epsilon){

    prev_beta <- current_beta

    current_weights <- penalty_weights(abs(prev_beta), n = nrow(data),
                                       penalty_type = penalty, penalty_deriv = penalty_deriv,
                                       lambda = lambda, a = a)

    rqcall$weights <- current_weights
    current_fit <- eval.parent(rqcall)
    class(current_fit) <- "rq"

    current_beta <- coef(current_fit)[linear_terms]

    iter <- iter + 1

  }

  if(iter == 1) return(first_fit)
  current_fit

}

#' Average Number of False Variables.
#'
#' Given coefficient estimates from a set of simulations, \code{fv} calculates the average number of
#' zero-valued coefficients that are incorrectly selected to be in the model.
#'
#' @param model_coefficients a vector or matrix containing the coefficient estimates. If a matrix,
#' then the coefficients should vary by row and the simulations should vary by column.
#' @param true_coefficients a vector of indices of the true non-zero coefficients
#' @param threshold a scalar threshold below which coefficients (in absolute
#' value) should be set to zero
#'
#' @export
#'
#' @return a scalar indicating the average number of zero-valued coefficients incorrectly selected to be in the model
fv <- function(model_coefficients, true_coefficients, threshold){

  # A matrix with more than one column holds one simulation per column; count
  # false selections per simulation, then average. Otherwise (a vector or a
  # one-column matrix) the "average" is just the count for that single fit.
  # Selection is judged on magnitude so large negative estimates count too.
  if(is.matrix(model_coefficients) && ncol(model_coefficients) > 1){

    # drop = FALSE keeps a matrix even when only one false row remains
    mean(colSums(abs(model_coefficients[-true_coefficients, , drop = FALSE]) > threshold))

  } else{

    sum(abs(model_coefficients[-true_coefficients]) > threshold)

  }
}

#' Average Number of True Variables
#'
#' Given coefficient estimates from a set of simulations, \code{tv} calculates the average number of
#' non-zero valued coefficients that are correctly selected to be in the model.
#'
#' @param model_coefficients a vector or matrix containing the coefficient estimates. If a matrix,
#' then the coefficients should vary by row and the simulations should vary by column.
#' @param true_coefficients a vector of indices of the true non-zero coefficients
#' @param threshold a scalar threshold below which coefficients (in absolute
#' value) should be set to zero
#'
#' @export
#'
#' @return a scalar indicating the average number of non-zero valued coefficients correctly selected to be in the model
tv <- function(model_coefficients, true_coefficients, threshold){

  # A matrix with more than one column holds one simulation per column; count
  # correct selections per simulation, then average. Otherwise (a vector or a
  # one-column matrix) the "average" is just the count for that single fit.
  # Selection is judged on magnitude so large negative estimates count too.
  if(is.matrix(model_coefficients) && ncol(model_coefficients) > 1){

    # drop = FALSE keeps a matrix even when only one true row is indexed
    mean(colSums(abs(model_coefficients[true_coefficients, , drop = FALSE]) > threshold))

  } else{

    sum(abs(model_coefficients[true_coefficients]) > threshold)

  }
}

#' Probability of Selecting the True Model.
#'
#' Given coefficient estimates from a set of simulations, \code{prob_true}
#' calculates the proportion of simulations that selected the true model exactly.
#'
#' @param model_coefficients a vector or matrix of coefficient estimates. If a matrix,
#' then the coefficients should vary by row and the simulations should vary by column.
#' @param true_coefficients a vector of indices of the true non-zero coefficients.
#' @param threshold a scalar value below which a coefficient (in absolute
#' value) should be considered equal to zero.
#'
#' @export
#' @return a scalar indicating the proportion of simulations in which the true
#' model was selected exactly.
prob_true <- function(model_coefficients, true_coefficients, threshold){

  # TRUE when a single simulation's estimates select exactly the true model:
  # every true coefficient exceeds the threshold in magnitude and no other
  # coefficient does.
  exact_fit <- function(est){
    all(abs(est[true_coefficients]) > threshold) &&
      !any(abs(est[-true_coefficients]) > threshold)
  }

  if(is.matrix(model_coefficients)){

    # Evaluate the exact-selection condition column by column (one simulation
    # per column) and average the indicator.
    mean(apply(model_coefficients, 2, exact_fit))

  } else{

    exact_fit(model_coefficients)

  }

}

#' Mean Squared Error.
#'
#' Given coefficient estimates from a set of simulations, \code{mse} calculates
#' the mean squared error of the estimates averaged across all simulations.
#'
#' @param model_coefficients a vector or matrix of coefficient estimates. If a matrix,
#' then the coefficients should vary by row and the simulations should vary by column.
#' @param beta0 a numeric vector containing the true coefficient values.
#' @param threshold a scalar threshold below which coefficients (in absolute
#' value) should be set to zero.
#'
#' @return a scalar: the squared-error sum for a vector of estimates, or its
#' mean across columns for a matrix of estimates.
mse <- function(model_coefficients, beta0, threshold){

  # Hard-threshold on magnitude: only near-zero estimates are zeroed.
  # (Comparing the raw value would wrongly zero every negative coefficient.)
  model_coefficients[abs(model_coefficients) < threshold] <- 0

  if(is.matrix(model_coefficients)){

    # beta0 recycles down each column since coefficients vary by row
    mean(colSums((model_coefficients - beta0)^2))

  } else{

    sum((model_coefficients - beta0)^2)

  }

}
# penplaqr/penplaqr documentation built on May 6, 2019, 11:44 a.m.