#' gpls: Generalized Projection to Latent Structues
#'
#' This implements partial least squares regression (projection to latent structures) with the addition of
#' poisson and binomial likelihood functions. Only univariate responses are supported.
#'
#' @title gpls: Generalized Projection to Latent Structues
#' @param x argument
#' @param ... other arguments
#' @examples
#' gpls()
#'
#' @rdname gpls
#' @export gpls
gpls <- function(x, ...){
UseMethod("gpls")
}
#' inverse link functions for gpls models
#'
#' @param x input
#' @param family family name
#' @param link link function
#'
#' @export .linkinv_gpls
#' @return the linear predictor transformed to the scale of the observations
#' @examples
#' .linkinv_gpls(x, family="gaussian", link="identity")
#' @keywords internal
.linkinv_gpls <- function(x, family="gaussian", link="identity") {
if (family=="gaussian" ){
if (link=="identity") x
else cat("link function not recognized for ", family, "only identity is available!\n")
}
else if (family=="binomial") {
if (link=="logit") {
ifelse(is.infinite(exp(x)), 1, plogis(x))
}
else if (link=="probit") {
ifelse(is.infinite(exp(x)), 1, pnorm(x))
}
else if (link=="cloglog"){
ifelse(is.infinite(exp(x)), 1, 1-exp(-exp(x)))
}
else if (link=="cauchit") {
thresh <- -qcauchy(.Machine$double.eps)
eta <- pmin(pmax(x, -thresh), thresh)
pcauchy(eta)
}
else if (link=="robit") {
thresh <- -qt(.Machine$double.eps, df = 3)
eta <- pmin(pmax(x, -thresh), thresh)
pt(eta, df = 3)
}
else cat("link function not recognized for ", family, " only logit, probit, cauchit, robit, and cloglog are available!\n")
}
else if (family=="poisson") {
if (link=="log") pmax(.Machine$double.eps, exp(x))
else cat("link function not recognized for ", family, " only log is available!\n" )
}
else if (family=="Gamma") {
if (link=="inverse") pmax(.Machine$double.eps, 1/x)
else cat("link function not recognized for ", family, " only inverse is available!\n" )
}
else if (family=="inverse.gaussian") {
if (link=="1/mu^2") pmax(.Machine$double.eps, 1 / sqrt(x))
else cat("link function not recognized for ", family, " only invsquare is available!\n" )
}
else if (family=="negative.binomial") {
if (link=="log") pmax(.Machine$double.eps, exp(x))
else cat("link function not recognized for ", family, " only log is available!\n" )
}
}
#' utility function for the derivative of linear predictor in gpls models
#'
#' @param eta input
#' @param family family
#' @param link link
#'
#' @export
#' @return the first derivative of the linear predictor
#' @examples
#' .mueta_gpls(eta, family="gaussian", link="identity")
#'
#' @keywords internal
.mueta_gpls <- function(eta, family="gaussian", link="identity") {
if (family=="gaussian" ){
if (link=="identity") rep(1,length(eta))
else cat("link function not recognized for ", family, " only identity is available!\n")}
else if (family=="binomial") {
if (link=="logit") pmax(exp(eta)/(1+exp(eta))^2, .Machine$double.eps)
else if (link=="probit") pmax(dnorm(eta), .Machine$double.eps)
else if (link=="cloglog") pmax(exp(-exp(eta))*exp(eta), .Machine$double.eps)
else if (link=="cauchit") pmax(dcauchy(eta), .Machine$double.eps)
else if (link=="robit") pmax(dt(eta, df = 3), .Machine$double.eps)
else cat("link function not recognized for ", family, " only logit, probit, cauchit, robit, and cloglog are available!\n")
}
else if (family=="poisson") {
if (link=="log") exp(eta)
if (link=="sqrt") 2*eta
else cat("link function not recognized for ", family, " only log is available!\n" )
}
else if (family=="Gamma") {
if (link=="inverse") -1/(eta^2)
else if (link=="identity") rep(1,length(eta))
else if (link=="log") exp(eta)
else cat("link function not recognized for ", family, " only inverse is available!\n" )
}
else if (family=="inverse.gaussian") {
if (link=="1/mu^2") -1/(2*eta^1.5)
else cat("link function not recognized for ", family, " only invsquare is available!\n" )
}
else if (family=="negative.binomial") {
if (link=="log") exp(eta)
else cat("link function not recognized for ", family, " only log is available!\n" )
}
}
#' link function utility for gpls models
#' @param x input
#' @param family family
#' @param link link
#'
#' @export
#' @return the linear predictor
#' @keywords internal
.linkfun_gpls <-function(x, family="gaussian", link="identity") {
if (family=="gaussian" ){
if (link=="identity") x
else cat("link function not recognized for ", family, " only identity is available!\n")
}
if (family=="binomial") {
if (link=="logit") log(x/(1-x))
else if (link=="probit") qnorm(x)
else if (link=="cloglog") log(-log(1-x))
else if (link=="cauchit") qcauchy(x)
else if (link=="robit") qt(x, df = 3)
else cat("link function not recognized for ", family, " only logit, probit, cauchit, robit, and cloglog are available!\n")
}
else if (family=="poisson") {
if (link=="log") log(x)
else cat("link function not recognized for ", family, " only log is available!\n" )
}
else if (family=="Gamma") {
if (link=="inverse") 1/x
else cat("link function not recognized for ", family, " only inverse is available!\n" )
}
else if (family=="inverse.gaussian"){
if (link=="1/mu^2") 1/(x^2)
else cat("link function not recognized for ", family, " only invsquare is available!\n" )
}
else if (family=="negative.binomial") {
if (link=="log") log(x)
else cat("link function not recognized for ", family, " only log is available!\n" )
}
}
#' variance function for glm families in gpls models
#'
#' @param x input
#' @param family family
#' @param link link
#'
#' @export .variance_gpls
#' @return output
#' @examples
#' .variance_gpls(x, family="gaussian", link="identity")
#' @keywords internal
.variance_gpls <- function(x, family="gaussian", link="identity") {
# variance function
if (family=="gaussian"){
if (link=="identity") rep(1,length(x))
else cat("link function not recognized for ", family, " only identity is available!\n")
}
else if (family=="binomial") {
if (link=="logit") exp(x)/(1+exp(x))^2
else if (link=="probit") pnorm(x)*(1-pnorm(x))
else if (link=="cloglog") exp(-exp(x))*(1-exp(-exp(x)))
else if (link=="cauchit") pcauchy(x)*(1-pcauchy(x))
else if (link=="robit") pt(x, df = 3)*(1-pt(x, df = 3))
else cat("link function not recognized for ", family, " only logit, probit, cauchit, robit, and cloglog are available!\n")
}
else if (family=="poisson") {
if (link=="log") exp(x)
else cat("link function not recognized for ", family, " only log is available!\n" )
}
else if (family=="Gamma") {
if (link=="inverse") x^2
else cat("link function not recognized for ", family, " only inverse is available!\n" )
}
else if (family=="inverse.gaussian"){
if (link=="1/mu^2") x^3
else cat("link function not recognized for ", family, " only invsquare is available!\n" )
}
else if (family=="negative.binomial") {
if (link=="log") {
#theta <- uniroot(function(mu,t,y)(mu + (mu^2 / t) - y ), lower = -1e10, upper = 1e10, y = var(exp(x)), mu = mean(exp(x)))$root
#theta <- MASS::theta.mm(x, mu = rep(mean(x), length(x)), dfr = length(x) - 1)
theta.est <- function(x){
x <- round(x, 0)
dnegative.binomial <- function (x, mu = 1, theta = 1, log = FALSE) {
if (any(mu <= 0))
stop(paste("mu must be greater than 0 ", "\n", ""))
if (any(theta <= 0))
stop(paste("theta must be greater than 0 ", "\n", ""))
if (length(theta) > 1)
fy <- ifelse(theta > 1e-04, dnbinom(x, size = mu/theta,
mu = mu, log = log), dpois(x = x, lambda = mu, log = log))
else fy <- if (theta < 1e-04)
dpois(x = x, lambda = mu, log = log)
else dnbinom(x, size = mu/theta, mu = mu, log = log)
fy
}
f <- function(theta) sum(dnegative.binomial(x, mean(x), theta, log = T))
optimize(f, lower = 1, upper = 1000, maximum = TRUE)$maximum
}
theta <- theta.est(x)
exp(x) + exp(x)^2/theta
}
else cat("link function not recognized for ", family, " only log is available!\n" )
}
}
#' initial value utility function for gpls models
#'
#' @param x input
#' @param family glm family
#' @return initial values
#' @examples
#' .yinitfunc(x, family="gaussian")
#' @keywords internal
.yinitfunc <- function(x, family="gaussian") {
# initial value for dependent variable
if (family=="binomial") return((x+0.50) * 0.50)
else if (family=="poisson") return(log(x + 0.01))
else if (family=="negative.binomial") return(log(x + 0.01))
else if (family=="Gamma") return(1/x)
else if (family=="inverse.gaussian") return(1/(x^2))
else if (family=="gaussian") return(x)
else return(x)
}
#' utility function for calculating probabilities in binomial gpls models
#'
#' @param y input
#' @param eta linear predictor
#' @return probabilities
#' @keywords internal
#' @examples
#' .probcalc(y, eta)
#'
.probcalc <- function(y, eta) {
p <- 0
for ( i in 1:length(y))
p[i] <- ifelse(y[i]==1, exp(eta[i])/(1+exp(eta[i])) ,1/(1+exp(eta[i])))
p
}
#' gpls function for internal use
#'
#' fits a gpls object
#'
#' @param X model matrix
#' @param y response
#' @param ncomp number of components
#' @param family glm family
#' @param link glm link
#' @param eps convergence tolerance
#' @param maxit max iterations of iteratively reweighted least squares algorithm
#' @param denom.eps tolerance for small numbers
#' @param firth should Firth's bias correction be applied? defaults to FALSE.
#' @keywords internal
gpls.default <- function(X, y, ncomp = NULL, eps=1e-3, maxit=1000, denom.eps=1e-20, family=NULL, link=NULL, firth=FALSE){
testInteger <- function(x){
test <- all.equal(x, as.integer(x), check.attributes = FALSE)
if(test == TRUE){ return(TRUE) }
else { return(FALSE) }
}
if (is.null(family)){
if (all(y >= 0)){
if (testInteger(y)){
if (length(unique(y)) == 2){
family <- "binomial"
}
else {
family <- "poisson"
}
}
else {
family <- "Gamma"
}
}
else {
family <- "gaussian"
}
}
if (is.null(link)) {
if (family=="gaussian") link <- "identity"
else if (family=="binomial") link <- "logit"
else if (family=="poisson") link <- "log"
else if (family=="negative.binomial") link <- "log"
else if (family=="Gamma") link <- "inverse"
else if (family=="inverse.gaussian") link <- "1/mu^2"
else stop("unknown family", family)
}
X <- as.matrix(X)
dx <- dim(X)
if (family == "binomial"){
if (is.factor(y)) {
levs = levels(y)
if( length(levs) > 2 )
levs = c(levs[1], "Other")
y <- y != levels(y)[1]
}
else levs = unique(y)
if(length(levs) > 2)
warning("y has more than two levels")
if (is.factor(y)){
y <- as.numeric(y) - 1
}
if (is.numeric(y)){
maxy <- max(y)
miny <- min(y)
y[which(y > miny)] <- 1
y[which(y == miny)] <- 0
}
if (is.character(y)){
y <- as.numeric(as.factor(y)) - 1
}
if (any(y < 0 | y > 1))
stop("y values must be 0 <= y <= 1")
}
### number of PLS components
if (is.null(ncomp)) K <- min(dx[1]-2,dx[2])
else if (ncomp > min(dx[1]-1,dx[2])){
cat("Specified number of components exceeds rank of X. \n", "Setting the number of components to ", min(dx[1]-1,dx[2]),"!\n");
K <- min(dx[1]-1,dx[2])
}
else{
K <- ncomp
}
### initialzing weights for PLS regression
## weight matrix for PLS regression
W<- matrix(0,dx[2],K)
## score matrix
Ta <- matrix(0,dx[1],K)
## loading matrix for X
P <- matrix(0,dx[2],K)
## loading matrix for y
Q <- numeric(K)
## initial predictor matrix
E <- X
## initial linear predictor
ystar <- cvreg:::.yinitfunc(y, family)
ystar0 <- ystar
## Weight matrix for glm
V <- diag(c(.mueta_gpls(ystar, family,link)^2/.variance_gpls(ystar, family, link)))
## standardized weights for predictor matrix
E <- t(E)-apply(E,2,weighted.mean,diag(V))
E <- t(E)
## linear predictor
eta <- rep(0,length(y))
eta.old <- eta+10
## setup initial conditions
Q <- rep(0,K)
Q.old <- rep(100,K)
K.old <- K
min <- 1000
beta <- rep(1,ncol(X))
beta.old <- beta/1000
l <- 0; converged <- F
while ((max(abs(beta-beta.old)/(abs(beta.old)+denom.eps)) > eps) & (l < maxit)) {
if ((max(abs(beta-beta.old)/(abs(beta.old)+denom.eps)) < min))
{
if(l==1)
{
W.min <- W
P.min <- P
Q.min <- Q
}
else
{
## record minimum values
min <- max(abs(beta-beta.old)/(abs(beta.old)+denom.eps))
W.min <- W
P.min <- P
Q.min <- Q
eta.min <- eta
l.min <- l
}
}
K.old <- K
l <- l + 1
W <- matrix(0,dx[2],K)
Ta <- matrix(0,dx[1],K)
P <- matrix(0,dx[2],K)
Q <- numeric(K)
### PLS regression
for ( i in 1:K) {
w <- crossprod(E,V)%*%ystar
w <- w/sqrt(crossprod(w)[1])
W[,i] <- w
ta <- E%*%w
Ta[,i] <- ta
taa <- (crossprod(ta,V)%*%ta)[1]
p <- crossprod(E,V)%*%ta/taa
P[,i] <- p
q <- (crossprod(ystar,V)%*%ta/taa)[1]
Q[i] <- q
ystar <- ystar - q*ta
E <- E - tcrossprod(ta,p)
}
## update beta
temp <- pseudoinverse(crossprod(P,W))
if(any(is.na(temp))){
break;
}
else{
beta.old <- beta
beta <- W%*%temp%*%Q
}
## Hat matrix
H <- hat(sweep(cbind(rep(1,dx[1]),X),1,sqrt(diag(V)),"*"),intercept=FALSE)
## update eta (linear predictor)
eta <- weighted.mean(ystar0,diag(V)) + Ta%*%Q
## update and rescaling weight matrix
V <- diag(c(.mueta_gpls(eta, family,link)^2/.variance_gpls(eta, family,link)))
V <- V*(H*firth+1)
## diagnosis for divergence
if(sum(diag(V)=="NaN") > 0){break;}
if (sum(round(cvreg:::.probcalc(y,eta),4)>=0.9999)==length(y)){break;}
## update estimate of linear predictor
ystar <- eta + diag(c(1/.mueta_gpls(eta, family,link)))%*%(y+H*firth/2 - (H*firth+1)*.linkinv_gpls(eta, family,link))/(H*firth +1)
ystar0 <- ystar
## update predictor matrix
E <- t(X)-apply(X,2,weighted.mean,diag(V))
E <- t(E)
## update hat matrix
if (firth){Hat <- H*firth}else{Hat <- H}
}
if (max(abs(beta-beta.old)/(abs(beta.old)+denom.eps)) > eps){
W <- W.min; P <- P.min; Q <- Q.min; eta <- eta.min
}
else{
converged <- T
}
## final estimates
beta <- W%*%pseudoinverse(crossprod(P,W))%*%Q
beta0 <- mean(drop(eta)-drop(X%*%beta))
## assign names to the coefficients
coef = c(beta0,beta)
dnx = dimnames(X)[[2]]
if(length(dnx) != length(beta))
dnx = paste("X", 1:length(beta), sep=":")
names(coef) = c("(Intercept)", dnx)
colnames(P) <- c(paste0("Factor.", 1:ncol(P)))
rownames(P) <- names(coef)[-1]
colnames(Ta) <- c(paste0("Factor.", 1:ncol(Ta)))
if (family == "binomial"){
ans = list(coefficients=coef,
convergence = converged,
niter = l,
family = family,
link = link,
ncomp = K,
levs = levs,
firth = firth,
loadings = P,
scores = Ta,
linear.predictor = as.vector(eta),
fitted = as.vector(.linkinv_gpls(eta, family, link)),
y_obs = y,
hat = Hat,
irwls.wts = V,
orthdist = cvreg:::.orthdist(X, P),
scoredist = cvreg:::.scoredist(Ta, apply(Ta, 2, var))
)
}
else if (family == "negative.binomial") {
theta.est <- function(x){
x <- round(x, 0)
dnegative.binomial <- function (x, mu = 1, theta = 1, log = FALSE) {
if (any(mu <= 0))
stop(paste("mu must be greater than 0 ", "\n", ""))
if (any(theta <= 0))
stop(paste("theta must be greater than 0 ", "\n", ""))
if (length(theta) > 1)
fy <- ifelse(theta > 1e-04, dnbinom(x, size = mu/theta,
mu = mu, log = log), dpois(x = x, lambda = mu, log = log))
else fy <- if (theta < 1e-04)
dpois(x = x, lambda = mu, log = log)
else dnbinom(x, size = mu/theta, mu = mu, log = log)
fy
}
f <- function(theta) sum(dnegative.binomial(x, mean(x), theta, log = T))
optimize(f, lower = 1, upper = 1000, maximum = TRUE)$maximum
}
theta <- theta.est(as.vector(.linkinv_gpls(eta, family, link)))
ans = list(coefficients=coef,
convergence = converged,
niter = l,
ncomp = K,
family = family,
link = link,
firth = firth,
loadings = P,
scores = Ta,
theta = theta,
linear.predictor = as.vector(eta),
fitted = as.vector(.linkinv_gpls(eta, family, link)),
y_obs = y,
hat = Hat,
irwls.wts = V,
orthdist = cvreg:::.orthdist(X, P),
scoredist = cvreg:::.scoredist(Ta, apply(Ta, 2, var))
)
}
else {
ans = list(coefficients=coef,
convergence = converged,
niter = l,
ncomp = K,
family = family,
link = link,
firth = firth,
loadings = P,
scores = Ta,
linear.predictor = as.vector(eta),
fitted = as.vector(.linkinv_gpls(eta, family, link)),
y_obs = y,
hat = Hat,
irwls.wts = V,
orthdist = cvreg:::.orthdist(X, P),
scoredist = cvreg:::.scoredist(Ta, apply(Ta, 2, var))
)
}
class(ans) = "gpls"
ans
}
#' Projection to Latent Structues for Generalized Linear Models
#'
#' @param formula model formula
#' @param data a data frame
#' @param ncomp number of components to retain
#' @param eps tolerance
#' @param maxit max iter
#' @param denom.eps tolerance value for denominator to consider a number as zero
#' @param family "gaussian", "poisson", "negative.binomial", "binomial", "multinom", "Gamma", "inverse.gaussian"
#' @param link the link function. see details for available options.
#' @param firth should Firth's bias correction be applied? defaults to FALSE.
#' @param contrasts model contrasts
#' @param ... other
#'
#' @return a gpls object
#' @export gpls
#'
#' @details This function implements what is often called partial least squares for generalized
#' linear models. However, Swedish statisticians Herman Wold and Svante Wold, who invented
#' the method, maintain that the proper name is projection to latent structures. This name is
#' used here because it would be improper to call generalized linear models by the name
#' least squares. As the name implies, PLS works by projecting the predictor matrix to a lower
#' dimensional subspace comprised of latent factors (in the sense of factor analysis, not categorical variables).
#' Essentially, this can save an analytic step. Instead of asking "what factors underlie my variables",
#' and then using the factor scores as predictors, PLS directly answers the question of "what latent factors explain
#' my outcome variable." The returned regression coefficients correspond to the original set of explanatory
#' variables, facilitating inference about the original variables if being used for reasons other than
#' a factor analytic method. \cr
#' \cr
#' PLS regression is useful for a variety of circumstances. These include the following:
#'
#' \itemize{
#' \item multicollinear variables.
#' \item variables believed to be measures of an underlying latent factor (which typically entails multicollinearity)
#' \item regression problems where there are more predictor variables than observations (deficient rank)
#' \item minimizing prediction error in a manner similar to ridge regression
#' \item recovering a set of factors that explain an outcome, a sort of "supervised factor analysis".
#' }
#' \cr
#' \cr
#' Several likelihood functions are implemented here. These include the gaussian, poisson, binomial,
#' gamma, inverse gaussian, and negative binomial distributions. The gaussian distribution is naturally
#' ideal for continuous data. The binomial distribution is utilized for binary outcomes, while the
#' multinomial distribution can model multiple outcomes. The poisson and negative binomial distributions
#' are appropriate for integer count data, with the negative binomial being well suited for overdispersed
#' counts. The gamma distribution and inverse gaussian distributions are appropriate for continuous data
#' with positive support, with the gamma assuming a constant coefficient of variation and the inverse
#' gaussian being suitable for heteroskedastic and/or highly skewed data. \cr
#' \cr
#' The following link functions are available for each distribution: \cr
#' \cr
#' Gaussian: "identity" \cr
#' Binomial & Multinomial: "logit", "probit", "cauchit", "robit" (Student T with 3 df), and "cloglog" \cr
#' Poisson & Negative Binomial: "log" \cr
#' Gamma: "inverse" (1 / x) \cr
#' Inverse Gaussian: "1/mu^2" (1/x^2)
#'
#'
#' @examples
#' gpls(y ~ ., data)
#' @export
#' @return a gpls object containing the model fit, factor loadings, linear predictors, fitted values, and an assortment of other things.
#'
gpls.formula = function(formula, data, ncomp =NULL, eps=1e-3, maxit=100, denom.eps=1e-20, family = NULL, link = NULL, firth=FALSE, contrasts=NULL, ...) {
mf = match.call()
m = match(c("formula", "data"), names(mf), 0)
mf = mf[c(1,m)]
mf[[1]] = as.name("model.frame")
mf = eval(mf, parent.frame())
mt = attr(mf, "terms")
y = model.response(mf)
x = if( !is.empty.model(mt))
model.matrix(mt, mf, contrasts)
else matrix(1, NROW(y), 0)
xint = match("(Intercept)", colnames(x), nomatch=0 )
if(xint > 0 )
x <- x[, -xint, drop=FALSE]
testInteger <- function(x){
test <- all.equal(x, as.integer(x), check.attributes = FALSE)
if(test == TRUE){ return(TRUE) }
else { return(FALSE) }
}
if (is.null(family)){
if (is.factor(y)){
if (length(unique(levels(y))) == 2){
family <- "binomial"
}
else if (length(unique(levels(y))) >=3){
family <- "multinom"
}
} else if (all(y >= 0)){
if (testInteger(y)){
if (length(unique(y)) == 2){
family <- "binomial"
}
else {
family <- "poisson"
}
}
else {
family <- "Gamma"
}
}
else {
family <- "gaussian"
}
}
if (is.null(link)) {
if (family=="gaussian") link <- "identity"
else if (family=="binomial") link <- "logit"
else if (family=="poisson") link <- "log"
else if (family=="negative.binomial") link <- "log"
else if (family=="Gamma") link <- "inverse"
else if (family=="inverse.gaussian") link <- "1/mu^2"
else if (family=="multinom") link <- "robit"
else stop("unknown family", family)
}
if (family == "multinom"){
ans = cvreg:::gplsSoftmax(x, y, ncomp , eps, maxit, denom.eps, link, firth)
} else {
ans = cvreg:::gpls.default(x, y, ncomp , eps, maxit, denom.eps, family, link, firth)
}
if (family != "multinom"){
newlink <- switch(link,
"logit" = "logit",
"probit" = "probit",
"robit" = "cauchit",
"cauchit" = "cauchit",
"cloglog" = "cloglog")
glmfamily <- switch(ans$family,
"gaussian" = gaussian(link = "identity"),
"binomial" = quasibinomial(link = newlink),
"poisson" = poisson(link = "log"),
"Gamma" = Gamma(link = "inverse"),
"negative.binomial" = quasipoisson(link = "log"),
"inverse.gaussian" = inverse.gaussian(link = "1/mu^2")
)
glmfit <- suppressMessages(suppressWarnings(glm(y ~ ., cbind.data.frame(y = ans$y_obs, ans$scores), family = glmfamily)))
ans$pearson.residuals <- residuals(glmfit, type = "pearson")
ans$deviance.residuals <- cvreg:::dev.res(ans$y_obs, ans$fitted, wt = NULL, family = family, ncomp = ncomp)
}
ans$terms = mt
ans$call = match.call()
ans$model.frame <- model.frame(formula, data)
ans$model.matrix <- model.matrix(formula, data)
ans$formula <- formula
ans$link <- link
return(ans)
}
#' Predict method for gpls models
#'
#' @param model the model
#' @param newdata the new data
#' @param type either "link" (untransformed linear predictor), "response" (transformed prediction),
#' or "class" (for binomial models only)
#' @param se.fit should the standard errors of prediction also be returned
#'
#' @return none
#' @method predict gpls
#' @export
#' @keywords internal
predict.gpls <- function(object, newdata = NULL, type = c("response", "link", "class"), se.fit = FALSE){
if (!inherits(object, "gpls"))
stop("object not of class gpls")
tt <- terms(object)
type <- match.arg(type)
family <- object$family
link <- object$link
formula <- object$formula
coefs <- coef(object)
if (is.null(newdata)){
Newdata <- NULL
} else{
if (!is.data.frame(newdata)){
newdata <- as.data.frame(newdata)
}
ox <- object$model.matrix[,-1]
Newdata <- model.matrix(formula, newdata)[,-1]
Newdata <- base::scale(Newdata, colMeans(ox), rep(1, ncol(ox))) %*% object$loadings
Newdata <- as.data.frame(Newdata)
colnames(Newdata) <- paste0("FACTOR", 1:ncol(object$scores))
}
if (family == "binomial"){
newlink <- switch(link,
"logit" = "logit",
"probit" = "probit",
"robit" = "cauchit",
"cauchit" = "cauchit",
"cloglog" = "cloglog")
}
newfamily <- switch(family,
"gaussian" = gaussian(link = "identity"),
"binomial" = binomial(link = newlink),
"poisson" = poisson(link = "log"),
"Gamma" = Gamma(link = "inverse"),
"negative.binomial" = quasipoisson(link = "log"),
"inverse.gaussian" = inverse.gaussian(link = "1/mu^2")
)
dat <- cbind.data.frame(y = object$y_obs, object$scores)
colnames(dat) <- c("y", paste0("FACTOR", 1:ncol(object$scores)))
glmfit <- glm(y ~ ., dat, family = newfamily)
if (type == "class"){
if (family != "binomial"){
stop("type 'predict' is only valid for predicting class labels for binomial models.")
}
else {
preds <- predict(glmfit, newdata = Newdata, type = "response", se.fit = FALSE)
levs <- object$levs
classification <- preds > 0.50
labels <- ifelse(classification, levs[1], levs[2])
return(labels)
}
}
else if (type == "link"){
preds <- predict(glmfit, newdata = Newdata, type = "link", se.fit = se.fit)
}
else if (type == "response"){
preds <- predict(glmfit, newdata = Newdata, type = "response", se.fit = se.fit)
}
return(preds)
}
#' Summary method for gpls models
#'
#' @param model a fitted model object
#' @param se.type One of the robust standard error methods described in \link[cvreg]{vcovHC.gpls}. This defaults
#' to the HC4m method, which I have found to give the most accurate results. Note that typical standard errors are
#' not available for partial least squares. Another option to HC standard errors is to utilize the boot package to bootstrap
#' standard errors.
#' @method summary gpls
#' @return a summary
#' @export
#' @keywords internal
summary.gpls <- function(model, se.type = "HC4m"){
purple = crayon::make_style(rgb(0.565, 0, 0.769))
cat(purple("Call:\n"), deparse(model$call), "\n", sep = "")
cat(crayon::blue("\nLatent Variables: "), deparse(model$ncomp), "\n", sep = "")
cat(crayon::green("\nFamily: "), noquote(model$family), "(link = ", noquote(model$link), ")", "\n", sep = "")
std.err = sqrt(diag(vcovHC(model, type = se.type)))
results = cbind.data.frame(coefs = round(model$coefficients, 3), round(std.err, 4), round(model$coefficients/std.err, 3), round(2 * pnorm(abs(model$coefficients/std.err), lower.tail = F), 5))
colnames(results) <- c("Estimate", "Std. Error", "t value", "Pr(>|t|)")
orange = crayon::make_style(rgb(1, 0.314, 0))
cat(orange("\nCoefficients: \n\n"))
return(results)
}
#' Print method for gpls models
#'
#' @param x the model
#' @param digits digits to round
#' @method print gpls
#' @return none
#' @export
#' @keywords internal
print.gpls = function(x, digits = 3) {
cat("Call:\n", deparse(x$call), "\n", sep = "")
cat(crayon::blue("\nLatent Variables: "), deparse(x$ncomp), "\n", sep = "")
cat(crayon::green("\nFamily: "), noquote(x$family), "(link = ", noquote(x$link), ")", "\n", sep = "")
if (length(coef(x))) {
cat(crayon::magenta("\nCoefficients:\n"))
coef <- round(zapsmall(x$coefficients, 18), digits)
print.default(coef, print.gap = 1, quote = FALSE, digits = digits, )
}
if (x$family == "negative.binomial"){
purpletext <- crayon::make_style(rgb(.51, .098, .635, 0.95), bg = FALSE)
cat(purpletext("\n\nNegative Binomial Excess Dispersion", " : ", round(noquote(x$theta), digits), "\n"))
}
invisible(x)
}
#' gpls function for internal use
#'
#' fits a gpls softmax object
#'
#' @param x model matrix
#' @param y rsponse
#' @param ncomp number of components
#' @param family glm family
#' @param link glm link
#' @param eps convergence tolerance
#' @param maxit max iterations of iteratively reweighted least squares algorithm
#' @param denom.eps tolerance for small numbers
#' @param shrinkage should Firth's bias correction be applied? defaults to FALSE.#'
#' @keywords internal
gplsSoftmax <- function(x, y, ncomp =NULL, eps=1e-3, maxit=100, denom.eps=1e-20, family = "binomial", link, firth=T){
family <- "binomial"
names <- colnames(x)
if (is.numeric(y)){
y.levs <- seq(min(y), max(y), len = length(unique(y)))
all.levs <- y.levs
all.levs <- as.character(all.levs)
y.levs <- y.levs[-1]
y.levs <- as.character(y.levs)
}
if (is.factor(y)){
all.levs <- levels(y)
y.levs <- levels(y)[-1]
y <- as.numeric(y)
}
if (is.character(y)){
y <- as.factor(y)
all.levs <- levels(y)
y.levs <- levels(y)[-1]
y <- as.numeric(y)
}
if(any(x[,1] !=1))
x <- cbind(rep(1,nrow(x)),x)
x <- as.matrix(x)
yv <- as.numeric(as.vector(y))
r <- ncol(x)
C <- max(yv)-1
n <- nrow(x)
y <- matrix(0,n,C+1)
y[cbind(seq(n),yv)] <- 1
y0 <- y[,1]
y <- y[,-1]
jn <- rep(1,n)
jC <- rep(1,C)
y2 <- as.vector(t(y))
x2 <- t(kronecker(rep(1,C),x))
dim(x2) <- c(r,n,C)
x2 <- aperm(x2,c(1,3,2))
dim(x2) <- c(r,n*C)
x2 <- t(x2)
X <- matrix(0,nrow=C*n,ncol=C*r)
for ( i in 1:n)
{
for (j in 1:C)
{
X[((i-1)*C+j),(((j-1)*r+1):((j-1)*r+r))] <- x[i,]
}
}
dx <- dim(X)
if (is.null(ncomp )) K <- min(dx[1]-1,dx[2])
else if ( ncomp > min(dx[1]-1,dx[2]) )
{
cat("Specified number of components exceeds rank of X. \n", "Setting the number of components to ", min(dx[1]-1,dx[2]),"!\n"); break
}
else
{
K <- ncomp
}
### initialzing matrices for PLS regression
## weight matrix for PLS regression
W<- matrix(0,dx[2],K)
## score matrix
Ta <- matrix(0,dx[1],K)
## loading matrix for X
P <- matrix(0,dx[2],K)
## loading matrix for y
Q <- numeric(K)
## initial predictor matrix
E <- X
## Weight matrix for GLM
V <- diag(abs(rst(C*n, nu = 12)), C*n)
## initial linear predictor
ystar <- cvreg:::.yinitfunc(y2, family="binomial")
ystar0 <- ystar
eta.old <- rep(1,length(y2))
eta <- eta.old+100
## set starting point for iterations
Q <- rep(0,K)
Q.old <- rep(100,K)
K.old <- K
min <- 1000
beta <- rep(1,C*r)
beta.old <- beta/1000
converged <- F
l <- 0
l.min <- 1
while ((max(abs(beta-beta.old)/(abs(beta.old)+denom.eps)) > eps) & (l < maxit)){
if ((max(abs(beta-beta.old)/(abs(beta.old)+denom.eps)) < min)){
if(l==1){
W.min <- W; P.min <- P; Q.min <- Q
}
else{
## record minimum values
min <- max(abs(beta-beta.old)/(abs(beta.old)+denom.eps))
W.min <- W
P.min <- P
Q.min <- Q
eta.min <- eta
l.min <- l
}
}
K.old <- K
l <- l + 1
W <- matrix(0,dx[2],K)
Ta <- matrix(0,dx[1],K)
P <- matrix(0,dx[2],K)
Q <- numeric(K)
### PLS regression
for ( i in 1:K) {
w <- t(E)%*%V%*%ystar
w <- w/sqrt(crossprod(w)[1])
W[,i] <- w
ta <- E%*%w
Ta[,i] <- ta
taa <- (t(ta)%*%V%*%ta)[1]
p <- t(E)%*%V%*%ta/taa
P[,i] <- p
q <- (t(ystar)%*%V%*%ta/taa)[1]
Q[i] <- q
ystar <- ystar - q*ta
E <- E - ta%*%t(p)
}
## update beta
temp <- pseudoinverse(t(P)%*%W)
if(any(is.na(temp))){
break;
}
else{
beta.old <- beta
beta <- W%*%temp%*%Q
}
## Hat matrix
H <- V%*%X%*%pseudoinverse(t(X)%*%V%*%X)%*%t(X)
## update eta (linear predictor)
eta <- as.vector(t(x%*%matrix(beta,ncol=C,byrow=F)))
p <- exp(x%*%matrix(beta,ncol=C ,byrow=F))
den.p <- 1+as.vector(p%*%jC)
if(any(is.infinite(den.p))){break;}
p <- p2 <- p/den.p
dim(p2) <- c(n,C)
p2 <- as.vector(t(p2))
## update and rescale weight matrix
V <- matrix(0, C*n, C*n)
for(j in seq(n)) {
V[((j - 1) * C + 1):(j * C), ((j - 1) * C + 1):(j * C)] <- diag(p[j,],length(p[j,]))-matrix(p[j,],nrow=C,ncol=1)%*%matrix(p[j,],nrow=1,ncol=C)
}
detadmu <- V
for ( j in seq(n)){
for (i in ((j - 1) * C + 1):(j * C))
for (k in (i:(j*C))){
sum <- sum(p2[((j-1)*C+1):(j*C)])
if(i==k){
detadmu[i,k] <- (1-sum+p2[k])/(p2[k]*(1-sum))
}
else{
detadmu[i,k] <- 1/(1-sum)
detadmu[k,i] <- detadmu[i,k]
}
}
}
Hw <- NULL
for ( j in seq(n)){
sum <- sum(diag(H)[((j - 1) * C + 1):(j * C)])
for (i in ((j - 1) * C + 1):(j * C)){
Hw[i] <- sum + diag(H)[i]
}
}
## update estimate of linear predictor
if(any(is.na(detadmu))){break;}
ystar <- eta + detadmu%*%(y2+diag(H)*firth/2-(Hw*firth/2+1)*p2)/(Hw*firth/2+1)
ystar0 <- ystar
V <- V*(Hw*firth/2+1)
## update predictor matrix
E <- X
}
if (max(abs(beta-beta.old)/(abs(beta.old)+denom.eps)) > eps){
W <- W.min
P <- P.min
Q <- Q.min
}
else{
converged <- T
}
## final estimates
beta <- W%*%pseudoinverse(t(P)%*%W)%*%Q
beta <- matrix(beta,ncol=C ,byrow=F)
rownames(beta) <- c("(Intercept)", names)
colnames(beta) <- y.levs
colnames(P) <- c(paste0("Factor.", 1:ncol(P)))
colnames(Ta) <- c(paste0("Factor.", 1:ncol(Ta)))
out <- list(coefficients=beta,
convergence = converged,
niter = l,
firth = firth,
X = x,
loadings = P,
scores = Ta,
family = "multinom",
link = link,
ncomp = K,
irwls.wts = W,
outcome.levs = all.levs)
return(structure(out, class = "gplsSoftmax"))
}
#' Predict method for softmax (multinomial) gpls models
#'
#' @param model the model
#' @param newdata the new data
#' @param type one of "response" to return probabilities, or "class" to return class labels
#'
#' @return none
#' @method predict gplsSoftmax
#' @export
#' @keywords internal
predict.gplsSoftmax <- function(model, newdata = NULL, type = c("response", "class")) {
type <- match.arg(type)
if (is.null(newdata)){
X <- model$X
}
beta <- model$coefficients
if(all(X[,1] == rep(1,nrow(X))))
{
eta <- X%*%beta
} else
{
eta <- cbind(rep(1,nrow(X)),X)%*% beta
}
p <- exp(eta)
p.denom <- apply(p, 1, function(x) 1+sum(x))
probabilities <- p/p.denom
probabilities <- cbind(1 - rowSums(probabilities), probabilities)
colnames(probabilities) <- model$outcome.levs
if (type == "response")
return(probabilities)
else
return(model$outcome.levs[sapply(1:nrow(probabilities), function(x) which.max(probabilities[x, ]))])
}
#' Print method for gplsSoftmax models
#'
#' @param x the model
#' @param digits digits to round
#'
#' @return none
#' @method print gplsSoftmax
#' @export
#' @examples print(fit)
#' @keywords internal
print.gplsSoftmax = function(x, digits = max(3, getOption("digits") - 3)){
cat("Call:\n", deparse(x$call), "\n", sep = "")
cat("\nLatent Variables: ", deparse(x$ncomp), "\n", sep = "")
cat("\nFamily: ", "multinom(link = ", noquote(x$link), ")", "\n\n", sep = "")
if (length(coef(x))) {
cat("Coefficients:\n")
print.default(x$coefficients, print.gap = 2, quote = FALSE)
}
invisible(x)
}
#' Summary method for gplsSoftmax models
#'
#' @param x the model
#' @param digits digits to round
#'
#' @return none
#' @method summary gplsSoftmax
#' @export
#' @examples summary(fit)
#' @keywords internal
summary.gplsSoftmax = function(x, digits = 3){
cat("Call:\n", deparse(x$call), "\n", sep = "")
cat("\nLatent Variables: ", deparse(x$ncomp), "\n", sep = "")
cat("\nFamily: ", "multinom(link = ", noquote(x$link), ")", "\n\n", sep = "")
if (length(coef(x))) {
cat("Coefficients:\n")
print.default(x$coefficients, print.gap = 2, quote = FALSE)
}
invisible(x)
}
#' Sparse Generalized Projection to Latent Structures
#'
#' @description Sparse projection to latent structures (partial least squares)
#' is an extension of PLS that takes advantage of L1 regularization via a
#' LARS-like algorithm to select predictors. Predictors which do not load highly
#' on any of the latent variables get dropped from the model, and the corresponding
#' regression estimates are shrunk to zero. Here sparse PLS is extended to include
#' generalized linear models. Only univariate outcomes are supported here.
#'
#' @param formula model formula
#' @param data a data frame
#' @param ncomp number of components to retain
#' @param lambda a regularization parameter.
#' @param family "gaussian", "poisson", "negative.binomial", "binomial", "Gamma", or "inverse.gaussian"
#' @param link the link function. see details for available options.
#' @return
#' @export
#' @references
#' Chun, H., & Keles, S. (2010). Sparse partial least squares regression for simultaneous dimension reduction and variable selection. Journal of the Royal Statistical Society: Series B (Statistical Methodology), 72(1), 3–25. doi:10.1111/j.1467-9868.2009.00723.x
#'
sgpls <- function(formula, data, ncomp = 2, lambda = 0.01, family = "gaussian", link = "identity", scale = T){
this.call <- match.call()
if (sum(family %in% c("gaussian", "poisson", "negative.binomial", "binomial", "Gamma", "inverse.gaussian"))==0){
stop("Please supply as a string to the argument 'family' one of gaussian, poisson, negative.binomial, binomial, Gamma, or inverse.gaussian")
}
x <- model.matrix(formula, data)
if (all(x[,1]==1)){
xnames <- colnames(x)[-1]
x <- x[,-1]
}
else{
x <- x; xnames <- colnames(x)
}
y <- model.frame(formula, data);
yname <- colnames(y)[1]
y <- model.response(y)
if (family == "binomial"){
if (is.factor(y)) {
levs = levels(y)
if( length(levs) > 2 )
levs = c(levs[1], "Other")
y <- y != levels(y)[1]
}
else levs = unique(y)
if(length(levs) > 2)
stop("y has more than two levels. sparse multinomial PLS not currently supported by this function")
if (is.factor(y)){
y <- as.numeric(y) - 1
}
if (is.numeric(y)){
maxy <- max(y)
miny <- min(y)
y[which(y > miny)] <- 1
y[which(y == miny)] <- 0
}
if (is.character(y)){
y <- as.numeric(as.factor(y)) - 1
}
if (any(y < 0 | y > 1))
stop("y values must be 0 <= y <= 1")
}
K <- ncomp
n <- nrow(x)
p <- ncol(x)
ip <- c(1:p)
y <- as.matrix(y)
q <- ncol(y)
one <- matrix(1,1,n)
# center & scale y & x
if (family=="gaussian"){
if (scale) y <- scale(y)
}
if (scale) {
x <- scale(x)
}
else{
x <- scale(x,T,F)
}
# initilize objects
betahat <- matrix( 0, p, q )
x1 <- x
y1 <- y
new2keeps <- list()
for (k in 1:K){
# define Z
Z <- t(x1) %*% y1
# fit direction vector
thr <- .threshpls(Z, lambda)
keep <- unique( ip[ thr!=0 | betahat[,1]!=0 ] )
new2keep <- ip[ thr!=0 & betahat[,1]==0 ]
xkeep <- x[,keep,drop=FALSE]
plsfit <- gpls(y ~ ., cbind.data.frame(y=y,xkeep),ncomp=min(k,length(keep)),family=family,link=link)
# update
betahat <- matrix( 0, p, q )
betahat[keep,] <- matrix(coef(plsfit)[-1], length(keep), q)
pj <- plsfit$loadings
pw <- pj %*% tcrossprod(solve(crossprod(pj)), pj)
x1 <- x
x1[,keep] <- x[,keep,drop=FALSE] - x[,keep,drop=FALSE] %*% pw
new2keeps[[k]] <- new2keep
}
coef <- as.vector(betahat); names(coef) <- xnames;
retained.loadings <- pj; retained.x <- x[, which(xnames %in% rownames(pj) == TRUE)]
if (nrow(pj) != p){
keep.names <- rownames(pj)
zero.names <- setdiff(names(coef), keep.names)
pj <- rbind(matrix(0, nrow = length(zero.names) , ncol = ncol(pj)), pj)
rownames(pj) <- c(zero.names, keep.names);
pj <- pj[xnames, ]; wch <- which(rownames(pj) %in% zero.names == TRUE)
retained.loadings <- pj[-wch, ]
}
coef <- c("(Intercept)" = unname(coef(plsfit)[1]), coef)
scores <- retained.x %*% retained.loadings
form <- as.formula(paste(yname, paste0(rownames(retained.loadings), collapse=" + "), sep = " ~ "))
out <- structure(
list(coefficients=coef,
lambda=lambda,
ncomp=K,
loadings=pj,
scores = plsfit$scores,
retained.loadings = retained.loadings,
linear.predictor = plsfit$linear.predictor,
fitted = plsfit$fitted,
pearson.residuals = plsfit$pearson.residuals,
deviance.residuals = plsfit$deviance.residuals,
y_obs = y,
retained.x = retained.x,
family = plsfit$family,
link = plsfit$link,
levs = plsfit$levs,
hat = plsfit$hat,
call = this.call,
formula = form,
model.matrix = model.matrix(form, data),
irwls.weights = plsfit$irwls.weights
)
, class = c("sgpls", "gpls"))
return(out)
}
.threshpls <-function(Z, lambda){
# initialization
p <- nrow(Z)
Znorm1 <- median( abs(Z) )
Z <- Z / Znorm1
b.threshpls <- matrix( 0, length(Z), 1 )
if ( lambda < 1 ){
valb <- abs(Z) - lambda * max( abs(Z) )
b.threshpls[ valb>=0 ] <- valb[ valb>=0 ] * (sign(Z))[ valb>=0 ]
}
return(b.threshpls)
}
#' Calculate deviance residuals for gpls models
#'
#' @param y the observations
#' @param mu the predictions
#' @param wt for internal use only
#' @param family the glm family
#' @param ncomp number of latent variables used in fitting the model
#'
#' @return deviance residuals
#' @keywords internal
#'
dev.res <- function(y, mu, wt = NULL, family, ncomp = NULL){
if (is.null(wt)){
wt <- rep(1, length(y))
}
if (family == "gaussian" ){
d.res <- wt * ((y - mu)^2)
d.res <- ifelse(y > mu, d.res, -d.res)
d.res <- as.vector(d.res)
return(d.res)
}
if (family == "poisson"){
r <- mu * wt
p <- which(y > 0)
r[p] <- (wt * (y * log(y/mu) - (y - mu)))[p]
d.res <- 2 * r
d.res <- ifelse(y > mu, d.res, -d.res)
d.res <- as.vector(d.res)
return(d.res)
}
if (family == "negative.binomial"){
theta.est <- function(x){
x <- round(x, 0)
dnegative.binomial <- function (x, mu = 1, theta = 1, log = FALSE) {
if (any(mu <= 0))
stop(paste("mu must be greater than 0 ", "\n", ""))
if (any(theta <= 0))
stop(paste("theta must be greater than 0 ", "\n", ""))
if (length(theta) > 1)
fy <- ifelse(theta > 1e-04, dnbinom(x, size = mu/theta,
mu = mu, log = log), dpois(x = x, lambda = mu, log = log))
else fy <- if (theta < 1e-04)
dpois(x = x, lambda = mu, log = log)
else dnbinom(x, size = mu/theta, mu = mu, log = log)
fy
}
f <- function(theta) sum(dnegative.binomial(x, mean(x), theta, log = T))
optimize(f, lower = 1, upper = 1000, maximum = TRUE)$maximum
}
theta <- theta.est(mu)
d.res <- 2 * wt * (y * log(pmax(1, y)/mu) - (y + theta) * log((y + theta)/(mu + theta)))
d.res <- ifelse(y > mu, d.res, -d.res)
d.res <- as.vector(d.res)
return(d.res)
}
if (family == "Gamma"){
d.res <- -2 * wt * (log(ifelse(y == 0, 1, y/mu)) - (y - mu)/mu)
d.res <- ifelse(y > mu, d.res, -d.res)
d.res <- as.vector(d.res)
return(d.res)
}
if (family == "inverse.gaussian"){
d.res <- wt * ((y - mu)^2)/(y * mu^2)
d.res <- ifelse(y > mu, d.res, -d.res)
d.res <- as.vector(d.res)
return(d.res)
}
if (family == "binomial"){
d.res <- sqrt(binomial()$dev.res(y = y, mu = mu, wt = wt))
d.res <- ifelse(y > mu, d.res, -d.res)
d.res <- as.vector(d.res)
return(d.res)
}
}
#' Calculate residuals for gpls models
#'
#' @param object the model
#' @param type either deviance or pearson
#' @param wts should typically be left as NULL.
#'
#' @return deviance, pearson, or working residuals
#' @method residuals gpls
#' @export
#' @examples
#' residuals(fit, type = "pearson")
#' @keywords internal
residuals.gpls <- function(object, type = c("pearson", "deviance", "working"), wts = NULL){
type <- match.arg(type)
mu <- object$fitted
y <- object$y_obs
scores <- object$scores
if (is.null(wts)){
wts <- rep(1, length(y))
}
newlink <- switch(object$link,
"logit" = "logit",
"probit" = "probit",
"robit" = "cauchit",
"cauchit" = "cauchit",
"cloglog" = "cloglog")
glmfamily <- switch(object$family,
"gaussian" = gaussian(link = "identity"),
"binomial" = quasibinomial(link = newlink),
"poisson" = poisson(link = "log"),
"Gamma" = Gamma(link = "inverse"),
"negative.binomial" = quasipoisson(link = "log"),
"inverse.gaussian" = inverse.gaussian(link = "1/mu^2")
)
if (type == "pearson"){
fit <- suppressMessages(suppressWarnings(glm(y ~ ., cbind.data.frame(y = y, scores), family = glmfamily)))
resids <- residuals(fit, type = "pearson")
return(resids)
}
if (type == "working"){
fit <- suppressMessages(suppressWarnings(glm(y ~ ., cbind.data.frame(y = y, scores), family = glmfamily)))
resids <- residuals(fit, type = "working")
return(resids)
}
else if (type == "deviance"){
resids <- cvreg:::dev.res(y, mu, wt = wts, family = object$family, ncomp = object$ncomp)
resids <- as.vector(resids)
return(resids)
}
}
#' Confidence Intervals for Model Parameters
#'
#' @param object a gpls fitted object
#' @param conf.level the confidence level required. the default is 95 pct.
#' @param se.type which variant of the Huber-White robust standard errors should be used. Defaults to "HC4m".
#' @param ... other
#' @method confint gpls
#' @return A matrix with columns giving lower and upper confidence limits for each parameter.
#' @export
#' @keywords internal
confint.gpls <- function (object, conf.level = 0.95, se.type = "HC4m", ...) {
tol <- 1e-10
se <- sqrt(diag(vcovHC(object, type = se.type)))
coef <- object$coefficients
p <- length(coef)
qnorm_factory <- function(mu, sigma){
function(p) qnorm(p, mean = mu, sd = sigma)
}
ci <- matrix(0, nrow = p, ncol = 2)
calculate_ci <- function(ICDF, conf.level, tol){
intervalWidth <- function(lowTailPr, ICDF, conf.level , ...) {
ICDF(conf.level + lowTailPr, ...) - ICDF(lowTailPr, ...)
}
optInfo <- optimize(intervalWidth, c(0, 1 - conf.level ), ICDF = ICDF,
conf.level = conf.level , tol = tol, ...)
HDIlowTailPr <- optInfo$minimum
result <- c(lower = ICDF(HDIlowTailPr, ...), upper = ICDF(conf.level +
HDIlowTailPr, ...))
attr(result, "conf.level ") <- conf.level
return(result)
}
for (i in 1:p){
q <- qnorm_factory(mu = coef[i], sigma = se[i])
ci[i, ] <- calculate_ci(q, conf.level, tol)
}
colnames(ci) <- c(
stats:::format.perc((1 - (conf.level)) / 2, 3),
stats:::format.perc(conf.level + (1 - (conf.level)) / 2, 3)
)
rownames(ci) <- names(coef)
return(ci)
}
#' Choose optimal number of latent variables for gpls models
#'
#' @description This evaluates the number of latent variables that leads to the best fit as judged by the AICc, AIC, or BIC metric.
# This is an alternative to cross-validation that can be used to pick a good fit using less computational resources and time.
#'
#' @param formula model formula
#' @param data a data frame
#' @param eps tolerance
#' @param maxit max iter
#' @param denom.eps tolerance value for denominator to consider a number as zero
#' @param family "gaussian", "poisson", "negative.binomial", "binomial", "multinom", "Gamma", "inverse.gaussian"
#' @param link the link function. see details for available options.
#' @param firth should shrinkage be applied? Defaults to FALSE.
#' @param contrasts model contrasts
#' @param criterion one of AICc, AIC, BIC, or BICc.
#' @param plot Should the results be plotted? Defaults to TRUE.
#'
#' @return a numeric value
#' @export
#'
rankest.gpls <- function(formula, data, eps=1e-3, maxit=1000, denom.eps=1e-20, family = NULL, link = NULL, firth=FALSE, contrasts=NULL, criterion = c("AICc", "AIC", "BIC", "BICc"), plot = TRUE){
criterion <- match.arg(criterion)
x = model.matrix(formula, data)[,-1]
score <- rep(0, ncol(x))
for (i in 1:ncol(x)){
if (criterion == "AIC"){
score[i] <- AIC(gpls(formula, data, ncomp = i, eps, maxit, denom.eps, family, link, firth, contrasts), correction = F)
}
else if (criterion == "AICc"){
score[i] <- AIC(gpls(formula, data, ncomp = i, eps, maxit, denom.eps, family, link, firth, contrasts), correction = T)
}
else if (criterion == "BIC"){
score[i] <- BIC(gpls(formula, data, ncomp = i, eps, maxit, denom.eps, family, link, firth, contrasts), correction = F)
}
else if (criterion == "BICc"){
score[i] <- BIC(gpls(formula, data, ncomp = i, eps, maxit, denom.eps, family, link, firth, contrasts), correction = T)
}
}
dat <- data.frame("k variables" = 1:ncol(x), "information.criterion" = score)
colnames(dat) <- c("k variables", criterion)
if (plot){
plot(ggplot(dat, aes_string(x = "`k variables`", y = paste0(criterion, "\n"))) +
geom_point(color = "#268fff", alpha = 0.94, shape = 1) +
geom_line(color = "#268fff", alpha = 0.85, size = 0.65))
}
star <- rep(" ", length(score))
wch <- which.min(dat[,2])
star[wch] <- "*"
dat <- cbind.data.frame(dat, optimal.fit = star)
return(dat)
}
#' plot method for gpls objects
#'
#' @param fit model fit
#' @param type one of "facet", "splom", or "fitted". facet plots each sufficient predictor's
#' relationship with the response. splom plots the same as facet but also the inter-relationships
#' of the sufficient predictors as a scatterplot matrix. fitted plots the relationship between
#' the fitted values of y and the observed values.
#'
#' @return a plot
#' @method plot gpls
#' @export
#' @keywords internal
plot.gpls <- function(fit, type = c("facet", "splom", "fitted")){
type <- match.arg(type)
if (type == "facet"){
dat <- cbind.data.frame(y = fit$y_obs, fit$scores)
dat <- as.data.frame(pivot_longer(dat, -y))
ggplot(dat, aes(x = value, y = y, color = name)) +
facet_wrap(~ name, ncol = ifelse(length(unique(dat$name)) < 4, length(unique(dat$name)), 2)) +
geom_point(shape = 1, alpha = 0.90) +
theme(legend.position = "none")
}
else if (type == "splom"){
dat <- cbind.data.frame(y = fit$y_obs, fit$scores)
scatMat(dat, smooth = F, smoother = F, points.col = "#3293f3BF", pch = 1)
}
else if (type == "fitted"){
dat <- cbind.data.frame(y = fit$y_obs, fit$scores)
ggplot(dat, aes(x = yhat, y = yobs)) +
geom_point(shape = 1, alpha = 0.90, color = "#e70e4e")
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.