R/sdrtools.R

Defines functions .mopwtsqrtinv .mopwtconstant MOP crSpline gpSpline discretize rankest.grpglm rankest.grpols rankest.henv rankest.env hyptest.sdr rankest.sdr print.sdr plot.sdr predict.sdr

Documented in crSpline discretize gpSpline MOP plot.sdr predict.sdr print.sdr rankest.env rankest.grpglm rankest.grpols rankest.henv rankest.sdr

#' predict method for sdr objects
#'
#' @description This is a predict function for any of the sdr models in this package. It can
#' project new data onto the lower dimensional space to obtain a set of sufficient predictors
#' for testing data, or it can be used to obtain linear predictions of the sufficient predictors.
#' For nonlinear models note that you might be better off fitting a generalized additive model (GAM)
#' or generalized additive model for location, scale, and shape (GAMLSS) using the sufficient predictors
#' as covariates to the original response variable. After that, this function can be used to project
#' new testing data onto the lower dimensional space, and then using the predict function for the GAM/GAMLSS
#' to obtain the response variable predictions for the new data.
#'
#' @param fit model fit
#' @param newdata a data frame of new data. Note that the new data frame must contain observations of the response variable.
#' @param type one of "response" (the default, returns the predicted values) or "SP" (returns the new predictions )
#'
#' @return a plot
#' @method predict sdr
#' @export
#'
predict.sdr <- function(fit, newdata = NULL, type = c("response", "SP")){

  type <- match.arg(type)

  if (type == "response"){
    if (is.null(newdata)){
      fitted(fit)
    }
    else{
      newpredictors = as.matrix(newdata) %*% as.matrix(fit$basis)
      colnames(newpredictors) = paste0("SP",1:ncol(newpredictors))
      X = as.matrix(newpredictors)
      form = as.formula(paste0("y ~ 0 + ", paste0("SP", 1:ncol(newpredictors),collapse="+")))
      betas = lmSolve(form, cbind.data.frame(y = fit$fitted, fit$predictors))
      y.pred = as.vector(X %*% betas)
      return(y.pred)
    }
  }
  else if (type == "SP"){
    if (is.null(newdata)){
      fit$predictors
    }
    else{
      as.matrix(newdata) %*% as.matrix(fit$basis)
    }
  }
}


#' plot method for sdr objects
#'
#' @param fit model fit
#' @param type one of "facet", "splom", or "fitted". facet plots each sufficient predictor's
#' relationship with the response. splom plots the same as facet but also the inter-relationships
#' of the sufficient predictors as a scatterplot matrix. fitted plots the relationship between
#' the fitted values of y and the observed values.
#' @param for "facet" and "splom", select which sufficient predictors to plot. If left as NULL, all are plotted.
#' @param smooth should a smooth line be fit to the plots? Defaults to FALSE.
#'
#' @return a plot
#' @method plot sdr
#' @export
#'
plot.sdr <- function(fit, type = c("facet", "splom", "fitted"), wch = NULL, smooth = F){

  type <- match.arg(type)

  if (type == "facet"){
    if (is.null(wch)) wch = 1:ncol(fit$predictors)
    dat <- cbind.data.frame(y = fit$y.obs, fit$predictors[,wch])
    dat <- as.data.frame(tidyr:::pivot_longer(dat, -y))
    dat$name <- factor(dat$name, levels = paste0("SP", 1:length(unique(dat$name))))
    g = ggplot(dat, aes(x = value, y = y, color = name)) +
      facet_wrap(~ name, ncol = ifelse(length(unique(dat$name)) < 4, length(unique(dat$name)), 2), scales="free") +
      geom_point(shape = 1, alpha = 0.95, size = 2) +
      theme(legend.position = "none")

    if (smooth){
      g + geom_line(stat = "smooth", method = RobStatTM::lmrobdetMM, formula = y ~ splines::ns(x, df = 3),
                    alpha = 0.60, size = 1, se = F)
    }
    else{
      g
    }

  }
  else if (type == "splom"){
    if (is.null(wch)) wch = 1:ncol(fit$predictors)
    dat <- cbind.data.frame(y = fit$y.obs, fit$predictors[,wch])
    scatMat(dat, smooth = smooth, smoother = F, points.col = "#3293f3BF", pch = 1)
  }
  else if (type == "fitted"){
    dat <- cbind.data.frame(yobs = fit$y.obs, yhat = fit$fitted)
    g = ggplot(dat, aes(x = yhat, y = yobs)) +
      geom_point(shape = 1, alpha = 0.95, color = "#e70e4e", size = 2)

    if (smooth){
      g + geom_line(stat = "smooth", method = RobStatTM::lmrobdetMM, formula = y ~ splines::ns(x, df = 3),
                    alpha = 0.60, size = 1, se = F)
    }
    else{
      g
    }
  }
}

#' print method for sdr objects
#'
#' @param fit model fit
#'
#' @return text
#' @export
#' @method print sdr
#' @keywords internal
#'
print.sdr <- function(fit){
  orangered <- crayon::make_style(rgb(1, 0.291, 0.018))
  brightblue <- crayon:::make_style(rgb(0.12, 0.76, 0.712))
  if (attr(fit, "model") == "grpOLS" || attr(fit, "model") == "grpGLM"){
    cat(orangered("\nBasis Vectors:"), "\n\n")
    Matrix::printSpMatrix(Matrix::Matrix(round(fit$basis, 4), sparse = TRUE), zero.print = " ", digits = 3, col.names=TRUE)
  }
  else{
     cat(orangered("\nBasis Vectors:"), "\n\n")
     print(round(fit$basis, 4), digits = 4)
  }

  if (attr(fit, "model") == "grpOLS" || attr(fit, "model") == "grpGLM"){
    bink <- crayon::make_style(rgb(0.57,0.39,1))
    cat(bink("\nEstimated Regression Coefficients\n\n"))
    ols = round(signif(as.vector(t(rowSums(as.matrix(fit$coef)))), 4), 3)
    olsnames = abbreviate(rownames(fit$coef),method = "both.sides", named = FALSE, minlength = 5)
    names(ols) = olsnames
    print(ols, digits = 2)
  }
  else if (attr(fit, "model") == "ENV"){
    bink <- crayon::make_style(rgb(0.57,0.39,1))
    cat(bink("\nEstimated Regression Coefficients\n\n"))
    if (is.vector(fit$coef) || ncol(fit$coef) == 1){
      coefs <- as.vector(round(fit$coef, 4))
      names(coefs) <- names(fit$coef)
      print(coefs, digits = 3)
    }
    else{
      coefs <- round(fit$coef, 4)
      print(round(coefs, 4), digits = 3)
    }
  }
  else if (attr(fit, "model") == "HENV"){
    bink <- crayon::make_style(rgb(0.57,0.39,1))
    cat(bink("\nEstimated Regression Coefficients\n\n"))
    print(fit$coef, digits = 3)
    ogreen <- crayon::make_style(rgb(0.714, 0.712, 0.099))
    cat(ogreen("\nEstimated Group Means\n\n"))
    print(fit$means, digits = 3)
  }
  else{
    cat("\n\n", brightblue("Eigenvalues of Estimated Dimension Reduction Subspace:", "\n\n"))
    print(fit$evalues)
  }
}

#' Estimate the true number of dimensions for sdr models
#'
#' @description This offers two different BIC-type statistics for the selection of
#' ranks. The ZMP and LAL variants are offered here (Zhu, Miao, & Peng,
#' 2006; Li, Artemiou, & Li, 2011). Note, however, that despite the
#' BIC-type penalty, these statistics do not include the -2 multiplier. Hence, the
#' largest value is optimal rather than the smallest as in the BIC.
#'
#' @param fit the model fit
#' @param criterion one of "zmp" (the default) or "lal"
#' @param max.rank the maximum rank to consider. defaults to all of them.
#' @param plot should the BIC-curve be plotted? defaults to TRUE.
#'
#' @return a list
#' @export
#'
#' @references
#' Zhu, L., Miao, B., & Peng, H. (2006). On Sliced Inverse Regression With High-Dimensional Covariates. Journal of the American Statistical Association, 101(474), 630–643. doi:10.1198/016214505000001285 \cr
#' \cr
#' Li, B., Artemiou, A., & Li, L. (2011). Principal support vector machines for linear and nonlinear sufficient dimension reduction. The Annals of Statistics, 39(6), 3182–3210. doi:10.1214/11-aos932
#'
rankest.sdr <- function(fit, criterion = c("zmp", "lal"), max.rank = ncol(fit$mf[,-1]), plot = T){

  if (attr(fit, "model") == "ENV" || attr(fit, "model") == "HENV"){
    return(cat(crayon::red("Please use the rankest.env or rankest.henv function.")))
  }
  if (attr(fit, "model") == "grpOLS"){
    return(cat(crayon::red("Please use the rankest.grpols function.")))
  }
  criterion <- match.arg(criterion)
  maximizer=function(x,y) return(x[order(y)[length(y)]])
  n = length(fit$y.obs);
  p = ncol(fit$mf[,-1])
  evals=fit$evalues
  bic=vector()
  if(criterion=="lal"){
    for(k in 0:max.rank){
      if(k==0) {
        bic=c(bic,0)
      }
      else if (k != 0){
        bic=c(bic,sum(evals[1:k])-(2*evals[1])*n^(-1/2)*(log(n))^(1/2)*k)
        names(bic) <- paste0("d=", 0:(length(bic)-1))
      }
    }
    if (plot){
      size = ifelse(length(bic) <= 20, 1, 0.75)
      plot(bic, xlab = "rank", ylab = "lal", xaxt = "n", type = "b", cex = size, lwd = 1.125)
      axis(1, labels = 0:(length(bic)-1), at = 1:length(bic))
      points(x = which.max(bic), y = max(bic), col = "red", pch = 19, cex= size)
    }
    return(list(rank.est = maximizer(0:p,bic), bic.lal = bic))
  }
  if(criterion=="zmp"){
    for(k in 0:(max.rank-1)){
      c1=(evals[1]/3)*(0.5* log(n)+0.1*n^(1/3))/(2*n)
      c2=k*(2*p-k+1)
      ll=sum(log(evals[(k+1):p]+1)-evals[(k+1):p])
      bic=c(bic,ll-c1*c2)
    }
    bic=c(bic,-c1*p*(2*p-p+1))
    names(bic) <- paste0("d=", 0:(length(bic)-1))
    if (plot){
      size = ifelse(length(bic) <= 20, 1, 0.75)
      plot(bic, xlab = "rank", ylab = "zmp", xaxt = "n", type = "b", cex = size, lwd = 1.125)
      axis(1, labels = 0:(length(bic)-1), at = 1:length(bic))
      points(x = which.max(bic), y = max(bic), col = "red", pch = 19, cex = size)
    }
    return(list(rank.est = maximizer(0:p, bic), bic.zmp = bic))
  }
}



hyptest.sdr <- function(object, numdir=ncol(coefficients), ...) {
  ev<-sort(object$evalues)
  p<-length(ev)
  n<-length(fit$y.obs)
  L<-df<-p.value<-0
  nt <- min(p,numdir)
  for (i in 0:(nt-1)){
    L[i+1]<-n*(p-i)*mean(ev[seq(1,p-i)])
    df[i+1]<-(p-i)*sum(object$slices-i-1)
    p.value[i+1]<-1-pchisq(L[i+1], df[i+1])
  }
  out<-data.frame(cbind(round(L,5),df,p.value))
  rr<-paste(0:(nt-1),"R vs >= ",1:nt,"R",sep="")
  dimnames(out)<-list(rr,c("L", "df", "p.value"))
  return(out)
}



#' Estimate dimensions for an ENV model
#'
#' @param formula model formula
#' @param data data frame
#' @param plot whether to plot. defaults to TRUE.
#'
#' @return a list and plot
#' @export
#'
rankest.env <- function(formula, data, plot = T){
  X = model.matrix(formula, data)[,-1]
  Y = model.frame(formula, data)
  Y = model.response(Y)
  XX <- as.factor(X)
  if (is.vector(Y)){r = 1;n=length(Y)} else{a <- dim(Y);n <- a[1];r <- a[2]}
  p = ncol(X)
  ll = sapply(1:p, function(i) ENV(formula, data, rank = i, se = F)$loglik)
  npara.seq <- r * (1:p) + r + p - 1 + p * (p + 1)/2 + r * (r + 1)/2
  bic <- -2 * ll + log(n) * npara.seq
  if (plot){
    size = ifelse(length(bic) <= 20, 1, 0.75)
    plot(bic, xlab = "rank", ylab = "bic", xaxt = "n", type = "b", cex = size, lwd = 1.125)
    axis(1, labels = 1:length(bic), at = 1:length(bic))
    points(x = which.min(bic), y = min(bic), col = "red", pch = 19, cex = size)
  }
  return(list(rank.est = which.min(bic), bic = bic))
}


#' Estimate dimensions for a HENV model
#'
#' @param formula model formula
#' @param data data frame
#' @param plot whether to plot. defaults to TRUE.
#'
#' @return a list and plot
#' @export
#'
rankest.henv <- function(formula, data, plot = T){
  X = model.matrix(formula, data)[,-1]
  Y = model.frame(formula, data)
  Y = model.response(Y)
  XX <- as.factor(X)
  a <- dim(Y)
  n <- a[1]
  r <- a[2]
  p <- nlevels(XX)
  paranum = loglik.seq <- c()
  for (u in 1:r) {
    loglik.seq[u] <- HENV(formula, data, rank = u, se = FALSE)$loglik
    paranum[u] <- (r - u) + u * (r - u + p) + p * u *
      (u + 1)/2 + (r - u) * (r - u + 1)/2
  }
  bic <- -2 * loglik.seq + log(n) * paranum
  if (plot){
    size = ifelse(length(bic) <= 20, 1, 0.75)
    plot(bic, xlab = "rank", ylab = "bic", xaxt = "n", type = "b", cex = size, lwd = 1.125)
    axis(1, labels = 1:(length(bic)), at = 1:length(bic))
    points(x = which.min(bic), y = min(bic), col = "red", pch = 19, cex = size)
  }
  return(list(rank.est = which.min(bic), bic = bic))
}

#' Estimate dimensions for a group OLS, Qreg, or Theil-Sen model
#'
#' @param X the covariates
#' @param Y the response
#' @param idx the group assignment for each covariate
#' @param method one of "ols", "quantile", or "theil-sen".
#' @param q a quantile if method is "quantile". defaults to 0.50.
#' @return a list
#' @export
#'
rankest.grpols <-function(X, Y, idx, method = c("ols","quantile","theil-sen"), q = 0.5){
  method <- match.arg(method)
  method <- switch(method,
                    "ols" = grpOLS,
                    "quantile" = function(X,Y,idx,ranks){grpQreg(X,Y,idx,ranks,q)},
                    "theil-sen" = grpTheilSen
  )

  n<-nrow(X)
  ngrp<-length(unique(idx))
  dsw<-rep(1, ngrp)
  d<-rep(0, ngrp)
  mu = apply(X, 2, mean)
  sig = var(X)
  signrt = matpower(sig, -1/2)
  Z <- t(t(X) - mu) %*% signrt
  Y <- (Y-mean(Y))/sd(Y)
  out.d <- method(Z,Y,idx,dsw)
  M <- t(as.matrix(out.d$coef)) %*% (as.matrix(out.d$coef))
  vecM <- diag(M)
  orderM <- order(vecM, decreasing = TRUE)
  crit <- -1/(n^(1/ngrp)*log(n))
  for (i in 1:length(vecM)){
    crit.d <- sum(vecM[orderM][1:i]) - (i+1)/(n^(1/ngrp)*log(n))
    crit<-c(crit, crit.d)
  }
  if (which(crit == max(crit)) > 1){
    d[orderM[1:(which(crit == max(crit))-1)]] <- 1
  }
  ans<-list(rankest=d, bic=crit)
  return(ans)
}


#' Estimate dimensions for a (robust) group GLM model
#'
#' @param X the covariates
#' @param Y the response
#' @param idx the group assignment for each covariate
#' @param family a glm family compatible with the chosen method.
#' @param method one of "glm" or "rob".
#' @return a list
#' @export
#'
rankest.grpglm <-function(X, Y, idx, family = gaussian(), method = c("glm","rob")){
  method <- match.arg(method)
  method <- switch(method, "glm" = grpGLM, "rob" = grpRobGLM)
  n<-nrow(X)
  ngrp<-length(unique(idx))
  dsw<-rep(1, ngrp)
  d<-rep(0, ngrp)
  mu = apply(X, 2, mean)
  sig = var(X)
  signrt = matpower(sig, -1/2)
  Z <- t(t(X) - mu) %*% signrt
  out.d <- method(Z,Y,idx,dsw,family=family)
  M <- t(as.matrix(out.d$coef)) %*% (as.matrix(out.d$coef))
  vecM <- diag(M)
  orderM <- order(vecM, decreasing = TRUE)
  crit <- -1/(n^(1/ngrp)*log(n))
  for (i in 1:length(vecM)){
    crit.d <- sum(vecM[orderM][1:i]) - (i+1)/(n^(1/ngrp)*log(n))
    crit<-c(crit, crit.d)
  }
  if (which(crit == max(crit)) > 1){
    d[orderM[1:(which(crit == max(crit))-1)]] <- 1
  }
  ans<-list(rankest=d, bic=crit)
  return(ans)
}



#' Discretize a numeric variable for SIR and SAVE
#'
#'
#' @description returns integers indicating bin assignments for a continuous random variable "x"
#' based on the quantiles of "x".
#'
#' @param x variable
#' @param q number of cuts
#'
#' @return a numeric vector
#' @export
#'
discretize <- function(x, q=4){
  na.rm=TRUE
  q <- seq(0.025,0.975,length.out=q+1)
  quant <- hdquantile(x, probs = q)
  quant <- c(min(x), quant[-c(1, length(quant))], max(x))
  dups <- duplicated(quant)
  if(any(dups)){
    flag <- x %in% unique(quant[dups])
    retval <- ifelse(flag,paste("[", as.character(x),"]", sep=''), NA)
    uniqs <- unique(quant)
    reposition <- function(cut){
      flag <- x>=cut
      if(sum(flag, na.rm=na.rm)==0)
        return(cut)
      else
        return(min(x[flag], na.rm=na.rm))
    }
    newquant <- sapply(uniqs, reposition)
    retval[!flag] <- as.character(cut(x[!flag], breaks=newquant,include.lowest=TRUE))
    levs <- unique(retval[order(x)])
    retval <- factor(retval, levels=levs)
    mkpairs <- function(x) sapply(x, function(y) if(length(y)==2) y[c(2,2)] else y[2:3])
    pairs <- mkpairs(strsplit(levs, '[^0-9+\\.\\-]+'))
    rownames(pairs) <- c("lower.bound","upper.bound")
    colnames(pairs) <- levs
    closed.lower <- rep(F,ncol(pairs))
    closed.upper <- rep(T,ncol(pairs))
    closed.lower[1] <- TRUE
    for(i in 2:ncol(pairs)){if(pairs[1,i]==pairs[1,i-1] && pairs[1,i]==pairs[2,i-1]) closed.lower[i] <- FALSE}
    for(i in 1:(ncol(pairs)-1)){if(pairs[2,i]==pairs[1,i+1] && pairs[2,i]==pairs[2,i+1]) closed.upper[i] <- FALSE}
    levs <- ifelse(pairs[1,]==pairs[2,], pairs[1,],paste(ifelse(closed.lower,"[","("), pairs[1,],",",pairs[2,],ifelse(closed.upper,"]",")"),sep=''))
    levels(retval) <- levs
  }
  else
    retval <- cut(x, quant, include.lowest=TRUE)
  retval <- as.numeric(retval)
  return(retval)
}




#' Generate a Basis Matrix for a Gaussian Process
#'
#' @description The function has a usage similar to \code{\link{bs}} and \code{\link{ns}} in the \pkg{splines} package.
#' However, this function utilizes \code{\link[=smooth.construct.gp.smooth.spec]{smooth constructor}} in the package \pkg{mgcv}
#' to construct a gaussian process basis matrix.
#'
#' @param x the predictor variable. Missing values are allowed.
#' @param df degrees of freedom of the basis matrix. The default is 5. The minimum allowed is 3.
#' @param knots breakpoints that define the spline. by default these are automatically selected,
#' and not defined by the user.
#' @param intercept If TRUE, an intercept is included in the basis matrix.
#' @param fx If TRUE, it removes the penalization.
#' @param S penalty matrix, defined internally if NULL (the default).
#' @param m a numeric vector. selects the covariance function, sets the scale parameter
#' and, if applicable, shape parameter. m[1] between 1 and 5 selects the correlation
#' function from  respectively, spherical, power exponential, and Matern with kappa = 1.5,
#' 2.5 or 3.5. m[2] if present specifies the scale parameter, with non-positive or absent
#' indicating that the Kammann and Wand estimate should be used. m[3] can be used to
#' specify the shape parameter for the power exponential function. See
#' \code{\link{smooth.construct.gp.smooth.spec}} for more details. the default option
#' here is c(3,-1,1.4) which indicates the power exponential covariance function with
#' the Kammann and Wand scale estimate, and a shape parameter of 1.4.
#'
#' @return A matrix with class \code{"gp"} and class \code{"basis"}.
#' @export

gpSpline <- function(x, df=5, knots=NULL, intercept=FALSE, fx= FALSE, S=NULL, m = c(3,-1,1.4)){
  #
  ################################################################################
  #
  nx <- names(x)
  x <- as.vector(x)
  nax <- is.na(x)
  if(nas <- any(nax)) x <- x[!nax]
  #
  # DEFINE KNOTS AND DF
  if(is.null(knots)) {
    if(df<3) stop("'df' must be >=3")
    knots <- try(hdquantile(unique(x), seq(0,1,length=df+!intercept)))
    if (class(knots)=="try.error"){knots <- quantile(unique(x), seq(0,1,length=df+!intercept))}
  } else df <- length(knots) - !intercept
  #
  # CHECK NUMBER OF UNIQUE x VALUES
  # (ADD SOME IF NEEDED TO PREVENT ERROR IN smooth.construct.cr.smooth.spec)
  if(add <- length(unique(x)) < length(knots))
    x <- c(seq(min(knots), max(knots), length=length(knots)), x)
  #
  # TRANSFORMATION: CALL FUNCTION FROM MGCV
  oo <- mgcv:::smooth.construct.gp.smooth.spec(mgcv:::s(x, bs="gp", k=df+!intercept, m = m),
                                               data=list(x=x), knots=list(x=knots))
  basis <- oo$X
  if(!intercept) basis <- basis[,-1L,drop=FALSE]
  #
  # REMOVE ADDED VALUES AND RE-INSERT MISSING
  if(add) basis <- basis[-seq(length(knots)),,drop=FALSE]
  if(nas) {
    nmat <- matrix(NA, length(nax), ncol(basis))
    nmat[!nax,] <- basis
    basis <- nmat
  }
  #
  # RELATED PENALTY MATRIX
  if(fx) {
    S <- NULL
  } else if(is.null(S)) {
    S <- oo$S[[1]]
    S <- (S+t(S))/2
    if(!intercept) S <- S[-1L,-1L,drop=FALSE]
  } else if(any(dim(S)!=ncol(basis))) stop("dimensions of 'S' not compatible")
  #
  # NAMES AND ATTRIBUTES
  dimnames(basis) <- list(nx, seq(ncol(basis)))
  attributes(basis) <- c(attributes(basis), list(df=df,
                                                 knots=knots,
                                                 intercept=intercept,
                                                 fx=fx,
                                                 S=S))
  class(basis) <- c("matrix", "gp","basis")
  return(basis)
}



#' Generate a Basis Matrix for Penalized Cubic Regression Splines
#'
#' @description The function has a usage similar to \code{\link{bs}} and \code{\link{ns}} in the \pkg{splines} package.
#' However, this function utilizes \code{\link[=smooth.construct.cr.smooth.spec]{smooth constructor}} in the package \pkg{mgcv}
#' to construct cubic regression splines with an optional penalty.
#'
#' @param x the predictor variable. Missing values are allowed.
#' @param df degrees of freedom of the basis matrix. The default is 5. The minimum allowed is 3.
#' @param knots breakpoints that define the spline. by default these are automatically selected,
#' and not defined by the user.
#' @param intercept If TRUE, an intercept is included in the basis matrix.
#' @param fx If TRUE, it removes the penalization.
#' @param S penalty matrix, defined internally if NULL (the default).
#'
#' @return A matrix with class \code{"cr"} and class \code{"basis"}.
#' @export

crSpline <- function(x, df=5, knots=NULL, intercept=FALSE, fx= FALSE, S=NULL) {
  #
  ################################################################################
  #
  nx <- names(x)
  x <- as.vector(x)
  nax <- is.na(x)
  if(nas <- any(nax)) x <- x[!nax]
  #
  # DEFINE KNOTS AND DF
  if(is.null(knots)) {
    if(df<3) stop("'df' must be >=3")
    knots <- try(hdquantile(unique(x), seq(0,1,length=df+!intercept)))
    if (class(knots)=="try.error"){knots <- quantile(unique(x), seq(0,1,length=df+!intercept))}
  } else df <- length(knots) - !intercept
  #
  # CHECK NUMBER OF UNIQUE x VALUES
  # (ADD SOME IF NEEDED TO PREVENT ERROR IN smooth.construct.cr.smooth.spec)
  if(add <- length(unique(x)) < length(knots))
    x <- c(seq(min(knots), max(knots), length=length(knots)), x)
  #
  # TRANSFORMATION: CALL FUNCTION FROM MGCV
  oo <- mgcv:::smooth.construct.cr.smooth.spec(mgcv:::s(x, bs="cr", k=df+!intercept),
                                               data=list(x=x), knots=list(x=knots))
  basis <- oo$X
  if(!intercept) basis <- basis[,-1L,drop=FALSE]
  #
  # REMOVE ADDED VALUES AND RE-INSERT MISSING
  if(add) basis <- basis[-seq(length(knots)),,drop=FALSE]
  if(nas) {
    nmat <- matrix(NA, length(nax), ncol(basis))
    nmat[!nax,] <- basis
    basis <- nmat
  }
  #
  # RELATED PENALTY MATRIX
  if(fx) {
    S <- NULL
  } else if(is.null(S)) {
    S <- oo$S[[1]]
    S <- (S+t(S))/2
    if(!intercept) S <- S[-1L,-1L,drop=FALSE]
  } else if(any(dim(S)!=ncol(basis))) stop("dimensions of 'S' not compatible")
  #
  # NAMES AND ATTRIBUTES
  dimnames(basis) <- list(nx, seq(ncol(basis)))
  attributes(basis) <- c(attributes(basis), list(df=df, knots=knots,
                                                 intercept=intercept, fx=fx, S=S))
  #
  class(basis) <- c("matrix", "cr", "basis")
  #
  return(basis)
}



#' Mean Orthogonal Projection (Combine Dimension Reducton Subspaces)
#'
#' @param p a list of symmetric projection matrices.
#' @param method whether constant weights or sqrt(1/rank) weights should be used.
#'
#' @return a list
#' @export
#' @examples
#' fit1 = SIR(bmi ~ ., diabetes[,-2], rank = 1)
#' fit2 = SIR(bmi ~ ., diabetes[,-2], rank = 3)
#' fit3 = SIR(bmi ~ ., diabetes[,-2], rank = 5)
#' fit4 = SIR(bmi ~ ., diabetes[,-2], rank = 9)
#' MOP(p = list(fit1$sirmat, fit2$sirmat, fit3$sirmat, fit4$sirmat), method = "sqrtinv")
#'
#' @references
#' Liski E., Nordhausen K., Oja H., and Ruiz-Gazen A. (2016), Combining Linear Dimension Reduction Subspaces. In: Recent Advances in
#' Robust Statistics: Theory and Applications. doi: 10.1007/9788132236436_7. \cr
#' \cr
#' Liski, E., Nordhausen, K., Oja, H., & Ruiz-Gazen, A. (2012). Averaging orthogonal projectors. https://arxiv.org/pdf/1210.2575.pdf
#'
MOP <-function(p, method = c("const", "sqrtinv")){
  method <- match.arg(method)
  if (method=="const") out = .mopwtconstant(p)
  else out = .mopwtsqrtinv(p)
  out
}

# subfunction MOP.constant
.mopwtconstant <- function(x){
  n.mat <- length(x)
  p <- dim(x[[1]])[1]
  k <- sapply(x, function(i) sum(diag(i)))
  if(any(k==0)) {
    if (all(k==0)){
      return(cat(crayon::red("All projections are degenerate. The problem is severely ill-conditioned.")))
    }
    else{
      x <- x[-which(k==0)]
      k <- k[-which(k==0)]
    }
  }
  else{
    k <- k
  }
  n.mat.new <- length(x)
  P.matbar <- Reduce("+",x) / n.mat.new
  E.P.matbar <- eigen(P.matbar)
  # Estimate of k
  k.estim <- sum(E.P.matbar$values >= 0.5)
  O.mat <- E.P.matbar$vector[,1:k.estim]
  # MOP
  P.mathat <- tcrossprod(O.mat)
  list(sdrmat = P.mathat, evectors = O.mat, evalues = E.P.matbar$values[1:k.estim], rank.est = k.estim)
}

# subfunction MOP.sq.inverse
.mopwtsqrtinv <- function(x){
  n.mat <- length(x)
  p <- dim(x[[1]])[1]
  # Individual k-estimates
  k <- sapply(x, function(i) sum(diag(i)))
  # Check for projectors with rank 0.
  if(any(k==0)) {
    if (all(k==0)){
      return(cat(crayon::red("All projections are degenerate. The problem is severely ill-conditioned.")))
    }
    else{
      x <- x[-which(k==0)]
      k <- k[-which(k==0)]
      x <- x[k]
    }
  }
  else{
    k <- k
  }
  n.mat.new <- length(x)
  x.weighted <-  mapply("/", x, sqrt(k), SIMPLIFY=FALSE)
  P.star <- Reduce("+", x.weighted) / n.mat.new
  EVD.P.star <- eigen(P.star)
  f.criterion <- cumsum(EVD.P.star$values) / sqrt(1:p)
  # Estimate of k
  k.estim <- which.max(f.criterion)
  O.mat <- EVD.P.star$vector[,1:k.estim]
  # MOP
  P.mathat <- tcrossprod(O.mat)
  list(sdrmat = P.mathat, evectors = O.mat, evalues = EVD.P.star$values[1:k.estim], k = k.estim)
}
abnormally-distributed/cvreg documentation built on May 3, 2020, 3:45 p.m.