R/cellMCD.R

Defines functions truncEig plot_cellMCD cellMCD

Documented in cellMCD plot_cellMCD

cellMCD <- function(X, alpha = 0.75, quant = 0.99, crit = 1e-4, 
                   noCits = 100, lmin = 1e-4,
                   checkPars = list()) {
  #
  # checkPars is as in cellWise::transfo(). If not coreOnly,
  # we run checkDataSet() like we did in other functions.
  #
  #
  #
  # Arguments:
  # X         : A n by d data matrix or data frame to be 
  #             analyzed. Its columns are the variables.
  # alpha     : In each column, at least n*alpha cells must
  #             remain unflagged. Defaults to 75%, should not
  #             be set (much) lower.
  # quant     : Determines the cutoff value to flag cells. 
  #             Defaults to 0.99
  # crit      : The iteration stops when successive covariance
  #             matrices (of the standardized data) differ by
  #             less than crit. Defaults to 1e-4.
  # noCits    : The maximal number of C-steps used.
  # lmin      : a lower bound on the eigenvalues of the 
  #             estimated covariance matrix on the 
  #             standardized data. Defaults to 1e-04.
  #             Should not be smaller than 1e-6.
  # lmax      : if not NULL, an upper bound on the eigenvalues 
  #             of the estimated covariance matrix on the 
  #             standardized data. Could e.g. be set to
  #             2*ncol(X) since the largest eigenvalue is at
  #             most d max_{ij} |C_{ij}| = d max_j |C_{jj}|.
  # checkPars : Optional list of parameters used in the call 
  #            to cellMCD. The options are:
  #     - coreOnly: If TRUE, skip the execution of checkDataset. 
  #         Defaults to FALSE
  #     - numDiscrete:  A column that takes on numDiscrete or 
  #         fewer values will be considered discrete and not
  #         retained in the cleaned data. Defaults to 5.
  #     - precScale: Only consider columns whose scale is larger 
  #         than precScale. Here scale is measured by the median 
  #         absolute deviation. Defaults to 1e-12.
  #     - silent: Whether or not the function progress messages 
  #         should be printed. Defaults to FALSE.
  
  # First some auxiliary functions that are only used here, so 
  # they don't show up in the list of functions:
  
  lmax <- NULL # upper bound on maximum eigenvalue. not used
  
  Cstep <- function(X, W, mu, Sigma, Sigmai, lambdas, h,
                   precScale=1e-12){
    # Performs the C-step for cellMCD.
    # Part 1 updates W for fixed mu and Sigma,
    # Part 2 updates mu and Sigma for fixed W.
    #
    dims <- dim(X)
    n    <- dims[1]
    d    <- dims[2]
    #
    # Part 1: start by updating W. 
    W <- updateW_cpp(X = X, W = W, mu = mu,
                     Sigma = Sigma, Sigmai = Sigmai,
                     lambda = lambdas, h = h)
    
    
    # Part 2: now execute one EM-step. 
    #
    # First take one E-step.
    # This can be done faster: we should only need to invert 
    # a single matrix here!
    # We could also loop over patterns instead of rows.
    #
    Ximp <- X # will contain suff stats for estimating mu
    bias <- matrix(0, d, d) # more suff stats for estimating Sigma
    #
    for (i in 1:n) { # loops over rows, not patterns
      mis <- which(W[i, ] == 0)
      obs <- which(W[i, ] == 1)
      x <- X[i, ]
      if (length(mis) > 0)  {
        if (length(mis) == d) { # whole row is missing
          ximp <- mu # just plug in old mu
          bias <- bias + Sigma # "bias" is updated by old Sigma
        } else{
          ximp <- x
          Sigmai_temp <- solve(Sigmai[mis, mis]) 
          ximp[mis] <- mu[mis] - Sigmai_temp %*%
            Sigmai[mis, obs, drop = F] %*%
            t(X[i, obs, drop = F] - mu[obs]) 
          bias[mis, mis] <- bias[mis, mis] + Sigmai_temp
        }
        Ximp[i, ] <- ximp
      }
    }
    #
    # Now one M-step, with the same formulas as if the sufficient 
    # statistics Ximp and "bias" came from complete data:
    #
    mu    <- colMeans(Ximp)
    bias  <- bias / n
    Sigma <- cov(Ximp) * (n - 1) / n + bias
    return(list(W = W, mu = mu, Sigma = Sigma))
  }
  
  iterMCD <- function(X,
                     initEst,
                     alpha=0.75,
                     lambdas,
                     precScale=1e-12,
                     crit=1e-4,
                     noCits=100,
                     lmin = NULL,
                     lmax = NULL,
                     silent) {
    #
    if (!is.list(initEst)) stop("initEst should be a list") 
    h <- ceiling(alpha * dim(X)[1])
    p <- ncol(X)
    n <- nrow(X)
    
    mu     <- initEst$mu
    Sigma  <- initEst$Sigma
    Sigmai <- mpinv(Sigma)$Inv
    if (is.null(initEst$W)) { W <- matrix(1, n, p)
    } else { W <- initEst$W }
    W[is.na(X)] <- 0 # to deal with NA's in data
    
    convcrit <- 1
    nosteps  <- 0
    objvals  <- rep(NA, noCits + 1)
    penalty  <- sum(lambdas * colSums(1 - W))
    objvals[nosteps + 1] <- Objective_cpp(X, W, mu, Sigma, Sigmai) + penalty
    if (!silent) cat(paste0("\nObjective at step ", nosteps, " = ",
                           round(objvals[nosteps + 1], 5), "\n"))
    prevW     <- W
    prevmu    <- mu
    prevSigma <- Sigma
    
    while (convcrit > crit && nosteps < noCits) {
      Cresult  <- Cstep(X = X, W = W, mu = mu, Sigma = Sigma,
                       Sigmai = Sigmai, lambdas = lambdas, h = h)
      convcrit <- max(abs((Cresult$Sigma - Sigma))) 
      W       <- Cresult$W
      mu      <- Cresult$mu
      Sigma   <- Cresult$Sigma
      Sigma   <- truncEig(Sigma, lmin, lmax)
      Sigmai  <- solve(Sigma)
      penalty <- sum(lambdas * colSums(1 - W))
      objvl   <- Objective_cpp(X, W, mu, Sigma, Sigmai) + penalty 
      if (objvl > objvals[nosteps + 1]) { 
        return(list(W = prevW, mu = prevmu, Sigma = prevSigma,
                    nosteps = nosteps))
      }
      if (!silent) cat(paste0("Objective at step ", nosteps + 1, " = ",
                              round(objvl, 5), "\n"))
      nosteps <- nosteps + 1
      objvals[nosteps+1] <- objvl
      prevW     <- W
      prevmu    <- mu
      prevSigma <- Sigma
    }
    return(list(W = W, mu = mu, Sigma = Sigma, nosteps = nosteps))
  }
  
  # Here the main function starts:
  #
  X <- as.matrix(X) 
  if (!"coreOnly" %in% names(checkPars)) {
    checkPars$coreOnly <- FALSE
  }
  if (!"numDiscrete" %in% names(checkPars)) {
    checkPars$numDiscrete <- 5
  }
  if (!"precScale" %in% names(checkPars)) {
    checkPars$precScale <- 1e-12
  }
  if (!"silent" %in% names(checkPars)) {
    checkPars$silent <- FALSE
  }
  
  if (!"fracNA" %in% names(checkPars)) {
    checkPars$fracNA <- 0.5
  }
  
  if (!checkPars$coreOnly) {
    X <- checkDataSet(X, fracNA = 0.5, 
                      numDiscrete = checkPars$numDiscrete, 
                      precScale = checkPars$precScale, 
                      silent = checkPars$silent)$remX
  }
  # check dimension of the data w.r.t. the sample size
  if (nrow(X) < 5 * ncol(X)) {
    warning(paste0("There are fewer than 5 cases per dimension",
                   " in this data set.\n",
                   "It is not recommended to run cellMCD",
                   " on these data.\n",
                   "Consider reducing the number of variables."))
  }
  if (lmin < 1e-6) stop("lmin should be at least 1e-6.")
  
  #
  # Robustly standardize the data
  #
  locsca  <- cellWise::estLocScale(X)
  rscales <- locsca$scale
  Xs      <- scale(X, center = locsca$loc, scale = rscales)
  #
  # Check whether there are not too many bad cells:
  #
  margfrac <- colSums(abs(Xs) > sqrt(qchisq(0.99,1)), na.rm = TRUE) / nrow(Xs)
  if (max(margfrac) > 1 - alpha) {
    cat(paste0("\nAt least one variable of X has more than ",
               "100*(1-alpha)% = ", 100 * (1 - alpha), "%",
               "\nof marginal outliers.",
               "\nThe percentages per variable are:\n"))
    print(round(100 * margfrac, 2))
    stop("Too many marginal outliers.")
  }
  badfrac <- colMeans((abs(Xs) > sqrt(qchisq(0.99,1))) | (is.na(Xs))) # also includes real NAs in X
  if (max(badfrac) > 1 - alpha) {
    cat(paste0("\nAt least one variable of X has more than ",
               "100*(1-alpha)% = ", 100 * (1 - alpha), "%",
               "\nof marginal outliers plus NA's.",
               "\nThe percentages per variable are:\n"))
    print(round(100 * badfrac, 2))
    stop("Too many marginal outliers plus NA's.")
  }
  #
  DDCWout <- DDCWcov(Xs, maxCol = 1 - alpha, lmin = lmin, lmax = lmax)
  initEst <- list(mu = DDCWout$center, Sigma = DDCWout$cov)
  #
  # We set the marginal outliers to NA so they stay flagged:
  Xs[abs(Xs) > 3] = NA
  #
  logC    <- -log(diag(solve(initEst$Sigma)))
  cutoff  <- qchisq(quant, df = 1)
  lambdas <- cutoff + logC + log(2 * pi)
  temp    <- iterMCD(Xs, initEst = initEst, alpha = alpha, lambdas = lambdas, 
                     precScale = checkPars$precScale, crit = crit, 
                     noCits = noCits,  
                     lmin = lmin, lmax = lmax, 
                     silent = checkPars$silent)
  W <- temp$W
  rownames(W) <- rownames(X)
  colnames(W) <- colnames(X)
  out   <- allpreds_cpp(Xs, temp$Sigma, temp$mu, W)
  preds <- out$preds
  if (sum(is.na(as.vector(preds))) > 0 ) stop("There are missing preds!")
  cvars <- out$cvars
  if (sum(is.na(as.vector(cvars))) > 0 ) stop("There are missing cvars!") 
  if (min(as.vector(cvars)) <= 0) stop("There are cvars <= 0 !")
  rownames(preds) <- rownames(cvars) <- rownames(X)
  colnames(preds) <- colnames(cvars) <- colnames(X)
  
  if (!checkPars$silent) {
    percflag <- 100 * colMeans(1 - W, na.rm = TRUE)
    cat("Percentage of flagged cells per variable:\n")
    print(round(percflag,2))
  }
  mu    <- locsca$loc + temp$mu * rscales
  raw.S <- diag(rscales) %*% temp$Sigma %*% diag(rscales)
  S <- diag(rscales) %*% cov2cor(temp$Sigma) %*% diag(rscales)
  colnames(S) = rownames(S) = colnames(raw.S) = rownames(raw.S) = colnames(X)
  
  preds <- scale(preds, center = FALSE, scale = 1 / rscales)
  preds <- scale(preds, center = -locsca$loc, scale = FALSE)
  csds  <- scale(sqrt(cvars), center = FALSE, scale = 1 / rscales)
  Ximp  <- X
  Ximp[which(W == 0)] <- preds[which(W == 0)]
  Zres  <- (X - preds) / csds
  return(list(mu = mu, S = S,
              W = W, preds = preds,
              csds = csds, Ximp = Ximp,
              Zres = Zres, raw.S = raw.S,
              locsca = locsca,
              nosteps = temp$nosteps,
              X = X, quant = quant))
} 


plot_cellMCD = function(cellout, type = "Zres/X", whichvar = NULL,
                        horizvar = NULL, vertivar = NULL,  
                        hband = NULL, vband = NULL, drawellipse = T,
                        opacity = 0.5, identify = FALSE, 
                        ids=NULL, labelpoints = T, vlines = FALSE,
                        clines = TRUE, main = NULL,
                        xlab = NULL, ylab = NULL, xlim = NULL,
                        ylim = NULL, cex = 1, cex.main = 1.2, 
                        cex.txt = 0.8, cex.lab = 1, line=2.0){
  #
  # Function for making plots based on cellMCD output.
  #
  # Arguments:
  # cellout      output of function cellMCD()
  # type         "index", "Zres/X", "Zres/pred", "X/pred", or
  #              "bivariate".
  # whichvar     number or name of the variable to be plotted.  
  #              Not applicable when type == "bivariate".
  # horizvar     number or name of the variable to be plotted on the 
  #              horizontal axis. Only when type == "bivariate".
  # vertivar     number or name of the variable to be plotted on the 
  #              vertical axis. Only when type == "bivariate".
  # hband        draw a horizontal tolerance band? TRUE or FALSE. 
  #              NULL yields TRUE for types "index", "Zres/X", 
  #              and "Zres/pred".
  # vband        draw a vertical tolerance band? TRUE or FALSE.
  #              NULL yields TRUE for types "Zres/X", "Zres/pred", 
  #              and "X/pred".
  # drawellipse  whether to draw a 99% tolerance ellipse. Only
  #              for type == "bivariate".
  # opacity      opacity of the plotted points: 1 is fully opaque,
  #              less is more transparent.
  # identify     if TRUE, identify cases by mouseclick, then Esc.
  # ids          vector of case numbers to be emphasized in the plot.
  #              If NULL or of length zero, none are emphasized.
  # labelpoints  if TRUE, labels the points in ids by their
  #              row name in X.
  # vlines       for the points in ids, draw dashed vertical lines 
  #              from their standardized residual to 0 when type is
  #              "index", "Zres/X", or "Zres/pred". Draws dashed
  #              vertical ines to the diagonal for type "X/pred". 
  #              Can be TRUE or FALSE, default is FALSE.
  # clines       only for type == "bivariate". If TRUE, draws
  #              a red connecting line from each point in ids to 
  #              its imputed point, shown in blue.
  #
  # The following arguments are plot options, to finalize plots
  # for presentation:
  #
  # main         main title of the plot. If NULL, it is constructed
  #              automatically from the arguments.  
  # xlab         overriding label for x-axis, unless NULL.
  # ylab         overriding label for y-axis, unless NULL.
  # xlim         overriding limits of horizontal axis.
  # ylim         overriding limits of vertical axis.
  # cex          size of plotted points.
  # cex.main     size of the main title.
  # cex.lab      size of the axis labels.
  # cex.txt      size of the point labels.
  # line         distance of axis labels to their axis.
  #
  # Invisible output:
  # out      NULL, except when identify == TRUE. Then a list with:
  #          $ids    : the case number(s) that were identified
  #          $coords : coordinates of all points in the plot.
  
  # First some auxiliary functions:
  #
  identfy = function(xcoord,ycoord){
    # identify points in a plot(x,y)
    coordinates <- cbind(xcoord,ycoord)
    message("Press the escape key to stop identifying.")
    iout <- identify(coordinates, order = TRUE)
    ids <- iout$ind[order(iout$order)]
    if (length(ids) > 0) {
      cat("Identified point(s): ")
      print(ids)
    }   
    return(list(ids=ids, coords=coordinates[ids,,drop=F]))
  }
  
  addlabels = function(xcoord, ycoord, ids, labs, cex.txt = 0.8,
                       labtype = NULL){
    # Adds labels to plot. When labs = "i" it is the case number.
    # Also, labs can be the set of row names of the dataset.
    # For labs = "letters" we plot a, b, c, ...  
    #
    len = length(ids)
    if(len == 0) stop("ids has no elements")
    if(is.null(labtype)) { mylabs = labs[ids] 
    } else {
      if(labtype == "i") mylabs = ids
      if(labtype == "letters") mylabs = letters[seq_len(len)]
    }
    text(x = xcoord[ids], y = ycoord[ids], labels = mylabs, 
         cex = cex.txt)
  }
  
  vlines2diag = function(xcoord, ycoord, ids, lty=2, col="red"){
    # For indices in isd, plot vertical residual line 
    for(i in seq_len(length(ids))){ # i=2
      xys = c(xcoord[ids[i]],xcoord[ids[i]],
              ycoord[ids[i]],xcoord[ids[i]])
      lines(matrix(xys, ncol = 2, byrow=F), lty=lty, col=col)
    }
  }
  
  vlines2zero = function(xcoord, ycoord, ids, lty=2, col="red"){
    # For indices in isd, plot vertical residual line 
    for(i in seq_len(length(ids))){ # i=2
      xys = c(xcoord[ids[i]],xcoord[ids[i]],
              ycoord[ids[i]],0)
      lines(matrix(xys, ncol = 2, byrow=F), lty=lty, col=col)
    }
  }
  
  # Replaced ellipse::ellipse() by the simple function below,
  # so we do not have a dependency on library(ellipse).
  ellipsepoints = function(covmat, mu, quant=0.99, npoints = 100)
  { # computes points of the ellipse t(x-mu)%*%covmat%*%(x-mu) = c
    # with c = qchisq(quant,df=2)
    if (!all(dim(covmat) == c(2, 2))) stop("covmat is not 2 by 2")
    eig = eigen(covmat)
    U = eig$vectors
    R = U %*% diag(sqrt(eig$values)) %*% t(U) # square root of covmat
    angles = seq(0, 2*pi, length = npoints+1)
    xy = cbind(cos(angles),sin(angles)) # points on the unit circle
    fac = sqrt(qchisq(quant, df=2))
    scale(fac*xy%*%R, center = -mu, scale=FALSE)
  }  
  
  # Here the main function starts:
  #
  cutf = sqrt(qchisq(cellout$quant, df=1))
  mycol <- adjustcolor("black", alpha.f = opacity)
  redcol <- adjustcolor("red", alpha.f = 1) # = opacity)
  X = as.matrix(cellout$X)
  if(length(dim(X)) != 2) stop("cellout$X is not a matrix")
  ncol = ncol(X)
  nrow = nrow(X)
  cn = colnames(X)
  if(is.null(cn)) cn = seq_len(ncol)
  rn = rownames(X)
  if(is.null(rn)) rn = seq_len(nrow)
  if(type %in% c("index", "Zres/X", "Zres/pred", "X/pred")){
    if(is.null(whichvar)){
      stop("You must specify the variable whichvar to plot.")
    }
    if(whichvar %in% seq_len(ncol)){
      j = whichvar
      varlab = cn[j] 
    } else { 
      if(whichvar %in% cn){
        j = which(cn == whichvar)
        varlab = whichvar
      } else { stop(paste0("whichvar = ",whichvar," is not valid")) }
    }
    Xj     = X[,j]
    preds  = cellout$preds[,j]
    
    Zres = cellout$Zres[, j]
    flagged = which(abs(Zres) > cutf)
    #
    if(type == "index"){
      ycoord = Zres 
      xcoord = seq_len(length(ycoord))
      hlim = c(-cutf, cutf)
      obsp = which(!is.na(ycoord)) # points to plot
      if(is.null(ylim)) ylim = range(c(ycoord[obsp],hlim), na.rm=T)    
      plot(xcoord[obsp], ycoord[obsp], xlim=xlim, ylim=ylim, pch=16, 
           xlab="", ylab="", col=mycol, cex=cex) 
      if(is.null(xlab)) xlab = "index"
      title(xlab = xlab, line=line, cex.lab = cex.lab)
      if(is.null(ylab)) ylab = paste0("standardized residual of ",varlab)
      title(ylab = ylab, line=line, cex.lab = cex.lab)
      if(is.null(main)) main = 
        paste0("index plot: standardized residual of ",varlab)
      title(main = main, line = 1, cex.main = cex.main)
      if(is.null(hband) || hband==T){
        abline(h = hlim, lwd = 3, col="darkgray") }
      # in an index plot we cannot draw a meaningful vband.
      points(xcoord[flagged], ycoord[flagged], pch=16, col=redcol)
      if(length(ids) > 0){
        if(labelpoints) {
          addlabels(xcoord, ycoord, ids, labs = rn, cex.txt = cex.txt)
        }
        if(vlines == TRUE){
          vlines2zero(xcoord, ycoord, ids, lty=2, col="red")
        }
      }
    }
    #
    if(type == "Zres/X"){
      xcoord = Xj
      ycoord = Zres
      lsx  = cellWise::estLocScale(xcoord) 
      vlim = c(lsx$loc - cutf*lsx$scale, lsx$loc + cutf*lsx$scale)
      hlim = c(-cutf, cutf)
      obsp = which(!is.na(xcoord) & !is.na(ycoord)) # points to plot
      if(is.null(xlim)) xlim = range(c(xcoord[obsp],vlim), na.rm=T)
      if(is.null(ylim)) ylim = range(c(ycoord[obsp],hlim), na.rm=T)    
      plot(xcoord[obsp], ycoord[obsp], xlim=xlim, ylim=ylim, pch=16, 
           xlab="", ylab="", col=mycol, cex=cex) 
      if(is.null(xlab)) xlab=varlab
      title(xlab = xlab, line=line, cex.lab = cex.lab)
      if(is.null(ylab)) ylab = paste0("standardized residual of ",varlab)
      title(ylab = ylab, line=line, cex.lab = cex.lab)
      if(is.null(main)) main = paste0("standardized residual versus X for ",varlab)
      title(main = main,line = 1, cex.main = cex.main) 
      if(is.null(hband) || hband==T){
        abline(h = hlim, lwd = 3, col="darkgray") }  
      if(is.null(vband) || vband==T){
        abline(v = vlim, lwd = 3, col="darkgray") }
      points(xcoord[flagged], ycoord[flagged], pch=16, col=redcol)
      if(length(ids) > 0){
        if(labelpoints) {      
          addlabels(xcoord, ycoord, ids, labs = rn, cex.txt = cex.txt)
        }
        if(vlines == TRUE){
          abline(h=0) 
          vlines2zero(xcoord, ycoord, ids, lty=2, col="red") 
        }
      }
    }
    #
    if(type == "Zres/pred"){  
      xcoord = preds
      ycoord = Zres
      lsx  = cellWise::estLocScale(xcoord) 
      vlim = c(lsx$loc - cutf*lsx$scale, lsx$loc + cutf*lsx$scale)
      hlim = c(-cutf, cutf)
      obsp = which(!is.na(xcoord) & !is.na(ycoord)) # points to plot
      if(is.null(xlim)) xlim = range(c(xcoord[obsp],vlim), na.rm=T)
      if(is.null(ylim)) ylim = range(c(ycoord[obsp],hlim), na.rm=T)    
      plot(xcoord[obsp], ycoord[obsp], xlim=xlim, ylim=ylim, pch=16, 
           xlab="", ylab="", col=mycol, cex=cex) 
      if(is.null(xlab)) xlab = paste0("predicted ",varlab)
      title(xlab=xlab, line=line, cex.lab = cex.lab)
      if(is.null(ylab)) ylab = paste0("standardized residual of ",varlab)
      title(ylab = ylab, line=line, cex.lab = cex.lab)
      if(is.null(main)) main = paste0(
        "standardized residual versus prediction for ",varlab)
      title(main = main,line = 1, cex.main = cex.main)
      if(is.null(hband) || hband==T){
        abline(h = hlim, lwd = 3, col="darkgray") }  
      if(is.null(vband) || vband==T){
        abline(v = vlim, lwd = 3, col="darkgray") }
      points(xcoord[flagged], ycoord[flagged], pch=16, col=redcol)
      if(length(ids) > 0){
        if(labelpoints) {
          addlabels(xcoord, ycoord, ids, labs = rn, cex.txt = cex.txt)
        }
        if(vlines == TRUE){
          abline(h=0) 
          vlines2zero(xcoord, ycoord, ids, lty=2, col="red") 
        }
      }
    }
    #
    if(type == "X/pred"){
      xcoord = preds
      ycoord = Xj
      lsx  = cellWise::estLocScale(xcoord) 
      lsy  = cellWise::estLocScale(ycoord)
      vlim = c(lsx$loc - cutf*lsx$scale, lsx$loc + cutf*lsx$scale)
      hlim = c(lsy$loc - cutf*lsy$scale, lsy$loc + cutf*lsy$scale)
      obsp = which(!is.na(xcoord) & !is.na(ycoord)) # points to plot
      if(is.null(xlim)) xlim = range(c(xcoord[obsp],vlim), na.rm=T)
      if(is.null(ylim)) ylim = range(c(ycoord[obsp],hlim), na.rm=T)    
      plot(xcoord[obsp], ycoord[obsp], xlim=xlim, ylim=ylim, pch=16, 
           xlab="", ylab="", col=mycol, cex=cex) 
      if(is.null(xlab)) xlab = paste0("predicted ",varlab)
      title(xlab = xlab, line=line, cex.lab = cex.lab)
      if(is.null(ylab)) ylab = paste0("observed ",varlab)
      title(ylab = ylab, line=line, cex.lab = cex.lab)
      if(is.null(main)) main = paste0(varlab," versus its prediction")
      title(main = main,line = 1, cex.main = cex.main)
      abline(0,1)
      if(!is.null(hband) && hband == TRUE){
        abline(h = hlim, lwd = 3, col="darkgray") }
      if(!is.null(vband) && vband == TRUE){
        abline(v = vlim, lwd = 3, col="darkgray") }
      points(xcoord[flagged], ycoord[flagged], pch=16, col=redcol)
      if(length(ids) > 0){
        if(labelpoints) {
          addlabels(xcoord, ycoord, ids, labs = rn, cex.txt = cex.txt)
        }
        if(vlines == TRUE){
          vlines2diag(xcoord, ycoord, ids, lty=2, col="red")
        }   
      }
    }
  } else {
    if(type == "bivariate"){
      if(is.null(horizvar)) {
        stop(paste0("You must specify the variable horizvar\n",
                    "to plot on the horizontal axis.")) }
      if(is.null(vertivar)) {
        stop(paste0("You must specify the variable verticar\n",
                    "to plot on the vertical axis.")) }
      if(horizvar %in% seq_len(ncol)){
        jj = horizvar
        hlab = cn[jj] 
      } else { 
        if(horizvar %in% cn){
          jj = which(cn == horizvar)
          hlab = horizvar
        } else { stop(paste0("horizvar = ",horizvar," is not valid")) }
      }
      if(vertivar %in% seq_len(ncol)){
        kk = vertivar
        vlab = cn[kk] 
      } else { 
        if(vertivar %in% cn){
          kk = which(cn == vertivar)
          vlab = vertivar
        } else { stop(paste0("vertivar = ",vertivar," is not valid")) }
      }
      if(jj==kk) stop("horivar and vertivar should differ.")
      xcoord = X[,jj]
      ycoord = X[,kk]
      imp = cellout$Ximp
      ell = ellipsepoints(covmat = cellout$S[c(jj,kk),c(jj,kk)],
                          mu = cellout$mu[c(jj,kk)], quant=0.99,
                          npoints=500)
      lsx = cellWise::estLocScale(xcoord) 
      lsy = cellWise::estLocScale(ycoord)
      vlim = c(lsx$loc - cutf*lsx$scale, lsx$loc + cutf*lsx$scale)
      hlim = c(lsy$loc - cutf*lsy$scale, lsy$loc + cutf*lsy$scale)
      obsp = which(!is.na(xcoord) & !is.na(ycoord)) # points to plot
      if(is.null(xlim)) {
        xlim = range(c(xcoord[obsp],imp[obsp,jj],ell[,1],vlim), na.rm=T)
      }
      if(is.null(ylim)) {
        ylim = range(c(ycoord[obsp],imp[obsp,kk],ell[,2],hlim),na.rm=T)
      }
      flagged = which(cellout$W[,jj]*cellout$W[,kk] == 0) 
      plot(xcoord[obsp], ycoord[obsp], xlim=xlim, ylim=ylim, pch=16, 
           xlab="", ylab="", col=mycol, cex=cex) 
      if(is.null(xlab)) xlab = hlab
      title(xlab = xlab, line=line, cex.lab = cex.lab)
      if(is.null(ylab)) ylab = vlab
      title(ylab = ylab, line=line, cex.lab = cex.lab)
      if(is.null(main)) main = paste0(vlab," versus ",hlab)
      title(main = main,line = 1, cex.main = cex.main)
      if(is.null(hband)) hband = F
      if(hband == T) abline(h = hlim, lwd=3, col="darkgray")
      if(is.null(vband)) vband = F
      if(vband == T) abline(v = vlim, lwd=3, col="darkgray")
      if(drawellipse) lines(ell, lwd=3, col="darkgray")
      points(xcoord[flagged], ycoord[flagged], pch=16, col=redcol)
      if(length(ids) > 0){
        if(labelpoints) {
          addlabels(xcoord, ycoord, ids, labs = rn, cex.txt = cex.txt)
        }
        if(is.null(clines) || clines==T){
          imp = cellout$Ximp
          for(i in ids) {
            if(i %in% obsp){
              lines(x=c(X[i,jj],imp[i,jj]),y=c(X[i,kk],imp[i,kk]), 
                    lty=1, col="red")
              points(x=imp[i,jj],y=imp[i,kk],pch=16,col="blue")
            }
          }
        }   
      }
    } else  stop(paste0("type = \"",type,"\" is invalid"))
  }
  if(identify) invisible(identfy(xcoord,ycoord))
}





truncEig <- function(S, lmin = NULL, lmax = NULL) {
  # Truncates the eigenvalues of S to between lmin and lmax.
  nrS <- nrow(S)
  if (ncol(S) != nrS) stop(" S is not square")
  if (mean(as.vector(abs(S - t(S)))) > 1e-8) {
    stop(" S is not symmetric")
  }
  Sout <- S
  if (!(is.null(lmin) && is.null(lmax))) {
    if (!is.null(lmin)) {
      if (lmin < 0) stop(paste0("lmin = ",lmin," must be >= 0"))
      if (!(lmin < Inf)) stop(paste0("lmin = ",lmin," must be finite"))
    }
    if (!is.null(lmax)) {
      if (lmax < 0) stop(paste0("lmax = ",lmax," must be >= 0"))
      if (!(lmax < Inf)) stop(paste0("lmax = ",lmax," must be finite"))
    }
    eig <- eigen(0.5 * (S + t(S)))
    vals <- eig$values
    if (!is.null(lmin)) { # there is a lower bound
      vals <- pmax(vals, lmin) 
    }
    if (!is.null(lmax)) { # there is an upper bound
      vals <- pmin(vals, lmax) 
    }    
    Sout <- eig$vectors %*% diag(vals) %*% t(eig$vectors) 
  } 
  return(Sout)
}

Try the cellWise package in your browser

Any scripts or data that you put into this service are public.

cellWise documentation built on Oct. 25, 2023, 5:07 p.m.