# R/clever.R
#
# Defines functions: clever
#
# Documented in: clever

#' Identify outliers with \code{clever}
#' 
#' Calculates PCA leverage or robust distance and identifies outliers.
#'
#' \code{clever} will use all combinations of the requested projection and 
#'  out_meas methods that make sense. For example, if  
#'  \code{projection=c("PCATF", "PCA_var", "PCA_kurt")} and 
#'  \code{out_meas=c("leverage", "robdist")} then these five
#'  combinations will be used: PCATF with leverage, PCA + variance with 
#'  leverage, PCA + variance with robust distance, PCA + kurtosis with leverage,
#'  and PCA + kurtosis with robust distance. Each method combination will yield 
#'  its own out_meas time series.
#' 
#' @param X Numerical data matrix. Should be wide (N observations x P variables, 
#'  \eqn{N >> P}).
#' @param projection Character vector indicating the projection methods
#'  to use. Choose at least one of the following: \code{"PCA_var"} for 
#'  PCA + variance, \code{"PCA_kurt"} for PCA + kurtosis, and \code{"PCATF"} for
#'  PCA Trend Filtering + variance. Or, use \code{"all"} to use all projection 
#'  methods. Default: \code{c("PCA_kurt")}.
#' @param out_meas Character vector indicating the outlyingness measures to 
#'  compute. Choose at least one of the following: \code{"leverage"} for 
#'  leverage, or \code{"robdist"} for robust distance. Or, use \code{"all"} 
#'  to use both methods. Default: \code{c("leverage")}.
#' @param DVARS Should DVARS (Afyouni and Nichols, 2017) be computed too? Default 
#'  is \code{TRUE}.
#' @param detrend_PCs Detrend all PCs before computing leverage or robust 
#'  distance? Default: \code{TRUE}.
#' 
#'  Detrending is recommended for time-series data, especially if there are many
#'  time points or changing circumstances, such as in task-based fMRI. 
#'  Detrending should not be used with non-time-series data because the 
#'  observations are not temporally related.
#' @param PCATF_kwargs Named list of arguments for PCATF projection method.
#'  Only applies if \code{("PCATF" \%in\% projection)}.
#' 
#'  Valid entries are: 
#'  
#'  \describe{
#'    \item{K}{maximum number of PCs to compute (Default: \code{1000})}
#'    \item{lambda}{trend filtering parameter (Default: \code{0.05})}
#'    \item{niter_max}{maximum number of iterations (Default: \code{1000})}
#'    \item{verbose}{Print updates? (Default: \code{FALSE})}
#'  }
#' @param kurt_quantile What cutoff quantile for kurtosis should be used? Only 
#'  applies if \code{("PCA_kurt" \%in\% projection)}. Default: \code{0.95}.
#' @param kurt_detrend Should the PCs be detrended before measuring kurtosis? 
#'  Only applies if \code{("PCA_kurt" \%in\% projection)}. Default: \code{TRUE}. 
#' 
#'  Detrending is highly recommended for time-series data, because trends can 
#'  induce high kurtosis even in the absence of outliers. Detrending should not
#'  be done with non-time-series data because the observations are not 
#'  temporally related.
#' @param id_outliers Should the outliers be identified? Default: \code{TRUE}.
#' @param lev_cutoff The outlier cutoff value for leverage, as a multiple of the
#'  median leverage. Only used if 
#'  \code{"leverage" \%in\% projection} and \code{id_outliers}. Default: 
#'  \code{4}, or \eqn{4 * median}.
#' @param rbd_cutoff The outlier cutoff quantile for MCD distance. Only used if 
#'  \code{"robdist" \%in\% projection} and \code{id_outliers}. Default: 
#'  \code{0.9999}, for the \eqn{0.9999} quantile.
#' 
#'  The quantile is computed from the estimated F distribution.
#' @param lev_images Should leverage images be computed? If \code{FALSE} memory
#'  is conserved. Default: \code{FALSE}.
#' @param verbose Should occasional updates be printed? Default: \code{FALSE}.
#'
#' @return A clever object, i.e. a list with components
#' \describe{
#'  \item{params}{A list of all the arguments used.}
#'  \item{projections}{
#'    \describe{
#'      \item{PC_var}{
#'        \describe{
#'          \item{indices}{The indices retained from the original SVD 
#'            projection to make the variance-based PC projection.} 
#'          \item{PCs}{The PC projection.}  
#'        }
#'      }
#'      \item{PC_kurt}{
#'        \describe{
#'          \item{indices}{The indices retained from the original SVD 
#'            projection to make the kurtosis-based PC projection. They are 
#'            ordered from highest kurtosis to lowest kurtosis.}  
#'          \item{PCs}{The PC projection. PCs are ordered in the standard
#'            way, from highest variance to lowest variance, instead of by 
#'            kurtosis.}  
#'        }
#'      }
#'      \item{PCATF}{
#'        \describe{
#'          \item{indices}{The indices of the trend-filtered PCs used to make the
#'            projection.}  
#'          \item{PCs}{The PCATF result.}  
#'        }
#'      }
#'    }
#'  }
#'  \item{outlier_measures}{
#'    \describe{
#'      \item{PC_var__lev}{The leverage values for the PC_var projection.}
#'      \item{PC_kurt__lev}{The leverage values for the PC_kurt projection.}
#'      \item{PCATF__lev}{The leverage values for the PCATF projection.}
#'      \item{PC_var__rbd}{The robust MCD distance values for the PC_var projection.}
#'      \item{PC_kurt__rbd}{The robust MCD distance values for the PC_kurt projection.}
#'      \item{DVARS_DPD}{The Delta percent DVARS values.}
#'      \item{DVARS_ZD}{The DVARS z-scores.}
#'    }
#'  }
#'  \item{outlier_cutoffs}{
#'    \describe{
#'      \item{lev}{The leverage cutoff for outlier detection: \code{lev_cutoff} times
#'        the median leverage.}
#'      \item{MCD}{The robust distance cutoff for outlier detection: the 
#'        \code{rbd_cutoff} quantile of the estimated F distribution.}
#'      \item{DVARS_DPD}{The Delta percent DVARS cutoff: +/- 5 percent}
#'      \item{DVARS_ZD}{The DVARS z-score cutoff: the one-sided 5 percent 
#'        significance level with Bonferroni FWER correction.}
#'    }
#'  }
#'  \item{outlier_flags}{
#'    \describe{
#'      \item{PC_var__leverage}{Logical vector indicating whether each observation surpasses the outlier cutoff.}
#'      \item{PC_kurt__leverage}{Logical vector indicating whether each observation surpasses the outlier cutoff.}
#'      \item{PCATF__leverage}{Logical vector indicating whether each observation surpasses the outlier cutoff.}
#'      \item{PC_var__robdist}{Logical vector indicating whether each observation surpasses the outlier cutoff.}
#'      \item{PC_kurt__robdist}{Logical vector indicating whether each observation surpasses the outlier cutoff.}
#'      \item{DVARS_DPD}{Logical vector indicating whether each observation surpasses the outlier cutoff.}
#'      \item{DVARS_ZD}{Logical vector indicating whether each observation surpasses the outlier cutoff.}
#'    }
#'  }
#'  \item{robdist_info}{
#'    \describe{
#'      \item{PC_var__robdist}{
#'        \describe{
#'          \item{inMCD}{Logical vector indicating whether each observation was in the MCD estimate.}
#'          \item{outMCD_scale}{The scale for out-of-MCD observations.}
#'          \item{Fparam}{Named numeric vector: c, m, df1, and df2.}
#'        }
#'      }
#'      \item{PC_kurt__robdist}{
#'        \describe{
#'          \item{inMCD}{Logical vector indicating whether each observation was in the MCD estimate.}
#'          \item{outMCD_scale}{The scale for out-of-MCD observations.}
#'          \item{Fparam}{Named numeric vector: c, m, df1, and df2.}
#'        }
#'      }
#'    }
#'  }
#'  \item{MCD_scale}{The scale value for out-of-MCD observations, and NA for
#'    in-MCD observations. NULL if \code{out_meas} does not include robust distance.}
#'  \item{lev_images}{
#'    \describe{
#'      \item{mean}{The average of the PC directions, weighted by the unscaled
#'        PC scores at each outlying time point (U[i,] * V^T). Row names are
#'        the corresponding time points.}
#'      \item{top}{The PC direction with the highest PC score at each outlying
#'        time point. Row names are the corresponding time points.}
#'      \item{top_dir}{The index of the PC direction with the highest PC score
#'        at each outlying time point. Named by timepoint.}
#'    }
#'  }
#' }
#'
#' @importFrom robustbase rowMedians
#' @import stats
#'
#' @export
#'
#' @examples
#' n_voxels = 1e4
#' n_timepoints = 100
#' X = matrix(rnorm(n_timepoints*n_voxels), ncol = n_voxels)
#'
#' clev = clever(X)
clever <- function(
  X,
  projection = "PCA_kurt",
  out_meas = "leverage",
  DVARS = TRUE,
  detrend_PCs = TRUE,
  PCATF_kwargs = NULL,
  kurt_quantile = .95,
  kurt_detrend = TRUE,
  id_outliers = TRUE,
  lev_cutoff = 4,
  rbd_cutoff = 0.9999,
  lev_images = FALSE,
  verbose = FALSE) {

  # ----------------------------------------------------------------------------
  # Check arguments. -----------------------------------------------------------
  # ----------------------------------------------------------------------------

  # The valid `projection` and `out_meas` choices, and their valid combinations.
  # (PCATF + robdist is not supported, so it is absent from the combos.)
  all_projection <- c("PCA_var", "PCA_kurt", "PCATF")
  all_out_meas <- c("leverage", "robdist")
  all_valid_methods <- c(
    "PCA_var__leverage", "PCA_kurt__leverage",
    "PCA_var__robdist", "PCA_kurt__robdist",
    "PCATF__leverage"
  )

  # Cutoff below which a voxel's MAD is treated as zero (constant voxel).
  TOL <- 1e-8

  # Coerce `X` to a numeric matrix.
  if (!is.matrix(X)) { X <- as.matrix(X) }
  class(X) <- "numeric"

  # Resolve the requested projection methods ("all" is a shortcut).
  if (identical(projection, "all")) {
    projection <- all_projection
  } else {
    projection <- match.arg(projection, all_projection, several.ok=TRUE)
  }

  # Resolve the requested outlyingness measures.
  if (identical(out_meas, "all")) {
    out_meas <- all_out_meas
  } else {
    out_meas <- match.arg(out_meas, all_out_meas, several.ok=TRUE)
  }

  # Validate the logical flags (each must be a single TRUE or FALSE).
  is.TRUEorFALSE <- function(x) { is.logical(x) && length(x) == 1 }
  stopifnot(is.TRUEorFALSE(DVARS))
  stopifnot(is.TRUEorFALSE(detrend_PCs))
  stopifnot(is.TRUEorFALSE(kurt_detrend))
  stopifnot(is.TRUEorFALSE(id_outliers))
  stopifnot(is.TRUEorFALSE(lev_images))
  stopifnot(is.TRUEorFALSE(verbose))

  # Validate `PCATF_kwargs`: resolve (possibly partial) names, then reject
  # duplicated arguments.
  if (!is.null(PCATF_kwargs)) {
    names(PCATF_kwargs) <- match.arg(
      names(PCATF_kwargs), c("K", "lambda", "niter_max", "TOL", "verbose"),
      several.ok=TRUE)
    # Bug fix: detect duplicated argument *names*. The old check compared the
    # argument *values* (`unique(unlist(PCATF_kwargs))`), so distinct kwargs
    # that happened to share a value (e.g. K=100, niter_max=100) were rejected.
    if (anyDuplicated(names(PCATF_kwargs)) > 0) {
      stop("Duplicate PCATF_kwargs were given.")
    }
  }

  stopifnot((kurt_quantile < 1) && (kurt_quantile > 0))

  # Leverage images are only computed at flagged outliers, so they require
  # outlier identification.
  if (lev_images && (!id_outliers)) {
    stop(
      "Invalid argument: computing leverage images requires\
      `id_outliers==TRUE`."
    )
  }

  stopifnot(is.numeric(lev_cutoff)); stopifnot(lev_cutoff > 0)

  stopifnot((rbd_cutoff > 0) && (rbd_cutoff < 1))

  # Get data dimensions: T_ observations (rows) x Npre_ variables (columns).
  Npre_ <- ncol(X)
  T_ <- nrow(X)
  if (Npre_ < T_) {
    warning(
      "Data matrix has more rows than columns. Check that observations\
      are in rows and variables are in columns."
    )
  }

  # Collect all the method combinations to compute.
  methods <- all_valid_methods[
    all_valid_methods %in% outer(projection, out_meas, paste, sep='__')
  ]
  if (length(methods) < 1) {
    stop(
      "No valid method combinations. Check that `projection` and\
      `out_meas` are compatible.\n"
    )
  }
  # Pre-allocate one result slot per method combination.
  outlier_measures <- outlier_lev_imgs <- setNames(
    vector("list", length(methods)), methods)
  if (id_outliers) {
    outlier_cutoffs <- outlier_flags <- setNames(
      vector("list", length(methods)), methods)
  }
  robdist_info <- vector("list")

  # ----------------------------------------------------------------------------
  # Center and scale the data. -------------------------------------------------
  # Do it here instead of calling `scale_med` to save memory. ------------------
  # ----------------------------------------------------------------------------

  if (verbose) { cat("Centering and scaling the data matrix.\n") }
  # Transpose so voxels are in rows (`rowMedians` operates over rows).
  X <- t(X)
  # Center each voxel at its median.
  X <- X - c(rowMedians(X, na.rm=TRUE))
  # Scale each voxel by its MAD (1.4826 makes the MAD consistent with the
  # standard deviation under normality).
  mad <- 1.4826 * rowMedians(abs(X), na.rm=TRUE)
  const_mask <- mad < TOL
  if (any(const_mask)) {
    if (all(const_mask)) {
      stop("All voxels are zero-variance.\n")
    } else {
      warning(paste0("Warning: ", sum(const_mask),
        " constant voxels (out of ", length(const_mask),
        "). These will be removed for estimation of the covariance.\n"))
    }
  }
  # Drop constant voxels before scaling, to avoid division by ~zero.
  mad <- mad[!const_mask]
  X <- X[!const_mask,]
  X <- X/c(mad)
  # Revert transpose.
  X <- t(X)
  N_ <- ncol(X)

  # ----------------------------------------------------------------------------
  # Compute DVARS. -------------------------------------------------------------
  # ----------------------------------------------------------------------------

  if (DVARS) {
    if (verbose) { cat("Computing DVARS.\n") }
    # NOTE: `DVARS` in call position resolves to the package's DVARS()
    # function, not the logical argument (R skips non-function bindings when
    # looking up a name used as a function).
    X_DVARS <- DVARS(X, normalize=FALSE, norm_I=100, verbose=verbose)

    outlier_measures$DVARS_DPD <- X_DVARS$DPD
    outlier_measures$DVARS_ZD <- X_DVARS$ZD

    if (id_outliers) {
      # Cutoffs following Afyouni & Nichols (2017): 5% Delta percent DVARS,
      # and a one-sided 5% z-score level with Bonferroni correction over T_.
      outlier_cutoffs$DVARS_DPD <- 5
      outlier_cutoffs$DVARS_ZD <- qnorm(1-.05/T_)

      outlier_flags$DVARS_DPD <- X_DVARS$DPD > outlier_cutoffs$DVARS_DPD
      outlier_flags$DVARS_ZD <- X_DVARS$ZD > outlier_cutoffs$DVARS_ZD
      outlier_flags$DVARS_both <- outlier_flags$DVARS_DPD & outlier_flags$DVARS_ZD
    }
  }

  # ----------------------------------------------------------------------------
  # Make projections. ----------------------------------------------------------
  # ----------------------------------------------------------------------------

  # PC directions (V) are only needed for leverage images or PCATF.
  solve_dirs <- lev_images || ("PCATF" %in% projection)
  if (verbose) {
    cat(paste0(
      "Computing the",
      if (("PCA_var" %in% projection) || ("PCA_kurt" %in% projection)) {
        if ("PCATF" %in% projection) " normal and trend-filtered" else ""
      } else {
        if ("PCATF" %in% projection) " trend-filtered" else "INTERNAL ERROR"
      },
      " PC scores",
      if (solve_dirs) " and directions" else "", ".\n"
    ))
  }
  if (solve_dirs) {
    X.svd <- svd(X)
    if (!("PCATF" %in% projection)) { rm(X) }
  } else {
    # Conserve memory: the SVD of the much smaller T_ x T_ matrix XX^T yields
    # the same scores U, and the singular values after a square root.
    XXt <- tcrossprod(X)
    if (!("PCATF" %in% projection)) { rm(X) }
    X.svd <- svd(XXt)
    rm(XXt)
    X.svd$d <- sqrt(X.svd$d)
    X.svd$v <- NULL
  }

  # Compute PCATF, if requested.
  if ("PCATF" %in% projection) {
    X.svdtf <- do.call(
      PCATF,
      c(list(X=X, X.svd=X.svd, solve_directions=solve_dirs), PCATF_kwargs)
    )
    # The PC directions were needed to compute PCATF. If leverage images
    #   are not wanted, we can now delete the directions to save space.
    if (!lev_images) { X.svd$v <- NULL }

    # Remove trend-filtered PCs with constant scores.
    # TO DO: Reconsider adding this back?
    tf_zero_var <- apply(X.svdtf$u, 2, var) < TOL
    if (any(tf_zero_var)) {
      if (all(tf_zero_var)) {
        stop("Error: All trend-filtered PC scores are zero-variance.")
      }
      # warning(paste(
      #   "Warning:", sum(tf_zero_var),
      #   "trend-filtered PC scores are zero-variance. Removing these PCs.\n"
      # ))
      # X.svdtf$u <- X.svdtf$u[,!tf_zero_var]
      # X.svdtf$d <- X.svdtf$d[!tf_zero_var]
      # if(lev_images){ X.svdtf$v <- X.svdtf$v[,!tf_zero_var] }
    }
  }
  gc()

  # Choose which PCs to retain for each projection.
  projection <- setNames(vector("list", length(projection)), projection)
  for (ii in seq_along(projection)) {
    proj_ii_name <- names(projection)[ii]

    if (verbose) {
      cat(switch(proj_ii_name,
        PCA_var = "Identifying the PCs with high variance.\n",
        PCA_kurt = "Identifying the PCs with high kurtosis.\n",
        PCATF = "Identifying the trend-filtered PCs with high variance.\n"
      ))
    }

    # Identify the indices to keep.
    chosen_PCs <- switch(proj_ii_name,
      PCATF = seq_len(ncol(X.svdtf$u)),
      PCA_var = choose_PCs.variance(svd=X.svd),
      PCA_kurt = choose_PCs.kurtosis(svd=X.svd, kurt_quantile=kurt_quantile, detrend=kurt_detrend)
    )
    # Kurtosis ranking order differs from index order: sort so that the
    # retained columns stay in standard (descending-variance) order.
    chosen_PCs_ordered <- chosen_PCs[order(chosen_PCs)]

    # Get the PC subset. Detrend if `detrend_PCs`.
    proj_ii_svd <- switch(proj_ii_name,
      PCATF = X.svdtf,
      PCA_var = X.svd,
      PCA_kurt = X.svd
    )
    proj_ii_svd$u <- proj_ii_svd$u[,chosen_PCs_ordered,drop=FALSE]
    proj_ii_svd$d <- proj_ii_svd$d[chosen_PCs_ordered]
    if (detrend_PCs && proj_ii_name != "PCATF") {
      # PCATF scores are already smoothed by trend filtering, so skip them.
      proj_ii_svd$u_detrended <- proj_ii_svd$u - apply(proj_ii_svd$u, 2, est_trend)
      attributes(proj_ii_svd$u_detrended)$dimnames <- NULL
    }
    if (lev_images) { proj_ii_svd$v <- proj_ii_svd$v[,chosen_PCs_ordered,drop=FALSE] }
    projection[[proj_ii_name]] <- list(indices=chosen_PCs, svd=proj_ii_svd)
  }

  # ----------------------------------------------------------------------------
  # For each method... ---------------------------------------------------------
  # ----------------------------------------------------------------------------

  for (ii in seq_along(methods)) {

    # --------------------------------------------------------------------------
    # Measure outlyingness. ----------------------------------------------------
    # --------------------------------------------------------------------------

    method_ii <- methods[ii]
    method_ii_split <- unlist(strsplit(method_ii, "__"))
    proj_ii_name <- method_ii_split[1]; out_ii_name <- method_ii_split[2]
    proj_ii <- projection[[proj_ii_name]]

    if (verbose) { cat(paste0("Method ", method_ii, ":\n")) }

    # Adjust PC number if using robust distance.
    if (out_ii_name == "robdist") {
      # Let q <- N_/T_ (ncol(U)/nrow(U)). robustbase::covMcd()
      #  requires q <= approx. 1/2 for computation of the MCD covariance estimate.
      #  Higher q will use more components for estimation, thus retaining a
      #  higher resolution of information. Lower q will have higher breakdown
      #  points, thus being more resistant to outliers. (The maximal breakdown
      #  value is (N_ - T_ + 2)/2.) Here, we select q = 1/3 to yield a breakdown
      #  value of approx. 1/3.
      #
      # For computational feasibility, we also limit the total number of PCs to
      # 100.
      q <- 1/3
      max_keep <- min(100, ceiling(T_*q))
      if (max_keep < length(proj_ii$indices)) {
        # Gate this progress message on `verbose`, consistent with all other
        # progress messages in this function.
        if (verbose) {
          cat(paste(
            "\treducing number of PCs from", length(proj_ii$indices), "to",
            max_keep, "\n"
          ))
        }

        # Identify the indices to keep.
        if (proj_ii_name == "PCA_kurt") {
          # Keep the `max_keep` lowest-index (highest-variance) chosen PCs.
          max_idx <- sort(proj_ii$indices)[max_keep]
          proj_ii$indices <- proj_ii$indices[proj_ii$indices <= max_idx]
        } else {
          proj_ii$indices <- proj_ii$indices[1:max_keep]
        }

        # Take that SVD subset (columns are already in index order).
        proj_ii$svd$u <- proj_ii$svd$u[,1:max_keep,drop=FALSE]
        proj_ii$svd$d <- proj_ii$svd$d[1:max_keep]
        if (detrend_PCs && proj_ii_name != "PCATF") {
          proj_ii$svd$u_detrended <- proj_ii$svd$u_detrended[,1:max_keep,drop=FALSE]
        }
        # Bug fix: subset the directions only when they were retained
        # (`lev_images`). Under the old `solve_dirs` condition, `v` may have
        # already been deleted (it is NULL-ed when PCATF is requested without
        # leverage images), causing an indexing error here.
        if (lev_images) {
          proj_ii$svd$v <- proj_ii$svd$v[,1:max_keep,drop=FALSE]
        }
      }
    }

    # Compute the outlyingness measure on the (optionally detrended) scores.
    U_meas <- if (detrend_PCs && proj_ii_name != "PCATF") "u_detrended" else "u"
    U_meas <- proj_ii$svd[[U_meas]]
    meas_ii <- switch(out_ii_name,
      leverage = PC.leverage(U_meas),
      robdist = PC.robdist(U_meas)
    )

    # Extract in-MCD and F distribution information from the robust distances.
    if (out_ii_name == "robdist") {
      rbd_info_ii <- list(
        inMCD=meas_ii$inMCD,
        outMCD_scale=meas_ii$outMCD_scale,
        Fparam=c(meas_ii$Fparam$c, meas_ii$Fparam$m, meas_ii$Fparam$df[1], meas_ii$Fparam$df[2])
      )
      names(rbd_info_ii$Fparam) <- c("c", "m", "df1", "df2")
      meas_ii <- meas_ii$mah
      robdist_info[[method_ii]] <- rbd_info_ii
    }

    outlier_measures[[method_ii]] <- meas_ii

    # --------------------------------------------------------------------------
    # Identify outliers and make leverage images. ------------------------------
    # --------------------------------------------------------------------------

    # Identify outliers.
    if (id_outliers) {
      if (verbose) { cat("...identifying outliers.\n") }

      cut_ii <- switch(out_ii_name,
        leverage = outs.leverage(meas_ii, median_cutoff=lev_cutoff),
        robdist = outs.robdist(
          meas_ii,
          rbd_info_ii$Fparam[["df1"]], rbd_info_ii$Fparam[["df2"]],
          rbd_info_ii$inMCD, rbd_info_ii$outMCD_scale, rbd_cutoff
        )
      )
      outlier_cutoffs[[method_ii]] <- cut_ii$cut
      outlier_flags[[method_ii]] <- cut_ii$flag
    }

    # Make leverage images. (`lev_images` implies `id_outliers` — enforced in
    # the argument checks — so `cut_ii` is guaranteed to exist here.)
    if (lev_images) {
      if (sum(cut_ii$flag) > 0) {
        if (verbose) {
          cat("...outliers detected. Computing leverage images.\n")
        }
        outlier_lev_imgs[[method_ii]] <- get_leverage_images(
          proj_ii$svd, which(cut_ii$flag), const_mask
        )
      } else {
        if (verbose) {
          cat("...no outliers detected. Skipping leverage images.\n")
        }
        outlier_lev_imgs[[method_ii]] <- list(mean=NULL, top=NULL, top_dir=NULL)
      }
    }
  }

  # ----------------------------------------------------------------------------
  # Format output. -------------------------------------------------------------
  # ----------------------------------------------------------------------------

  # Conserve memory.
  gc()

  if (length(robdist_info) < 1) { robdist_info <- NULL }

  # Organize the output.
  if (verbose) { cat("Done! Organizing results.\n") }
  result <- list(
    params = NULL,
    projections = projection,
    outlier_measures = outlier_measures
  )
  if ("robdist" %in% out_meas) {
    result$robdist_info <- robdist_info
  }
  if (id_outliers) {
    result$outlier_cutoffs <- outlier_cutoffs
    result$outlier_flags <- outlier_flags
  }
  if (lev_images) { result$outlier_lev_imgs <- outlier_lev_imgs }
  # Record the arguments used (after resolving the "all" shortcuts).
  result$params <- list(
    projection = names(projection),
    out_meas = out_meas,
    DVARS = DVARS,
    detrend_PCs = detrend_PCs,
    PCATF_kwargs = PCATF_kwargs,
    kurt_quantile = kurt_quantile,
    kurt_detrend = kurt_detrend,
    id_outliers = id_outliers,
    lev_cutoff = lev_cutoff,
    rbd_cutoff = rbd_cutoff,
    lev_images = lev_images,
    verbose = verbose
  )

  structure(result, class="clever")
}
# muschellij2/clever documentation built on Sept. 26, 2020, 3:54 p.m.