R/optimal_SVHT_coef.R

Defines functions marcenko_pastur_integral median_marcenko_pastur inc_mar_pas optimal_SVHT_coef_sigma_known optimal_SVHT_coef_sigma_unknown optimal_SVHT_coef sv_threshold

Documented in optimal_SVHT_coef sv_threshold

# Thanks to RobertGM111 at
# https://github.com/RobertGM111/havok/blob/master/R/optimal_SVHT_coef.R
# Version April 6, 2020


#' Singular value thresholding
#'
#' Singular value thresholding evaluates the optimal number of singular values to retain
#'
#' @param n number of samples
#' @param p number of features
#' @param d singular values
#'
#' @return Number of singular values to retain
#'
#' @references Gavish, M., & Donoho, D. L. (2014). The optimal hard threshold for singular values is 4/sqrt(3). IEEE Transactions on Information Theory, 60(8), 5040-5053.
#'
#' @examples
#' # simulate data
#'
#' n <- 500
#' p <- 5000
#' Y <- Rfast::matrnorm(n, p, seed = 1)
#'
#' # SVD
#' dcmp <- svd(Y)
#'
#' # how many components to retain
#' sv_threshold(n, p, dcmp$d)
#'
#' # in this case the data has no structure, so no components are retained
#'
#' @importFrom stats median
#' @export
sv_threshold <- function(n, p, d) {
  # get cutoff
  cutoff <- (optimal_SVHT_coef(min(p / n, n / p), 0) * median(d))

  # number of SV exceeding cutoff
  sum(d >= cutoff)
}


#' Optimal Hard Threshold for Singular Values
#'
#' @description A function for the calculation of the coefficient determining optimal
#' location of hard threshold for matrix denoising by singular values hard thresholding
#' when noise level is known or unknown. Recreation of MATLAB code by Matan Gavish and
#' David Donoho.
#' @param beta A single value or a vector that represents aspect ratio m/n of the matrix to
#' be denoised. 0<\code{beta}<=1.
#' @param sigma_known A logical value. TRUE if noise level known, FALSE if unknown.
#' @return  Optimal location of hard threshold, up the median data singular value (sigma
#' unknown) or up to \code{sigma*sqrt(n)} (sigma known); a vector of the same dimension
#' as \code{beta}, where \code{coef[i]} is the coefficient corresponding to \code{beta[i]}.
#' @references Gavish, M., & Donoho, D. L. (2014). The optimal hard threshold for singular
#' values is 4/sqrt(3). IEEE Transactions on Information Theory, 60(8), 5040-5053.
# @examples # Usage in known noise level: # Given an m-by-n matrix \code{Y}
# known to be low rank and observed in white noise # with mean zero and known
# variance \code{sigma^2}, form a denoised matrix \code{Xhat} by: Y <-
# Rfast::matrnorm(1000, 100) sigma <- .2 USV <- svd(Y) y <- USV$d y[y <
# (optimal_SVHT_coef(m/n,1) * sqrt(n) * sigma) ] <- 0 Xhat <- USV$u * diag(y) *
# t(USV$v) # Given an m-by-n matrix \code{Y} known to be low rank and observed
# in white # noise with mean zero and unknown variance, form a denoised matrix
# \code{Xhat} by: USV <- svd(Y) y <- USV$d y[y < (optimal_SVHT_coef(m/n,0) *
# median(y)) ] <- 0 Xhat <- USV$u * diag(y) * t(USV$v)
optimal_SVHT_coef <- function(beta, sigma_known = FALSE) {
  if (sigma_known == TRUE) {
    coef <- optimal_SVHT_coef_sigma_known(beta)
  } else {
    coef <- optimal_SVHT_coef_sigma_unknown(beta)
  }
}

optimal_SVHT_coef_sigma_unknown <- function(beta) {
  coef.unknown <- optimal_SVHT_coef_sigma_known(beta)
  MPmedian <- rep(0, length(beta))
  for (i in seq_len(length(beta))) {
    MPmedian[i] <- median_marcenko_pastur(beta[i])
  }
  omega <- coef.unknown / sqrt(MPmedian)
  return(omega)
}

optimal_SVHT_coef_sigma_known <- function(beta) {
  w <- (8 * beta) / (beta + 1 + sqrt(beta^2 + 14 * beta + 1))
  lambda_star <- sqrt(2 * (beta + 1) + w)
  return(lambda_star)
}

inc_mar_pas <- function(x0, beta, gamma) {
  if (beta > 1) {
    stop("Beta must be <= 1")
  }
  topSpec <- (1 + sqrt(beta))^2
  botSpec <- (1 - sqrt(beta))^2
  MarPas <- function(x) {
    ifelse((topSpec - x) * (x - botSpec) > 0, sqrt((topSpec -
      x) * (x - botSpec)) / (beta * x) / (2 * pi), 0)
  }
  if (gamma != 0) {
    fun <- function(x) (x^gamma * MarPas(x))
  } else {
    fun <- function(x) MarPas(x)
  }
  Int <- stats::integrate(fun, x0, topSpec)
  return(Int$value)
}

median_marcenko_pastur <- function(beta) {
  MarPas <- function(x) 1 - inc_mar_pas(x, beta, 0)
  lobnd <- (1 - sqrt(beta))^2
  hibnd <- (1 + sqrt(beta))^2
  change <- 1
  while (change & (hibnd - lobnd > 0.001)) {
    change <- 0
    x <- seq(lobnd, hibnd, length.out = 5)
    y <- rep(NA, length(x))
    for (i in seq_len(length(x))) {
      y[i] <- MarPas(x[i])
    }
    if (any(y < 0.5)) {
      lobnd <- max(x[which(y < 0.5)])
      change <- 1
    }
    if (any(y > 0.5)) {
      hibnd <- min(x[which(y > 0.5)])
      change <- 1
    }
  }
  med <- (hibnd + lobnd) / 2
  return(med)
}

marcenko_pastur_integral <- function(x, beta) {
  if (beta <= 0 | beta > 1) {
    stop("beta must be > 0 and <=1")
  }
  lobnd <- (1 - sqrt(beta))^2
  hibnd <- (1 + sqrt(beta))^2
  if (x < lobnd | x > hibnd) {
    stop("X is out of bounds")
  }
  dens <- function(t) sqrt((hibnd - t) * (t - lobnd)) / (2 * pi * beta * t)
  Int <- stats::integrate(dens, lobnd, x)
  # print(cat('\r', paste('x=', x, ' beta=', beta, ' I=', round(Int$value,
  # 5), sep = '')))
  return(Int$value)
}




# Copyright 2020 Robert Glenn Moulder Jr. & Elena Martynova Licensed under the
# Apache License, Version 2.0 (the 'License'); you may not use this file except
# in compliance with the License.  You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law
# or agreed to in writing, software distributed under the License is
# distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the specific language
# governing permissions and limitations under the License.

Try the decorrelate package in your browser

Any scripts or data that you put into this service are public.

decorrelate documentation built on Aug. 8, 2025, 7:55 p.m.