R/rcur.R
In rsvd: Randomized Singular Value Decomposition

Documented in rcur

#' @title  Randomized CUR matrix decomposition.
#
#' @description Randomized CUR matrix decomposition.
#
#' @details
#' Algorithm for computing the CUR matrix decomposition of a rectangular \eqn{(m, n)} matrix \eqn{A}, 
#' with target rank \eqn{k << min(m,n)}. The input matrix is factored as 
#' 
#' \deqn{A = C * U * R} 
#' 
#' using the \code{\link{rid}} decomposition. The factor matrix \eqn{C} is formed using actual 
#' columns of \eqn{A}, also called the partial column skeleton. The factor matrix \eqn{R} is formed 
#' using actual rows of \eqn{A}, also called the partial row skeleton.
#' 
#' If \eqn{rand=TRUE} a probabilistic strategy is used to compute the decomposition, otherwise a
#' deterministic algorithm is used. 
#'
#'
#' @param A   array_like; \cr
#'            numeric \eqn{(m, n)} input matrix (or data frame). \cr
#'            If the data contain \eqn{NA}s na.omit is applied.
#'            
#' @param k   integer; \cr
#'            target rank of the low-rank approximation, i.e., the number of columns/rows
#'            to be selected. It is required that \eqn{k} is smaller or equal to \eqn{min(m,n)}.
#'        
#' @param p   integer, optional; \cr
#'            oversampling parameter (default \eqn{p=10}).
#'
#' @param q   integer, optional; \cr
#'            number of additional power iterations (default \eqn{q=0}).
#'
#' @param idx_only  bool, optional; \cr
#'              if (\eqn{TRUE}), only the index set \code{C.idx} and \code{R.idx} is returned, but not 
#'              the matrices \code{C} and \code{R}. 
#'              This is more memory efficient, when dealing with large-scale data. 
#'              
#' @param rand  bool, optional; \cr
#'              if (\eqn{TRUE}), a probabilistic strategy is used, otherwise a deterministic algorithm is used.                
#'
#'
#' @return \code{rcur} returns a list with class \eqn{id} containing the following components:
#'\describe{
#'    \item{C}{ array_like; \cr
#'              column subset \eqn{C = A[,C.idx]}; \eqn{(m, k)} dimensional array.
#'    }
#'
#'    \item{R}{ Array_like. \cr
#'               row subset \eqn{R = A[R.idx, ]}; \eqn{(k, n)} dimensional array.
#'    }
#'    
#'    \item{U}{ array_like; \cr
#'             connector matrix; \eqn{(k,k)} dimensional array.
#'    }
#'    
#'    \item{C.idx}{ array_like; \cr
#'                index set of the \eqn{k} selected columns used to form \eqn{C}. 
#'    }   
#' 
#'    \item{R.idx}{ array_like; \cr
#'                index set of the \eqn{k} selected rows used to form \eqn{R}. 
#'    }   
#'       
#'      
#'    \item{C.scores}{ array_like; \cr
#'                   scores of the selected columns.
#'    } 
#' 
#'    \item{R.scores}{ array_like; \cr
#'                   scores  of the selected rows.
#'    }                   
#'}                                                       
#'
#' @references
#' \itemize{
#'  \item [1] N. B. Erichson, S. Voronin, S. L. Brunton and J. N. Kutz. 2019.
#'        Randomized Matrix Decompositions Using {R}. 
#'        Journal of Statistical Software, 89(11), 1-48.
#'         \doi{10.18637/jss.v089.i11}.
#' 
#'   \item  [2] N. Halko, P. Martinsson, and J. Tropp.
#'          "Finding structure with randomness: probabilistic
#'          algorithms for constructing approximate matrix
#'          decompositions" (2009).
#'          (available at arXiv \url{https://arxiv.org/abs/0909.4061}).
#'}
#'
#'
#' @author N. Benjamin Erichson, \email{erichson@berkeley.edu}
#'
#' @seealso \code{\link{rid}}
#'
#'
#'
#' @examples
#' \dontrun{
#' # Load test image
#' data('tiger')
#'
#' # Compute (column) randomized interpolative decompsition
#' # Note that the image needs to be transposed for correct plotting
#' out <- rcur(tiger, k = 150)
#'
#' # Reconstruct image
#' tiger.re <- out$C %*% out$U %*% out$R
#'
#' # Compute relative error
#' print(norm(tiger-tiger.re, 'F') / norm(tiger, 'F')) 
#'
#' # Plot approximated image
#' image(tiger.re, col = gray((0:255)/255))
#' }

#' @export
rcur <- function(A, k = NULL, p = 10, q = 0, idx_only = FALSE, rand = TRUE) UseMethod("rcur")

#' @export
rcur.default <- function(A, k = NULL, p = 10, q = 0, idx_only = FALSE, rand = TRUE) {

  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # Checks
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  if (any(is.na(A))) {
    warning("Missing values are omitted: na.omit(A).")
    A <- stats::na.omit(A)
  }     

  m <- nrow(A)
  n <- ncol(A)  
  
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # Init id object
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  rcurObj = list( C = NULL,
                R = NULL,
                U = NULL,
                C.idx = NULL,
                R.idx = NULL,
                C.scores = NULL,
                R.scores = NULL,
                rand = rand)  
  

  
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # Set target rank
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  if(is.null(k)) k <- min(n,m)
  if(k > min(m,n)) k <- min(m,n)
  if(k < 1) stop("Target rank is not valid!")
  
  
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  #Compute column ID
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
  out_cid <- rid(A, k = k, p = p, q = q, mode = 'column', idx_only = TRUE, rand = rand)
  
  # Select column subset
  if(idx_only == FALSE) {
    rcurObj$C <- matrix(A[, out_cid$idx], nrow = m, ncol = k)    
    colnames(rcurObj$C) <- colnames(A)[out_cid$idx]
    rownames(rcurObj$C) <- rownames(A)
  } 
  
  rcurObj$C.idx <- out_cid$idx
  rcurObj$C.scores <- out_cid$scores

  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  #Compute row ID of C
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  #out_rid <- rid(A[, rcurObj$C.idx], k = k, p = p, q = q, mode = 'row', idx_only = TRUE, rand = FALSE)
  out <- qr(H(A[, rcurObj$C.idx]), LAPACK=TRUE) 
  
  # Get index set
  rcurObj$R.idx <- out$pivot[1:k] # Get row set

  # Get R =: S
  S <- qr.R( out ) 
  #Q <- qr.Q( out ) 
  
  # Select row subset
  if(idx_only == FALSE) {
    rcurObj$R <- matrix(A[rcurObj$R.idx,], nrow = k, ncol = n) 
    rownames(rcurObj$R) <- rownames(A)[rcurObj$R.idx]
    colnames(rcurObj$R) <- colnames(A)
  } 
  
  rcurObj$R.scores <- abs(diag(S))[1:k]

  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  #Compute U
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  #rcurObj$U = Z %*% MASS::ginv(matrix(A[rcurObj$R.idx, ], nrow = k, ncol = n) )    
  rcurObj$U = out_cid$Z %*% pinv(matrix(A[rcurObj$R.idx, ], nrow = k, ncol = n))

  #colinv = matrix(0, 1 , n)
  #colinv[out_cid$pivot] = 1:n
  
  #V = rbind(diag(k), H(out_cid$Z))[colinv,]
  
  # Rt*Ut = V via RRt*Ut = R*V
  #RV = rcurObj$R %*% V
  #rcurObj$U = H(pinv(rcurObj$R %*% H(rcurObj$R)) %*% RV)

  # Rt*Ut = V
  #rcurObj$U = H(pinv(H(rcurObj$R)) %*% V)

  
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # Return
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  class(rcurObj) <- "rcur"
  return( rcurObj )
}