R/opnmfR.R

Defines functions opnmfR_nndsvd opnmfR_init opnmfR_ranksel_perm opnmfR_ranksel_splithalf_select opnmfR_ranksel_splithalf opnmfR_cosine_similarity opnmfR_reconstruct_W opnmfR_reconstruct_H opnmfR_ranksel_ooser chunk opnmfR_mse opnmfR_postprocess opnmfR opnmfRcpp opnmfR_compare_nmf opnmfR_test_ranksel_synthetic opnmfR_test_ranksel opnmfR_test

Documented in opnmfR opnmfR opnmfRcpp opnmfR_ranksel_perm opnmfR_ranksel_splithalf opnmfR_test opnmfR_test_ranksel opnmfR_test_ranksel_synthetic

# partly based on `brainparts`: https://github.com/asotiras/brainparts

#' opnmfR: A package for orthonormal projective non-negative matrix factorization
#'
#' This package provides functions for opnmf factorization, initialization, and 
#' rank election using R and Rcpp code
#' @docType package
#' @name opnmfR
#' @import Rcpp NMF lpSolve aricode
#' @useDynLib opnmfR, .registration=TRUE
NULL
#> NULL

#' opnmf test
#'
#' @param X A matrix, if NULL the "iris" data is used (default NULL)
#' @param y A vector with class labels same length as nrow of X, if X=NULL then iris$Species is used (default NULL)
#' @param r A number, rank to use (default 2)
#' @param W0 A string or matrix for initialization (default "nndsvd")
#' @param ... additional parameters passed to \code{opnmfR} and \code{opnmfRcpp}
#' @return A list with factorization results using R and Rcpp function calls
#' @examples
#' result <- opnmfR_test()
#' @export
opnmfR_test <- function(X=NULL, y=NULL, r=2, W0="nndsvd", ...) {
  if(is.null(X)) {
    data("iris")
    X <- data.matrix(iris[,1:4])
    y <- iris$Species
  }
  
  if(is.null(y)) {
    y <- rep(1, nrow(X))
  } else {
    stopifnot(length(y)==nrow(X))
  }
  
  # using R
  cat("opnmfR test ")
  start_time <- Sys.time()
  nn <- opnmfR(X, r=r, W0=W0, ...)
  end_time <- Sys.time()
  show(end_time - start_time)
  show(nn$H)
  if(r>=2) {
    plot(nn$W[,1], nn$W[,2], col=y, xlab="Factor 1", ylab="Factor 2")
  }
  
  # using rcpp
  cat("opnmfRcpp test ")
  start_time <- Sys.time()
  nncpp <- opnmfRcpp(X, r=r, W0=W0, ...)
  end_time <- Sys.time()
  show(end_time - start_time)
  show(nn$H)
  
  par(mfrow=c(1,2))
  image(nn$W, main="W")
  image(nn$H, main="H")
  par(mfrow=c(1,1))
  
  return(list(nn=nn, nncpp=nncpp))
}

#' Rank selection test on user provided data
#' Runs all three avilable rank selection methods.
#' @param X A matrix, if NULL the "iris" data is used (default NULL)
#' @param rs A vector of ranks to test for selection, 
#' if rs=NULL then \code{1:nrow(X)} is used (default NULL)
#' @param W0 A string or matrix for initialization (default "nndsvd")
#' @param nrepeat A number, number of iterations for rank selection, 
#' i.e. number of permutations for \code{opnmfR_ranksel_perm} 
#' and number of split-halves for \code{opnmfR_ranksel_splithalf} (default 1)
#' @return A list with rank selection outputs from \code{opnmfR_ranksel_perm}, \code{opnmfR_ranksel_ooser}, and
#' \code{opnmfR_ranksel_splithalf}
#' @seealso \code{opnmfR_ranksel_perm}, \code{opnmfR_ranksel_ooser}, and 
#' \code{opnmfR_ranksel_splithalf}
#' @examples
#' result <- opnmfR_test_ranksel()
#' @export
opnmfR_test_ranksel <- function(X=NULL, rs=NULL, W0="nndsvd", nrepeat=1) {
  if(is.null(X)) {
    data("iris")
    cat("opnmfR ranksel ")
    start_time <- Sys.time()
    X <- data.matrix(iris[,1:4])
    X <- t(cbind(X, X)) # duplicate columns and transpose, i.e. factorize features
  }
  
  if(is.null(rs)) rs <- 1:nrow(X)
  
  perm <- opnmfR_ranksel_perm(X, rs, W0=W0, nperm=nrepeat)
  ooser <- opnmfR_ranksel_ooser(X, rs, W0=W0)
  splithalf <- opnmfR_ranksel_splithalf(X, rs, W0=W0, nrepeat=nrepeat)
  
  return(list(perm=perm, ooser=ooser, splithalf=splithalf))
}

#' Rank selection test on synthetic data
#' Runs all three avilable rank selection methods.
#' @param n A number, number of rows (default 100)
#' @param r A number, true rank (default 10)
#' @param p A number, number of columns (default 100)
#' @param rs A vector with  the ranks  you wish to test for selection, 
#' if rs=NULL then \code{1:min(nrow(X), round(r+r*0.5))} is used (default NULL)
#' @param W0 A string or matrix for initialization (default "nndsvd")
#' @param nrepeat A number, number of iterations for rank selection, 
#' i.e. number of permutations for \code{opnmfR_ranksel_perm} 
#' and number of split-halves for \code{opnmfR_ranksel_splithalf} (default 1)
#' @return A list with rank selection outputs from \code{opnmfR_ranksel_perm}, \code{opnmfR_ranksel_ooser}, and
#' \code{opnmfR_ranksel_splithalf}
#' @seealso \code{opnmfR_ranksel_perm}, \code{opnmfR_ranksel_ooser}, and 
#' \code{opnmfR_ranksel_splithalf}
#' @examples
#' result <- opnmfR_test_ranksel_synthetic()
#' @export
opnmfR_test_ranksel_synthetic <- function(n=100, r=10, p=100, rs=NULL, W0="nndsvd", nrepeat=1) {
  stopifnot(r <= 20) # keep the test computable
  stopifnot(r < n && r < p)
  X <- syntheticNMF(n,r,p)
  if(is.null(rs)) rs <- 1:min(nrow(X), round(r+r*0.5))

  perm <- opnmfR_ranksel_perm(X, rs, W0=W0, nperm=nrepeat, rtrue=r)
  ooser <- opnmfR_ranksel_ooser(X, rs, W0=W0, rtrue=r)
  splithalf <- opnmfR_ranksel_splithalf(X, rs, W0=W0, nrepeat=nrepeat, rtrue=r)
  
  return(list(perm=perm, ooser=ooser, splithalf=splithalf))
}

#' @export
opnmfR_compare_nmf <- function(n=100, r=10, p=100, nmfalgo="snmf/r", rs=NA) {
  stopifnot(r < n && r < p)
  if(is.na(rs)) rs <- r
  stopifnot(rs < n && rs < p)
  X <- syntheticNMF(n,r,p)
  sim_cosine <- rep(NA, r)
  ari <- rep(NA, r)
  for(rr in 1:rs) {
    fac1 = nmf(X, rr, nmfalgo)
    fac2 = opnmfR(X, rr)
    sim <- opnmfR_cosine_similarity(t(fac1@fit@W), t(fac2$W))
    lp <- lp.assign(sim, direction = "max")
    sim_cosine[rr] <- lp$objval/rr
    if(rr > 1) {
      ari[rr] <- ARI(apply(fac1@fit@W,1,which.max), apply(fac2$W,1,which.max))
    }
  }
  par(mfrow=c(1,2))
  plot(sim_cosine, type="b", xlab="Rank", ylab="Similarity (cosine)", main=paste("NMF", nmfalgo), pch=16)
  abline(v=r, lty=2, col="gray")
  plot(ari, type="b", xlab="Rank", ylab="Similarity (ARI)", main=paste("NMF", nmfalgo), pch=17)
  abline(v=r, lty=2, col="gray")
  par(mfrow=c(1,1))
  
  return(list(sim_cosine=sim_cosine, ari=ari))
}

#' Orthogonal Projective Non-Negative Factorization
#' utilising R C++ integration for improved computational speed. 
#' Implementation is partly based on brainparts: https://github.com/asotiras/brainparts 
#' @param X A matrix, rows are the number of features and columns are number of samples
#' @param r A number, rank used for factorization.
#' @param W0 A string or matrix for initialization (default NULL)
#' @param max.iter A number, number of iterations before stopping (default 50000) 
#' @param tol A number, convergence tolerance (default 1e-5)  
#' @param memsave A logical, if TRUE update rule is modified to better deal with high
#' dimensional data (default TRUE)
#' @param eps A number, (default 1e-16)
#' @return A list containing the approximation matrices W and H, of input matrix
#' X; the interation number at which a solution was found; 
#' the reconstruction error 
#' @examples
#'  result <- opnmfRcpp()
#' @export
opnmfRcpp <- function(X,r,W0=NULL,max.iter=50000,tol=1e-5,memsave=TRUE,eps=1e-16,...) {
  start_time <- Sys.time()
  if(is.null(W0) || is.character(W0)) W0 <- opnmfR_init(X, r, W0)
  W0[W0 < eps] <- 0
  
  res <- opnmfR:::opnmfR_opnmfRcpp(X, W0, tol, max.iter, eps, memsave)
  
  post <- opnmfR_postprocess(X, res$W)
  
  end_time <- Sys.time()
  tot_time <- end_time - start_time
  
  return(list(W=post$W, H=post$H, diffW=res$diffW, iter=res$iter, mse=post$mse, W0=W0, time=tot_time))
}

#' Orthogonal Projective Non-Negative Factorization
#' Implementation is partly based on brainparts: https://github.com/asotiras/brainparts 
#' @param X A matrix, rows are the number of features and columns are number of samples
#' @param r A number, rank used for factorization.
#' @param W0 A string or matrix for initialization (default NULL)
#' @param max.iter A number, number of iterations before stopping (default 50000) 
#' @param tol A number, convergence tolerance (default 1e-5)  
#' @param memsave A logical, if TRUE update rule is modified to better deal with high
#' dimensional data (default TRUE)
#' @param eps A number, (default 1e-16)
#' @param use.gpu A logical, conduct factorizatio on GPUs using gpuR (default FALSE)
#' @return A list containing the approximation matrices W and H, of input matrix
#' X; the interation number at which a solution was found; 
#' the reconstruction error
#' @examples
#'  result <- opnmfR()
#' @export
opnmfR <- function(X,r,W0=NULL,max.iter=50000,tol=1e-5,memsave=TRUE,eps=1e-16, use.gpu=FALSE, ...) {
  # mem=FALSE is faster for large data
  start_time <- Sys.time()
  
  if(any(is.na(X))) {
    Xmean <- median(X, na.rm=TRUE)
    X[is.na(X)] <- Xmean
  }
  
  if(is.null(W0) || is.character(W0)) W0 <- opnmfR_init(X, r, W0)
  W0[W0 < eps] <- 0
  
  W <- W0
  if(!memsave) { XX <- X %*% t(X) }
  

  if(use.gpu) {
     library(gpuR)
     X <- gpuMatrix(X)
     W <- gpuMatrix(W)
     if(!memsave) XX <- gpuMatrix(XX)
  }
 
  pbar <- txtProgressBar(min=0, max=max.iter, style=1)
  for(iter in 1:max.iter) {
    Wold <- W

    # two "types" of update rules
    # "simplified" update rule using XXW
    if(!memsave) XXW <- XX %*% W
    else XXW <- X %*% crossprod(X,W)
    
    W <- W * XXW / (W  %*% crossprod(W,XXW))
    
    # these are the "original" update rules
    #if(!memsave) W <- W * (XX %*% W) / (W %*% (t(W) %*% XX %*% W))
    #else W <- W * (X %*% (t(X) %*% W)) / (W %*% ((t(W) %*% X) %*% (t(X) %*% W))) 
   
    if(!use.gpu) {
      W[W < eps] <- eps
      W <- W / norm(W,"2") # spectral norm
    } else {
      n2 <- svd(t(W) %*% W, 1, 1)
      n2 <- sqrt(as.vector(n2$d)[1])
      W <- W / n2
    }
    
    diffW <- norm(Wold-W, "F") / norm(Wold, "F")
    setTxtProgressBar(pbar, iter)
    if(diffW < tol) break
  }
  close(pbar)
 
  if(use.gpu) W <- as.matrix(W)
 
  post <- opnmfR_postprocess(X, W)
  
  end_time <- Sys.time()
  tot_time <- end_time - start_time
  
  return(list(W=post$W, H=post$H, diffW=diffW, iter=iter, mse=post$mse, W0=W0, time=tot_time))
}

#' @export
opnmfR_postprocess <- function(X, W) {
  # get H
  H <- t(W) %*% X
  
  # reorder
  hlen <- sqrt(apply(H, 1, function(xx) sum(xx^2)))
  if(any(hlen==0)) cat('WARN low rank:', sum(hlen==0), 'factors have norm 0')
  hlen[hlen==0]  <- 1
  
  Wh <- W
  for(i in 1:ncol(W)) Wh[,i] <- Wh[,i] * hlen[i]
  Whlen <- apply(Wh, 2, function(xx) sum(xx^2))
  ix <- sort(Whlen, decreasing = TRUE, index.return=TRUE)$ix
  W <- W[,ix,drop=FALSE]
  
  # get H from reordered W
  H <- t(W) %*% X
  
  # reconstruction error
  mse <- norm(X-(W %*% H), "F")
  
  return(list(W=W, H=H, mse=mse))
}

#' @export
opnmfR_mse <- function(X, W, H) {
  return(norm(X-(W %*% H), "F"))
}


chunk <- function(x,n) split(x, cut(seq_along(x), n, labels = FALSE)) 

#' @export
opnmfR_ranksel_ooser <- function(X, rs, W0=NULL, use.rcpp=TRUE, nrepeat=1, nfold=2, plots=TRUE, seed=NA, rtrue=NA, ...) {
  # we create folds over the columns
  stopifnot(ncol(X)>=max(rs))
  start_time <- Sys.time()
  
  if(is.na(seed)) seed <- sample(1:10^6, 1)
  mse <- list()
  nrun <- 0
  for(n in 1:nrepeat) {
    mse[[n]] <- list()
    mse[[n]]$train <- matrix(NA, nfold, length(rs))
    mse[[n]]$test <- matrix(NA, nfold, length(rs))
    mse[[n]]$test_delta <- matrix(NA, nfold, length(rs))
    mse[[n]]$test_test <- matrix(NA, nfold, length(rs))
    mse[[n]]$test_train <- matrix(NA, nfold, length(rs))
    # get folds
    # we need to fold over the columns
    set.seed(seed+n)
    folds <- chunk(sample(1:ncol(X)), nfold)
    for(f in 1:nfold) {
      testidx <- folds[[f]]
      trainidx <- setdiff(1:ncol(X), testidx)
      cat("repeat", n, ": fold", f, ": #test",  length(testidx), ": #train",  length(trainidx), ":")
      
      factrain <- list()
      factest <- list()
      for(r in 1:length(rs)) {
        # get factors
        if(use.rcpp) {
          factrain[[r]] <- opnmfRcpp(X[,trainidx], rs[r], W0=W0, ...)
          factest[[r]] <- opnmfRcpp(X[,testidx], rs[r], W0=W0, ...)
        } else {
          factrain[[r]] <- opnmfR(X[,trainidx], rs[r], W0=W0, ...)
          factest[[r]] <- opnmfR(X[,testidx], rs[r], W0=W0, ...)
        }
        
        # training error
        mse[[n]]$train[f,r] <- factrain[[r]]$mse
        
        # project and get reconstruction error
        #Xtest_test <- factest[[r]]$W %*% factest[[r]]$H
        #Xtest_train <- factest[[r]]$W %*% factrain[[r]]$H
        # reconstruct test data using test data factors
        xtest_test <- opnmfR_reconstruct_W(X[,testidx], factest[[r]]$W)
        # reconstruct test data using training data factors
        xtest_train <- opnmfR_reconstruct_W(X[,testidx], factrain[[r]]$W)
        mse[[n]]$test[f,r] <- norm(xtest_test - xtest_train, "F")
        mse[[n]]$test_test[f,r] <- norm(xtest_test - X[,testidx], "F")
        mse[[n]]$test_train[f,r] <- norm(xtest_train - X[,testidx], "F")
        mse[[n]]$test_delta[f,r] <- abs(norm(X[,testidx] - xtest_train, "F") - norm(xtest_test - X[,testidx], "F"))
        
        nrun <- nrun + 1
        cat("test err", mse[[n]]$test[f,r], "\n#######\n")
      } # rs
    } # nfold
  } # nrepeat
  
  errtrain <- do.call(rbind, lapply(mse, function(z) apply(z$train, 2, mean)))
  errtest <- do.call(rbind, lapply(mse, function(z) apply(z$test, 2, mean)))
  errtest_delta <- do.call(rbind, lapply(mse, function(z) apply(z$test_delta, 2, mean)))
  
  mse$train <- errtrain
  mse$test <- errtest
  mse$test_delta <- errtest_delta
  rownames(mse$train) <- paste("repeat", 1:nrepeat, sep="")
  colnames(mse$train) <- paste("rank", rs, sep="")
  rownames(mse$test) <- paste("repeat", 1:nrepeat, sep="")
  colnames(mse$test) <- paste("rank", rs, sep="")
  rownames(mse$test_delta) <- paste("repeat", 1:nrepeat, sep="")
  colnames(mse$test_delta) <- paste("rank", rs, sep="")
  
  errtrain <- apply(errtrain, 2, mean)
  errtest <- apply(errtest, 2, mean)
  errtest_delta <- apply(errtest_delta, 2, mean)
  
  # select rank(s)
  selr_ind <- which.min(errtest)
  selr <- rs[selr_ind]
  selr_delta_ind <- rs[which.min(errtest_delta)]
  selr_delta <- rs[selr_delta_ind]
  
  if(plots) {
    par(mfrow=c(1,3))
    plot(rs, errtrain, main="Train", ylab="MSE", xlab="Rank")
    if(!is.na(rtrue)) abline(v=rtrue, lty=2, col="gray")
    plot(rs, errtest, main="Test", ylab="MSE", xlab="Rank")
    points(selr, errtest[selr_ind], cex=1.5, col="red", pch=10)
    if(!is.na(rtrue)) abline(v=rtrue, lty=2, col="gray")
    plot(rs, errtest_delta, main="Test", ylab="MSE (delta)", xlab="Rank")
    points(selr_delta, errtest_delta[selr_delta_ind], cex=1.5, col="red", pch=10)
    if(!is.na(rtrue)) abline(v=rtrue, lty=2, col="gray")
    par(mfrow=c(1,1))
  }
  
  end_time <- Sys.time()
  tot_time <- end_time - start_time
  
  return(list(mse=mse, time=tot_time, seed=seed, selected=selr, selected_delta=selr_delta))
}


#' @export
opnmfR_reconstruct_H <- function(X, H) {
  library(MASS) # for ginv
  X <- abs(X %*% (t(H) %*% ginv(H %*% t(H))) %*% H)
  return(X)
}


#' @export
opnmfR_reconstruct_W <- function(X, W) {
  library(MASS) # for ginv
  X <- abs((W %*% ginv(W)) %*% X)
  return(X)
}

opnmfR_cosine_similarity <- function(x, y){
  xy <- tcrossprod(x, y)
  x <- sqrt(apply(x, 1, crossprod))
  y <- sqrt(apply(y, 1, crossprod))
  xy / outer(x,y)
}

#' Rank selection based on split-half similarity measures (cosine, inner-product, cosine 
#' correlation, and adjusted rand index)
#' @param X A matrix, rows are the number of features and columns are number of samples
#' @param rs A vector of ranks to test for selection, 
#' if rs=NULL then \code{1:nrow(X)} is used (default NULL)
#' @param W0 A string or matrix for initialization (default NULL)
#' @param use.rcpp A logical, use \code{opnmfRcpp()} (default TRUE)
#' dimensional data (default TRUE)
#' @param nrepeat A number, number of splits (default 1)
#' @param similarity .... (deafault "inner")
#' @param splits A list, provided when user wants to define split of sample. Indecies of 
#' columns for one half of the split and rest are tanken as second split half (default NA)
#' @param plots A logical, create a dot plot displaying the similarity measures for 
#' ranks provided and indicating the highest value for each measure (default TRUE)
#' @param seed the \code{set.seed} used to select the split-half (default NA)
#' @param rtrue the true rank of the input data if known (default NA)
#' @return A list containing the similarity measures for all methods, the time taken for selection, 
#' the seed used for split-half, the selected rank based on the highest of each similarity measure
#' @examples
#'  result <- opnmfR_ranksel_splithalf()
#' @export
opnmfR_ranksel_splithalf <- function(X, rs, W0=NULL, use.rcpp=TRUE, nrepeat=1, similarity="inner", splits=NA, plots=TRUE, seed=NA, rtrue=NA, ...) {
  # we create folds over the columns
  library(lpSolve) # for solving the assignment problem
  library(aricode) # for ARI
  
  stopifnot(ncol(X)>=max(rs))
  start_time <- Sys.time()
  
  if(!is.na(splits)) {
    stopifnot(typeof(splits) == "list")
    nrepeat <- length(splits)
  }
  stopifnot(nrepeat > 0)
  
  if(is.na(seed)) seed <- sample(1:10^6, 1)
  perf <- list()
  nrun <- 0
  for(n in 1:nrepeat) {
    perf[[n]] <- list()
    perf[[n]]$train_err <- matrix(NA, 2, length(rs))
    perf[[n]]$sim_inner <- rep(NA, length(rs))
    perf[[n]]$sim_cosine <- rep(NA, length(rs))
    perf[[n]]$ari <- rep(NA, length(rs))
    # cor_cosine is from https://www.sciencedirect.com/science/article/pii/S1053811919309395
    perf[[n]]$cor_cosine <- rep(NA, length(rs))
    
    # get folds
    # we need to fold over the columns
    set.seed(seed+n)
    if(!is.na(splits)) {
      idx1 <- splits[[n]]
    } else {
      folds <- chunk(sample(1:ncol(X)), 2)
      idx1 <- folds[[1]]
    }
    idx2 <- setdiff(1:ncol(X), idx1)
    cat("repeat", n, ": #idx1",  length(idx1), ": #idx2",  length(idx2), ":")
    
    fac1 <- list()
    fac2 <- list()
    for(r in 1:length(rs)) {
      # get factors
      if(use.rcpp) {
        fac1[[r]] <- opnmfRcpp(X[,idx1], rs[r], W0=W0, ...)
        fac2[[r]] <- opnmfRcpp(X[,idx2], rs[r], W0=W0, ...)
      } else {
        fac1[[r]] <- opnmfR(X[,idx1], rs[r], W0=W0, ...)
        fac2[[r]] <- opnmfR(X[,idx2], rs[r], W0=W0, ...)
      }
      
      # training error
      perf[[n]]$train_err[1,r] <- fac1[[r]]$mse
      perf[[n]]$train_err[2,r] <- fac2[[r]]$mse
      
      # match two solutions on their W
      # inner product
      sim <- t(fac1[[r]]$W) %*% fac2[[r]]$W
      lp <- lp.assign(sim, direction = "max")
      perf[[n]]$sim_inner[r] <- lp$objval/rs[r]

      # cosine
      sim <- opnmfR_cosine_similarity(t(fac1[[r]]$W), t(fac2[[r]]$W))
      lp <- lp.assign(sim, direction = "max")
      perf[[n]]$sim_cosine[r] <- lp$objval/rs[r]
      
      if(r > 1) {
        # cor_cosine
        sim1 <- opnmfR_cosine_similarity(fac1[[r]]$W, fac1[[r]]$W) 
        sim2 <- opnmfR_cosine_similarity(fac2[[r]]$W, fac2[[r]]$W)
        stopifnot(nrow(sim1) == nrow(fac1[[r]]$W))
        stopifnot(ncol(sim1) == nrow(fac1[[r]]$W))
        cr <- rep(NA, nrow(sim1))
        for(ii in 1:nrow(sim1)) cr[ii] <- cor(sim1[ii,], sim2[ii,])
        perf[[n]]$cor_cosine[r] <- mean(cr, na.rm = TRUE)
        
        # ari
        cl1 <- apply(fac1[[r]]$W, 1, which.max)
        cl2 <- apply(fac2[[r]]$W, 1, which.max)
        stopifnot(length(cl1)==nrow(X))
        perf[[n]]$ari[r] <- ARI(cl1, cl2)
      }
      
      cat("match", perf[[n]]$sim_cosine[r], "\n#######\n")
    } # rs
  } # nrepeat
  
  selection <- opnmfR_ranksel_splithalf_select(perf, rs, plots, rtrue)
  
  end_time <- Sys.time()
  tot_time <- end_time - start_time
  
  return(list(perf=perf, time=tot_time, seed=seed, selection=selection))
}

opnmfR_ranksel_splithalf_select <- function(perf, rs, plots=TRUE, rtrue=NA) {
  
  measures <- names(perf[[1]])
  measures_avg <- matrix(NA, nrow=length(measures), ncol=length(rs))
  rownames(measures_avg) <- measures
  colnames(measures_avg) <- rs
  
  selected <- matrix(NA, nrow = length(measures), ncol=3)
  rownames(selected) <- measures
  colnames(selected) <- c("value", "index", "rank")
  
  for(i in 1:length(measures)) {
    findmax <- TRUE
    if(measures[i]=="train_err") {
      mea <- do.call(rbind, lapply(perf, function(z) apply(z[[measures[i]]], 2, mean)))
      findmax <- FALSE
    } else{
      mea <- do.call(rbind, lapply(perf, function(z) z[[measures[i]]]))
    }
    
    mea <- apply(mea, 2, mean)
    measures_avg[i,] <- mea
  
    # select rank(s)
    if(findmax) {
      selected[i,1] <- max(mea, na.rm=TRUE)
      selected[i,2] <- which.max(mea==selected[i,1])[1]
      selected[i,3] <- rs[selected[i,2]]
    } else {
      selected[i,1] <- min(mea)
      selected[i,2] <- which.min(mea)
      selected[i,3] <- rs[selected[i,2]]
    }
  }
  
  if(plots) {
    par(mfrow=c(1,1))
    startoverlap <- FALSE
    measures_overlap <- c()
    col_overlap <- c()
    for(i in 1:length(measures)) {
      mea <- measures_avg[i,]
      if(measures[i]=="train_err") {
        plot(mea, xlab="Ranks", ylab="Training MSE", main="Split-half", type="b")
      } else {
        mea <- (mea - min(mea, na.rm=TRUE)) / (max(mea, na.rm=TRUE) - min(mea, na.rm=TRUE))
        if(!startoverlap) {
          par(mar=c(5.3, 4.3, 4.3, 8.3), xpd=TRUE)
          plot(x=rs, y=mea, ylim=c(0,1.1), type="b", col=i, xlab="Ranks", ylab="Measure", main="Split-half")
        }
        else points(x=rs, y=mea, ylim=c(0,1.3), type="b", col=i)
        points(selected[i,3], mea[selected[i,2]], cex=1.5+i/10, col=i, pch=10)
        measures_overlap <- c(measures_overlap, measures[i])
        col_overlap <- c(col_overlap, i)
        startoverlap <- TRUE
      }
    }
    if(!is.na(rtrue)) abline(v=rtrue, lty=2, col="gray")
    legend("topright", inset=c(-0.2,0), legend=measures_overlap, col=col_overlap, pch=1)
  }
  
  return(list(measures_avg=measures_avg, selected=selected))
}

#' Permutation based rank selection
#' @param X A matrix, rows are the number of features and columns are number of samples
#' @param rs A vector of ranks to test for selection, 
#' if rs=NULL then \code{1:nrow(X)} is used (default NULL)
#' @param W0 A string or matrix for initialization (default NULL)
#' @param use.rcpp A logical, use \code{opnmfRcpp()} (default TRUE)
#' dimensional data (default TRUE)
#' @param nperm A number, number of permuatations conducted (default 1)
#' @param plots A logical, create a dot plot displaying the similarity measures for 
#' ranks provided and indicating the highest value for each measure (default TRUE)
#' @param seed the \code{set.seed} used to select the split-half (default NA)
#' @param rtrue the true rank of the input data if known (default NA)
#' @return A list containing the approximation matrices W and H, of input matrix
#' X; the interation number at which a solution was found; 
#' the reconstruction error, time taken for rank selection; the seed used for permutation of 
#' input data matrix
#' #' @examples
#'  result <- opnmfR_ranksel_perm()
#' @export
opnmfR_ranksel_perm <- function(X, rs, W0=NULL, use.rcpp=TRUE, nperm=1, plots=TRUE, seed=NA, rtrue=NA, ...) {
  stopifnot(ncol(X)>=max(rs))
  start_time <- Sys.time()
  
  # original data
  cat("original data... ")
  nn <- list()
  mse <- list()
  for(r in 1:length(rs)) {
    if(use.rcpp) {
      nn[[r]] <- opnmfRcpp(X, rs[r], W0=W0, ...)
    } else {
      nn[[r]] <- opnmfR(X, rs[r], W0=W0, ...)
    }
    
    cat("orig", "rank:", r, "iter:", nn[[r]]$iter, "time:", nn[[r]]$time, "diffW:", nn[[r]]$diffW, "\n")

    mse[[r]] <- list()
    mse[[r]]$orig <- opnmfR_mse(X, nn[[r]]$W, nn[[r]]$H)
  }
  cat("done\n")
  
  # permuted data
  cat("permuted data... ")
  if(is.na(seed)) seed <- sample(1:10^6, 1)
  for(p in 1:nperm) {
    set.seed(seed+p)
    Xperm <- apply(X,2,sample) # permute the rows in each column
    for(r in 1:length(rs)) {
      if(use.rcpp) {
        nnp <- opnmfRcpp(Xperm, rs[r], W0=W0, ...)
      } else {
        nnp <- opnmfR(Xperm, rs[r], W0=W0, ...)
      }
    
      cat("perm", p, "rank:", r, "iter:", nnp$iter, "time:", nnp$time,"diffW:", nnp$diffW, "\n")
      mse[[r]]$perm <- c(mse[[r]]$perm, opnmfR_mse(Xperm, nnp$W, nnp$H))
    }
  }
  cat("done\n")
  
  names(mse) <- rs
  names(nn) <- rs
  
  mseorig <- sapply(mse, function(xx) xx$orig)
  mseperm <- sapply(mse, function(xx) mean(xx$perm))
  mseorig <- mseorig / max(mseorig)
  mseperm <- mseperm / max(mseperm)
  
  sel <- which(diff(mseorig) > diff(mseperm))
  selr <- rs[sel]
  
  if(plots) {
    maxi <- max(c(mseorig, mseperm))
    mini <- min(c(mseorig, mseperm))
    
    plot(NA, xlim=c(min(rs),max(rs)), ylim=c(mini,maxi), xlab="Rank", ylab="MSE (scaled)")
    points(rs, mseorig, type='b', pch=16)
    points(rs, mseperm, type='b', pch=17, lty=2)
    points(selr, mseorig[sel], cex=2, pch=1, lwd=2, col="red")
    if(!is.na(rtrue)) abline(v=rtrue, lty=2, col="gray")
    legend("bottomleft", legend = c("Orig.","Perm."), pch=16:17)
    title(main="Permutation", sub=paste("Selected rank = ", min(selr)))
  }
  
  end_time <- Sys.time()
  tot_time <- end_time - start_time
  
  return(list(mse=mse, selected=selr, factorization=nn[sel], time=tot_time, seed=seed))
  
}

#' @export
opnmfR_init <- function(X, r, W0) {
  cat("initializing... r =", r, " ")
  n <- nrow(X)
  if(is.null(W0) || W0=="random") {
    cat("random")
    W0 <- matrix(runif(n*r), ncol = r)
  } else if(is.character(W0) && W0=="nndsvd") {
    cat(W0)
    W0 <- opnmfR_nndsvd(X,r)$W
  } else if(is.character(W0) && W0=="nndsvd1") {
    cat(W0)
    W0 <- opnmfR_nndsvd(X,r,flag = 1)$W
  } else if(is.character(W0) && W0=="nndsvd2") {
    cat(W0)
    W0 <- opnmfR_nndsvd(X,r,flag = 2)$W
  } else if(is.character(W0) && W0=="brainparts") {
    cat(W0)
    W0 <- opnmfR_nndsvd(X,r,epseps=0.1)$W
  }
  cat(" done\n")
  return(W0)
}


# based on matlab code from: http://www.boutsidis.org/software.html
# to make it similar to brainparts set epseps=0.1
#' @export
opnmfR_nndsvd  <- function(x,k,flag=0,seed=NA,s=NULL,eps=0.0000000001,epseps=0) {
  x[is.na(x)] <- 0
  stopifnot(all(x>=0))
  
  if(is.null(s)) s <- svd(x,k,k)
  # the following conditions might not be met when doing ooser
  stopifnot(ncol(s$u)>=k)
  stopifnot(ncol(s$v)>=k)
  
  m <- nrow(x)
  n <- ncol(x)
  W <- matrix(0,nrow=m,ncol=k)
  H <- matrix(0,nrow=k,ncol=n)
  W[,1] <- sqrt(s$d[1])*abs(s$u[,1])
  H[1,] <- sqrt(s$d[1])*abs(s$v[,1])
  
  if(k>1) {
    for(i in seq(2,k)) {
      uu <- s$u[,i]; vv <- s$v[,i]
      uup <- (uu>=0) * uu; uun <- (uu<0) * (-uu)
      vvp <- (vv>=0) * vv; vvn <- (vv<0) * (-vv)
      n_uup <- sqrt(sum(uup^2)); n_uun <- sqrt(sum(uun^2))
      n_vvp <- sqrt(sum(vvp^2)); n_vvn <- sqrt(sum(vvn^2))
      termp <- n_uup*n_vvp; termn <- n_uun*n_vvn
      if(termp >= termn) {
        W[,i] <- sqrt(s$d[i]*termp)*uup/n_uup
        H[i,] <- sqrt(s$d[i]*termp)*vvp/n_vvp
      } else {
        W[,i] <- sqrt(s$d[i]*termn)*uun/n_uun
        H[i,] <- sqrt(s$d[i]*termn)*vvn/n_vvn
      }
    }
  }
  
  # these should be set to 0 but brainparts sets it to 0.1
  W[W<=eps] <- epseps
  H[H<=eps] <- epseps
  
  # these flags have no effect as above we set small numbers to 0.1
  if(flag==1) {
    average <- mean(x)
    W[W==0] <- average
    H[H==0] <- average
  } else if(flag==2) {
    if(is.na(seed)) seed <- sample(1:10^6,1)
    set.seed(seed)
    average <- mean(x)
    ind1 <- W==0
    ind2 <- H==0
    W[ind1] <- average*runif(sum(ind1))/100
    H[ind2] <- average*runif(sum(ind2))/100
  } else if(flag>0) stop("unknown flag in opnmfR_nndsvd")
  
  return(list(W=W, H=H, seed=seed))
  
}
kaurao/opnmfR documentation built on March 12, 2021, 4:15 a.m.