R/nmf_functions_lite.R

Defines functions local.maxima local.minima amariDistance cosineDissMat WcomputeFeatureStats compute_OptKStats_NMF source_NMFtensor_function .onLoad

Documented in amariDistance compute_OptKStats_NMF cosineDissMat source_NMFtensor_function WcomputeFeatureStats

# global reference to tensorflow (will be initialized in .onLoad)
tf <- NULL
np <- NULL

.onLoad <- function(libname, pkgname) {
  # use superassignment to update global reference to tensorflow
  np <<- reticulate::import("numpy", delay_load = TRUE)
  tf <<- reticulate::import("tensorflow", delay_load = TRUE)


}

# Copyright © 2015-2020  The ButchR package contributors
# This file is part of the ButchR package. The ButchR package is licenced
# under GPL-3

#------------------------------------------------------------------------------#
#                         NMF GPU Wrapper - FUNCTIONS                          #
#------------------------------------------------------------------------------#
#' Single view NMF implementations
#'
#' Python functions to run several NMF implementations on tensorflow
#' using the reticulate framework, uses a non-negative matrix as input
#'
#' @param method string with method to use, "NMF", "GRNMF_SC"
#'
#' @return python function to carry out the factorization on tensorflow
#'
#' @import reticulate
#' @examples
#' \dontrun{
#' x <- ButchR:::source_NMFtensor_function("NMF")
#' x
#' }
source_NMFtensor_function <- function(method) {
  NMF_methods_list <- list(NMF      = c("nmf_tensor_lite", "NMF_tensor_py"),
                           GRNMF_SC = c("nmf_tensor_regularized_lite", "NMF_tensor_py"),
                           jNMF     = c("tensor_jNMF_lite", "jNMF_tensor_py"),
                           iNMF     = c("tensor_iNMF_lite", "iNMF_tensor_py"))
  module = NMF_methods_list[[method]]

  # Source NMF tensorflow python script
  path <- file.path(system.file(package = "ButchR"), "python/")
  tensorTheCleaver <- import_from_path("tensorTheCleaver", path = path)
  return(tensorTheCleaver[[module[1]]][[module[2]]])
}


#------------------------------------------------------------------------------#
#                      NMF tensorflow Wrapper - FUNCTION                       #
#------------------------------------------------------------------------------#

#' Single view NMF
#'
#' Computes NMF on tensorflow using the reticulate framework, uses a
#' non-negative matrix as input
#'
#' @param X Input matrix, should be a non-negative matrix
#' @param n_initializations Number of initializations to evaluate
#' @param ranks numeric vector with ranks to factorize
#' @param method method to use in the factorization, available:
#' "NMF", "GRNMF_SC"
#' @param iterations Maximum number of iterations to run for every initialization
#' @param convergence_threshold The factorization stops, if the convergence test
#'  constant for this number of iterations
#' @param n_neighbors for method "GRNMF_SC", the number of neighbors to take
#' into account when building the graph G
#' @param alpha for method "GRNMF_SC", regularization parameter alpha
#' @param lamb for method "GRNMF_SC", regularization parameter lambda
#' @param graph for method "GRNMF_SC", square matrix representing a graph
#' between columns of the input matrix, values correspond to edge weight,
#' if NULL will compute own graph.
#' @param seed Numeric seed to initialize matrices W and H.
#' @param nthreads Number of CPU threads used to perform the NMF decomposition,
#' (in case a GPU is not available).
#' 0 means the system picks an appropriate number.
#' @param extract_features if TRUE performs feature extraction for all f
#' actorization ranks > 2.
#'
#'
#' @return A ButchR_NMF object, containg the factorized matrices W and
#'  H, along with factorization metrics
#' @import reticulate
#' @export
#'
#' @examples
#' nmf_exp <- run_NMF_tensor(matrix(1:1000, ncol = 10),
#'                              ranks = 2:5,
#'                              n_initializations     = 10,
#'                              iterations            = 10^4,
#'                              convergence_threshold = 40)
#' nmf_exp
run_NMF_tensor <- function (
    X,
    ranks,
    method                = "NMF",
    n_initializations     = 10,
    iterations            = 10^4,
    convergence_threshold = 40,
    n_neighbors           = 4,
    alpha                 = 0.1,
    lamb                  = 10,
    graph                 = NULL,
    seed                  = NULL,
    nthreads              = 0,
    extract_features      = FALSE
    ){

  #----------------------------------------------------------------------------#
  #                                    Setup                                   #
  #----------------------------------------------------------------------------#
  # Validate if input data is correct
  X <- val_nonnegative_matrix(X)
  val_ranks_torun(ranks, ncol(X))
  if (!is.character(method)) {
    stop("\nmethod : expecting single character value\n",
         "supported methods: NMF, GRNMF_SC")
  }
  if (!length(method) == 1) {
    stop("\nmethod : expecting single character value\n",
         "supported methods: NMF, GRNMF_SC")
  }
  if (!method %in% c("NMF", "GRNMF_SC")) {
    stop("\nmethod : non valid NMF method\n",
         "supported methods: NMF, GRNMF_SC")
  }
  if (!is.null(seed)) {
    val_single_integer(seed, "seed")
    #message("Only 1 inititialization is supported while using seeded mode.\n",
    #        "Setting n_initializations=1")
    #n_initializations <- 1
    seed <- as.integer(seed)
  }

  val_single_integer(n_initializations, "n_initializations")
  val_single_integer(iterations, "iterations")
  val_single_integer(convergence_threshold, "convergence_threshold")
  val_single_integer(n_neighbors, "n_neighbors")
  val_single_numeric(alpha, "alpha")
  val_single_numeric(lamb, "lamb")
  val_single_numeric(nthreads, "nthreads")
  if (!is.null(graph)) graph <- val_graph_GRNMF_SC(graph, X, method)
  val_single_boolean(extract_features, "extract_features")



  # Convert params to integer
  nmf_params <- lapply(list(ranks                 = ranks,
                            n_initializations     = n_initializations,
                            iterations            = iterations,
                            convergence_threshold = convergence_threshold,
                            n_neighbors           = n_neighbors,
                            nthreads              = nthreads),
                       as.integer)
  names(nmf_params$ranks) <- paste0("k", nmf_params$ranks)

  #----------------------------------------------------------------------------#
  #                Run single view NMF on tensorflow                           #
  #----------------------------------------------------------------------------#
  # Source NMF tensorflow python function
  NMF_tensor_py <- source_NMFtensor_function(method)

  # Run NMF
  complete_eval <- lapply(nmf_params$ranks, function(k) {
    print(Sys.time())
    cat("Factorization rank: ", k, "\n")

    k_eval <- NMF_tensor_py(matrix            = X,
                            rank              = k,
                            n_initializations = nmf_params$n_initializations,
                            iterations        = nmf_params$iterations,
                            seed              = seed,
                            stop_threshold    = nmf_params$convergence_threshold,
                            nthreads          = nmf_params$nthreads,
                            n_neighbors       = nmf_params$n_neighbors,
                            alpha             = alpha,
                            lamb              = lamb,
                            graph             = graph)
    names(k_eval) <- c("W", "H", "iterations", "Frob_error", "W_eval")
    k_eval$iterations <- unlist(k_eval$iterations)
    k_eval$Frob_error <- unlist(k_eval$Frob_error)

    # Optimal K stats
    k_eval$OptKStats <- try(compute_OptKStats_NMF(k_eval, k), silent = FALSE)
    k_eval$W_eval <- NULL

    print(paste("NMF converged after ", paste(k_eval$iterations, collapse = ","), "iterations"))
    return(k_eval)
  })

  #----------------------------------------------------------------------------#
  #                        Build NMF object slots                              #
  #----------------------------------------------------------------------------#
  # input matrix info
  input_matrix <- list(hash       = digest::digest(X),
                       dim        = dim(X),
                       colnames   = colnames(X),
                       rownames   = rownames(X),
                       run_params = list(method            = method,
                                         n_initializations = nmf_params$n_initializations,
                                         iterations        = nmf_params$iterations,
                                         stop_threshold    = nmf_params$convergence_threshold,
                                         n_neighbors       = nmf_params$n_neighbors,
                                         alpha             = alpha,
                                         lamb              = lamb,
                                         extract_features  = extract_features))

  # Frob. error data frame
  frob_errors <- as.data.frame(do.call(cbind, lapply(complete_eval, "[[" , "Frob_error")))


  # Optimal K stats
  OptKStats <- lapply(complete_eval, "[[" , "OptKStats")
  if (!any(sapply(OptKStats, inherits, "try-error"))) {
    OptKStats <- as.data.frame(dplyr::bind_rows(OptKStats))

    # Optimal K
    indCopheneticCoeff <- which(local.maxima(OptKStats$copheneticCoeff)) # Max Cophenetic Coeff
    indMeanAmariDist   <- which(local.minima(OptKStats$meanAmariDist))   # Min Amari Dist
    OptK <- OptKStats$k[intersect(indCopheneticCoeff, indMeanAmariDist)]
    if (length(OptK) == 0) {
      #warning("No optimal K could be determined from the Optimal K stat\n")
      cat("No optimal K could be determined from the Optimal K stat\n")
    }

  } else {
    OptKStats <- data.frame()
    OptK <- integer()
    cat("Error found while computing factorization stats\nSkipping Optimal K\n")
  }

  #----------------------------------------------------------------------------#
  #                  Compute signatures specific features                      #
  #----------------------------------------------------------------------------#
  if (extract_features) {
    SignFeatures <- lapply(lapply(complete_eval, "[[" , "W"), function(W){
      if (ncol(W) == 2) {
        return(NULL)
      } else {
        rownames(W) <- rownames(X)
        return(WcomputeFeatureStats(W))
      }
    })
    SignFeatures <- data.frame(do.call(cbind, SignFeatures),
                               stringsAsFactors = FALSE)
  } else {
    SignFeatures <- data.frame()
  }


  #----------------------------------------------------------------------------#
  #                         Return ButchR_NMF object                           #
  #----------------------------------------------------------------------------#
  ButchR_NMF(input_matrix = input_matrix,
             WMatrix      = lapply(complete_eval, "[[" , "W"),
             HMatrix      = lapply(complete_eval, "[[" , "H"),
             FrobError    = frob_errors,
             OptKStats    = OptKStats,
             OptK         = OptK,
             SignFeatures = SignFeatures)
}

#------------------------------------------------------------------------------#
#             Criteria for optimal factorization rank - FUNCTIONS              #
#------------------------------------------------------------------------------#

#' Computes factorization optimal K stats
#'
#' @param k_eval internal object return after computing NMF with tensorflow
#' @param k factorization rank
#'
compute_OptKStats_NMF <- function(k_eval, k) {
  #----------------------------------------------------------------------------#
  #                            Frobenius error stats                           #
  #----------------------------------------------------------------------------#
  frob_errors <- k_eval[["Frob_error"]]
  min_frobError  <- min(frob_errors, na.rm = TRUE)
  sd_frobError   <- stats::sd(frob_errors, na.rm = TRUE)
  mean_frobError <- mean(frob_errors, na.rm = TRUE)
  cv_frobError   <- sd_frobError / mean_frobError

  #----------------------------------------------------------------------------#
  #     compute Silhouette Width, Cophenetic Coeff and Amari Distances         #
  #----------------------------------------------------------------------------#
  WMatrix_list <- k_eval[["W_eval"]]
  B <- length(WMatrix_list)
  concat_matrix <- do.call(cbind, WMatrix_list)
  dist_matrix   <- cosineDissMat(as.matrix(concat_matrix))

  # compute Silhouette Width
  if (length(WMatrix_list) > 1) {
    #------------------------------------------------------------------------#
    #                         compute Silhouette Width                       #
    #------------------------------------------------------------------------#
    my_pam   <- cluster::pam(dist_matrix, k = ncol(WMatrix_list[[1]]),  diss = TRUE)
    sumSilWidth  <- sum(my_pam$silinfo$widths[, "sil_width"])
    meanSilWidth <- mean(my_pam$silinfo$widths[, "sil_width"])
    #------------------------------------------------------------------------#
    #                         compute Cophenetic Coeff                       #
    #------------------------------------------------------------------------#
    # compute Cophenetic Coeff
    my_hclust <- stats::hclust(stats::as.dist(dist_matrix))
    dist_cophenetic <- as.matrix(stats::cophenetic(my_hclust))
    # take distance matrices without diagonal elements
    diag(dist_matrix) <- NA
    dist_matrix <- dist_matrix[which(!is.na(dist_matrix))]
    diag(dist_cophenetic) <- NA
    dist_cophenetic <- dist_cophenetic[which(!is.na(dist_cophenetic))]
    copheneticCoeff = unlist(stats::cor(cbind(dist_cophenetic, dist_matrix))[1, 2])

    #------------------------------------------------------------------------#
    #                         compute Amari Distances                        #
    #------------------------------------------------------------------------#
    distances_list <- unlist(lapply(1:(B - 1), function(b) {
      distances <- lapply((b + 1):B, function(b.hat) {
        amariDistance(WMatrix_list[[b]], WMatrix_list[[b.hat]])
      })
    }))
    meanAmariDist <- unlist(mean(distances_list))

  } else {
    sumSilWidth     <- NA
    meanSilWidth    <- NA
    copheneticCoeff <- NA
    meanAmariDist   <- NA
  }

  #----------------------------------------------------------------------------#
  #                         Return optimal K stats                             #
  #----------------------------------------------------------------------------#
  data.frame(rank_id = paste0("k", k),
             k       = k,
             FrobError_min   = min_frobError,
             FrobError_mean  = mean_frobError,
             FrobError_sd    = sd_frobError,
             FrobError_cv    = cv_frobError,
             sumSilWidth     = sumSilWidth,
             meanSilWidth    = meanSilWidth,
             copheneticCoeff = copheneticCoeff,
             meanAmariDist   = meanAmariDist,
             stringsAsFactors = FALSE)
}



#------------------------------------------------------------------------------#
#                       Computes Signature specific features                   #
#------------------------------------------------------------------------------#
#' Computes Signature specific features
#'
#' @param W W matrix with more than 2 signatures
#' @examples
#' \dontrun{
#' WcomputeFeatureStats(W)
#' }
WcomputeFeatureStats <- function(W) {
  #----------------------------------------------------------------------------#
  #    find features with no contribution and assign names if not present      #
  #----------------------------------------------------------------------------#
  k = ncol(W)
  if (is.null(rownames(W))) {
    rownames(W) <- paste0("f", 1:nrow(W))
  }
  idx <- rowSums(W) == 0
  # Features that contribute towards one or more signatures
  Wf <- W[!idx,]
  # Non-signature features - features with 0 contribution to all signatures
  nsf <- stats::setNames(rep(paste(rep("0", k), collapse = ""),
                             sum(idx)), rownames(W)[idx])

  #----------------------------------------------------------------------------#
  #      Run k means over all rows and assign features to the clusters         #
  #----------------------------------------------------------------------------#
  ssf <- apply(Wf, 1, function(x){
    x <- sigmoidTransform(x)
    k <- stats::kmeans(x, 2)
    max_idx <- which.max(k$centers)
    #paste(if_else(k$cluster == max_idx, "1", "0"), collapse = "")
    paste(ifelse(k$cluster == max_idx, "1", "0"), collapse = "")
  })

  # Combine with features with no contribution and sort
  sig_features <- c(ssf, nsf)
  sig_features <- sig_features[match(rownames(W), names(sig_features))]
  return(sig_features)
}





#' Create distance matrix with cosine similarity with matrix operations
#'
#' @param in.matrix concat_matrix a concatenated W matrix across multiple initializations
#' @param in.dimension factorization rank
#'
#' @examples
#' \dontrun{
#' # concat_matrix a concatenated W matrix across multiple initializations
#' cosineDissMat(as.matrix(concat_matrix))
#' }
cosineDissMat <- function(in.matrix, in.dimension=2){
  if(in.dimension == 1) in.matrix <- t(in.matrix)
  squaredVectorSum <- apply(in.matrix, 2, function(m) { sqrt(sum(m * m)) })
  squaredVectorProduct <- squaredVectorSum %*% t(squaredVectorSum)
  squaredInputSum <- t(in.matrix) %*% in.matrix
  # sum(a*b) for any a,b in M
  diss.matrix <- 1 - squaredInputSum / squaredVectorProduct
  # CosineDistance = 1 - CosineSimilarity
  return(round(diss.matrix, digits = 14))
}

#' Compute amari type distance between two matrices
#'
#' @param matrix.A,matrix.B of the same dimensionality
#'
#' @return The amari type distance of matrix.A & matrix.B according
#'        to [Wu et. al, PNAS 2016]
#'
#' @references \url{http://www.pnas.org/content/113/16/4290.long}
#' @examples
#' \dontrun{
#' amariDistance(WMatrix_list_a, WMatrix_list_b)
#' }
amariDistance <- function(matrix.A, matrix.B) {
  K <- dim(matrix.A)[2]
  C <- stats::cor(matrix.A, matrix.B)
  return(1 - (sum(apply(C, 1, max)) + sum(apply(C, 2, max))) / (2 * K))
}


local.minima <- function(x)
  ifelse(dplyr::lag(x) >= x & dplyr::lead(x) >= x, TRUE, FALSE)
local.maxima <- function(x)
  ifelse(dplyr::lag(x) <= x & dplyr::lead(x) <= x, TRUE, FALSE)
hdsu-bioquant/ButchR documentation built on Jan. 28, 2023, 6:06 p.m.