R/nmf_functions.R

Defines functions writeTmpMatrix runNmfGpu runNmfGpuPyCuda runNmfCpu computeCpuFrobErrors getFrobError getHMatrixList getWMatrixList getCpuWMatrixList getCpuHMatrixList computeFrobErrorStats computePValue4FrobError cosineSim cosineDist cosineDiss cosineDissMat computeSilhoutteWidth computeCopheneticCoeff amariDistance computeAmariDistances local.minima local.maxima proposeOptK getSignatureNames computeEntropy computeEntropy4OptK computeAbsDelta computeAbsDelta4OptK computeCoefVar performRowKmeans computeFeatureStats getIndex4SignatureCombs computeMeanDiff4SignatureCombs getSignatureCombCounts normalizeW normalizeH regularizeH merge.nmf computeSignatureSpecificFeatures

Documented in amariDistance computeAbsDelta computeAbsDelta4OptK computeAmariDistances computeCoefVar computeCopheneticCoeff computeCpuFrobErrors computeEntropy computeEntropy4OptK computeFeatureStats computeFrobErrorStats computeMeanDiff4SignatureCombs computePValue4FrobError computeSignatureSpecificFeatures computeSilhoutteWidth cosineDiss cosineDissMat cosineDist cosineSim getCpuHMatrixList getCpuWMatrixList getFrobError getHMatrixList getIndex4SignatureCombs getSignatureCombCounts getSignatureNames getWMatrixList merge.nmf normalizeH normalizeW performRowKmeans proposeOptK regularizeH runNmfCpu runNmfGpu runNmfGpuPyCuda writeTmpMatrix

# Copyright © 2015-2017  The Bratwurst package contributors
# This file is part of the Bratwurst package. The Bratwurst package is licenced
# under GPL-3

#==============================================================================#
#                         NMF GPU Wrapper - FUNCTIONS                          #
#==============================================================================#
#' Title
#'
#' @param matrix
#' @param tmp.path
#' @param sep
#'
#' @return
#'
#' @importFrom RcppCNPy npySave
#'
#' @examples
writeTmpMatrix <- function(matrix, tmp.path = "/tmp/nmf_tmp",
                           sep = " ", binary=FALSE) {
  # Write matrix to tmp file.
  matrix <- round(matrix, 7)
  dir.create(tmp.path)
  # Rmv final slash from path --> will lead to an error of nmfGpu wrapper
  tmp.path <- sub("/$", "", tmp.path)

  if(!binary){
    tmpMatrix.path <- tempfile(tmpdir = tmp.path, fileext = ".txt")
    write.table(matrix, file = tmpMatrix.path, quote = F,
                col.names = F, row.names = F, sep = sep)
  } else {
    tmpMatrix.path <- tempfile(tmpdir = tmp.path, fileext = ".npy")
    npySave(tmpMatrix.path, matrix)
  }
  return(tmpMatrix.path)
}

#' Title
#'
#' @param nmf.exp
#' @param k.min
#' @param k.max
#' @param outer.iter
#' @param inner.iter
#' @param conver.test.niter
#' @param conver.test.stop.threshold
#' @param out.dir
#'
#' @return
#'
#' @import SummarizedExperiment
#' @importFrom S4Vectors DataFrame
#' @importFrom data.table fread
#' @export
#'
#' @examples
runNmfGpu <- function(nmf.exp, k.min= 2, k.max = 2, outer.iter = 10,
                      inner.iter = 10^4, conver.test.niter = 10,
                      conver.test.stop.threshold = 40, out.dir = NULL,
                      tmp.path = "/tmp/nmf_tmp") {
  # Write raw matrix to tmp file
  tmpMatrix.path <- writeTmpMatrix(assay(nmf.exp, "raw"), tmp.path = tmp.path)
  # Define pattern to finde GPU_NMF output.
  tmp.dir <- dirname(tmpMatrix.path)
  h.pattern <- sprintf("%s_H.txt", basename(tmpMatrix.path))
  w.pattern <- sprintf("%s_W.txt", basename(tmpMatrix.path))
  # Check if deconv. matrix should be saved
  if(!is.null(out.dir)) dir.create(out.dir)
  # RUN NMF.
  dec.matrix <- lapply(k.min:k.max, function(k) {
    print(Sys.time())
    cat("Factorization rank: ", k, "\n")
    k.matrix <- lapply(1:outer.iter, function(i) {
      if(i %% 10 == 0) { cat("\tIteration: ", i, "\n") }
      frob.error <- 1
      while(frob.error == 1 | is.na(frob.error)){
        # VERSION 1.0
        # nmf.cmd <-
        #   sprintf('NMF_GPU %s -k %s -i %s', tmpMatrix.path, k, inner.iter)
        # nmf.stdout <- system(nmf.cmd, intern = T)
        nmf.cmd <- sprintf("%s -k %i -i %i -j %i -t %i", tmpMatrix.path, k,
                           inner.iter, conver.test.niter,
                           conver.test.stop.threshold)
        nmf.stdout <-
          system2("NMF_GPU", args = nmf.cmd, stdout = T, stderr = NULL)
        frob.error <- nmf.stdout[grep(nmf.stdout, pattern = "Distance")]
        frob.error <- as.numeric(sub(".*: ", "", frob.error))
        if(length(frob.error) == 0) frob.error <- 1
      }
      h.file <- list.files(tmp.dir, pattern = h.pattern, full.names = T)
      h.matrix <- fread(h.file, header = F)
      w.file <- list.files(tmp.dir, pattern = w.pattern, full.names = T)
      w.matrix <- fread(w.file, header = F)
      if(!is.null(out.dir)) {
        file.copy(h.file,
                  file.path(out.dir, sprintf("H_k%s_iter%s.txt", k, i)))
        file.copy(w.file,
                  file.path(out.dir, sprintf("W_k%s_iter%s.txt", k, i)))
      }
      file.remove(c(h.file, w.file))
      return(list(H = h.matrix,
                  W = w.matrix,
                  Frob.error = frob.error))
    })
    names(k.matrix) <- 1:outer.iter
    return(k.matrix)
  })
  names(dec.matrix) <- k.min:k.max
  ### Add NMF results to summarizedExp object.
  # Frob Errors
  frob.errors <- DataFrame(getFrobError(dec.matrix))
  colnames(frob.errors) <- as.character(k.min:k.max)
  nmf.exp <- setFrobError(nmf.exp, frob.errors)
  # H-Matrix List
  nmf.exp <- setHMatrixList(nmf.exp, getHMatrixList(dec.matrix))
  # W-Matrix List
  nmf.exp <- setWMatrixList(nmf.exp, getWMatrixList(dec.matrix))
  return(nmf.exp)
}

#' Title
#'
#' @param nmf.exp
#' @param k.min
#' @param k.max
#' @param outer.iter
#' @param inner.iter
#' @param conver.test.niter
#' @param conver.test.stop.threshold
#' @param out.dir
#'
#' @return
#'
#' @import SummarizedExperiment
#' @import NMF
#' @importFrom data.table fread
#' @importFrom S4Vectors DataFrame
#' @importFrom RcppCNPy npyLoad
#' @export
#'
#' @examples
runNmfGpuPyCuda <- function(nmf.exp, k.min= 2, k.max = 2, outer.iter = 10,
                            inner.iter = 10^4, conver.test.niter = 10,
                            conver.test.stop.threshold = 40, out.dir = NULL,
                            tmp.path = "/tmp/nmf_tmp", nmf.type = "N",
                            w.sparsness = 0, h.sparsness = 0, gpu.id = 0,
                            seed = FALSE, binary.file.transfer = FALSE,
                            cpu = FALSE) {
  if(!cpu){
    # Write raw matrix to tmp file
    tmpMatrix.path <- writeTmpMatrix(assay(nmf.exp, "raw"), tmp.path = tmp.path,
                                     binary = binary.file.transfer)
    # Define encoding
    encoding <- "txt"
    if(binary.file.transfer)
      encoding <- "npy"
    # Define pattern to finde GPU_NMF output.
    tmp.dir <- dirname(tmpMatrix.path)
    h.pattern <- sprintf("%s_H.%s", basename(tmpMatrix.path), encoding)
    w.pattern <- sprintf("%s_W.%s", basename(tmpMatrix.path), encoding)
    # Check if deconv. matrix should be saved
    if(!is.null(out.dir)) dir.create(out.dir)
    # RUN NMF.
    dec.matrix <- lapply(k.min:k.max, function(k) {
      print(Sys.time())
      cat("Factorization rank: ", k, "\n")
      k.matrix <- lapply(1:outer.iter, function(i) {
        if(i %% 10 == 0) { cat("\tIteration: ", i, "\n") }
        frob.error <- 1
        if (seed){
          nmf.cmd <-
            sprintf(paste0("%s %s -k %i -i %i -s %s -wo %s -ho %s ",
                           "-g %i -e %s -sets %s -sv %i"),
                    file.path(system.file(package = "Bratwurst"),
                              "python/nmf_mult.py"),
                    tmpMatrix.path, k, inner.iter, nmf.type, w.sparsness,
                    h.sparsness, gpu.id, encoding, "True", i)
          nmf.stdout <- system2("python", args = nmf.cmd, stdout = T,
                                stderr = NULL)
          frob.error <- nmf.stdout[grep(nmf.stdout, pattern = "Distance")]
          frob.error <- as.numeric(sub(".*: ", "", frob.error))
        } else {
          count <-  1
          while((frob.error == 1 | is.na(frob.error)) & count < 11){
            count <- count + 1
            nmf.cmd <-
              sprintf(paste0("%s %s -k %i -i %i -s %s -wo %s -ho %s ",
                             "-g %i -e %s"),
                      file.path(system.file(package = "Bratwurst"),
                                "python/nmf_mult.py"),
                      tmpMatrix.path, k, inner.iter, nmf.type, w.sparsness,
                      h.sparsness, gpu.id, encoding)
            nmf.stdout <- system2("python", args = nmf.cmd, stdout = T,
                                  stderr = NULL)
            frob.error <- nmf.stdout[grep(nmf.stdout, pattern = "Distance")]
            frob.error <- as.numeric(sub(".*: ", "", frob.error))
            if(length(frob.error) == 0) frob.error <- 1
          }
        }
        h.file <- list.files(tmp.dir, pattern = h.pattern, full.names = T)
        w.file <- list.files(tmp.dir, pattern = w.pattern, full.names = T)
        if(!binary.file.transfer){
          h.matrix <- fread(h.file, header = F)
          w.matrix <- fread(w.file, header = F)
        } else {
          h.matrix <- npyLoad(h.file)
          w.matrix <- npyLoad(w.file)
        }
        if(!is.null(out.dir)) {
          file.copy(h.file, file.path(out.dir, sprintf("H_k%s_iter%s.%s",
                                                       k, i, encoding)))
          file.copy(w.file, file.path(out.dir, sprintf("W_k%s_iter%s.%s",
                                                       k, i, encoding)))
        }
        file.remove(c(h.file, w.file))
        return(list(H = h.matrix,
                    W = w.matrix,
                    Frob.error = frob.error))
      })
      names(k.matrix) <- 1:outer.iter
      return(k.matrix)
    })
    names(dec.matrix) <- k.min:k.max
    ### Add NMF results to summarizedExp object.
    # Frob Errors
    frob.errors <- DataFrame(getFrobError(dec.matrix))
    colnames(frob.errors) <- as.character(k.min:k.max)
    nmf.exp <- setFrobError(nmf.exp, frob.errors)
    # H-Matrix List
    nmf.exp <- setHMatrixList(nmf.exp, getHMatrixList(dec.matrix))
    # W-Matrix List
    nmf.exp <- setWMatrixList(nmf.exp, getWMatrixList(dec.matrix))
  } else {
    # run NMF on CPU via CRAN R package
    # cpu.nmf.list <- runNmfCpu(nmf.exp, k.min = k.min, k.max = k.max,
    #                           outer.iter = outer.iter)
    cpu.nmf.list <- runNmfCpu(nmf.exp, k.min = k.min, k.max = k.max,
                              nrun = outer.iter, seed = seed)
    ### Add NMF results to summarizedExp object
    # compute Frob Erros and add them to nmf.exp
    frob.errors <- DataFrame(computeCpuFrobErrors(nmf.exp, cpu.nmf.list,
                                                  k.min, k.max))
    colnames(frob.errors) <- as.character(k.min:k.max)
    nmf.exp <- setFrobError(nmf.exp, frob.errors)
    # H-Matrix List
    nmf.exp <- setHMatrixList(nmf.exp, getCpuHMatrixList(cpu.nmf.list,
                                                         k.min, k.max))
    # W-Matrix List
    nmf.exp <- setWMatrixList(nmf.exp, getCpuWMatrixList(cpu.nmf.list,
                                                         k.min, k.max))
  }
 return(nmf.exp)
}

#' Title
#'
#' @param nmf.exp
#' @param k.min
#' @param k.max
#' @param outer.iter
#'
#' @return
#'
#' @import NMF
#'
#' @examples
#runNmfCpu <- function(nmf.exp, k.min = 2, k.max = 2, outer.iter = 10){
runNmfCpu <- function(nmf.exp, k.min = 2, k.max = 2, seed = FALSE, ...){
    ## create ExpressionSet
  eset <- new("ExpressionSet", exprs = assay(nmf.exp))

  # loop over all k and outer.iter iterations
  cpu.nmf.list <- lapply(k.min:k.max, function(k){
    print(Sys.time())
    cat("Factorization rank: ", k, "\n")
    if(seed){
      #cpu.nmf <- nmf(eset, rank = k, method = "brunet", nrun = outer.iter,
      cpu.nmf <- nmf(eset, rank = k, ...,
                     .options = list(keep.all = TRUE), seed = 12)
    }else {
      #cpu.nmf <- nmf(eset, rank = k, method = "brunet", nrun = outer.iter,
      cpu.nmf <- nmf(eset, rank = k, ...,
                     .options = list(keep.all = TRUE))
    }

    return(cpu.nmf)
  })

  return(cpu.nmf.list)
}

#' Title
#'
#' @param nmf.exp
#' @param k.min
#' @param k.max
#' @param outer.iter
#'
#' @return
#'
#' @import NMF
#'
#' @examples
computeCpuFrobErrors <- function(nmf.exp, cpu.nmf.list, k.min, k.max){
  frob.errors <- do.call(cbind, lapply(seq(length(k.min:k.max)), function(ki){
    # get all W-matrices
    W.list <- lapply(cpu.nmf.list[[ki]], basis)
    # get all H-matrices
    H.list <- lapply(cpu.nmf.list[[ki]], coef)
    fes <- sapply(seq(length(W.list)), function(oi){
      fe <- norm(assay(nmf.exp) - (W.list[[oi]] %*% H.list[[oi]]), type = "F") /
        norm(assay(nmf.exp), type = "F")
    })
    return(fes)
  }))

  return(frob.errors)
}


#==============================================================================#
#                               Getter FUNCTIONS                               #
#==============================================================================#
#' Getter function for FrobError from NMF-GPU list
#'
#' @param dec.matrix
#'
#' @return
#'
#' @importFrom reshape2 melt
#' @importFrom reshape2 dcast
#'
#' @examples
getFrobError <- function(dec.matrix) {
  frob.errorList <- lapply(dec.matrix, function(dec.m){
    lapply(dec.m, function(m) m$Frob.error)
  })
  frob.errorMatrix <- melt(frob.errorList)
  frob.errorMatrix <- dcast(frob.errorMatrix, L2 ~ L1, value.var = "value")
  frob.errorMatrix <-
    frob.errorMatrix[order(as.numeric(frob.errorMatrix$L2)), ]
  frob.errorMatrix <- frob.errorMatrix[, -1]
  frob.errorMatrix <-
    frob.errorMatrix[, order(as.numeric(colnames(frob.errorMatrix)))]
  frob.errorMatrix[frob.errorMatrix == 1] <- NA
  return(frob.errorMatrix)
}

#' Getter function for H-Matrix list from NMF-GPU list
#'
#' @param dec.matrix
#'
#' @return
#'
#' @examples
getHMatrixList <- function(dec.matrix) {
  # Extract all entries for H and return.
  H.list <- lapply(dec.matrix, function(k.matrix) {
    H <- lapply(k.matrix, function(m) as.matrix(m$H))
    return(H)
  })
  return(H.list)
}

#' Getter function for W-Matrix list from NMF-GPU list
#'
#' @param dec.matrix
#'
#' @return
#'
#' @examples
getWMatrixList <- function(dec.matrix) {
  # Extract all entries for W and return.
  W.list <- lapply(dec.matrix, function(k.matrix) {
    W <- lapply(k.matrix, function(m) as.matrix(m$W))
    return(W)
  })
  return(W.list)
}

#' Getter function for W-Matrix list from NMF-CPU list
#'
#' @param dec.matrix
#'
#' @return
#'
#' @import NMF
#'
#' @examples
getCpuWMatrixList <- function(cpu.nmf.list, k.min, k.max) {
  # Extract all entries for W and return.
  W.list <- lapply(cpu.nmf.list, function(cpu.nmf) {
    W <- lapply(cpu.nmf, basis)
    return(W)
  })
  names(W.list) <- as.character(k.min:k.max)
  return(W.list)
}

#' Getter function for H-Matrix list from NMF-CPU list
#'
#' @param dec.matrix
#'
#' @return
#'
#' @examples
getCpuHMatrixList <- function(cpu.nmf.list, k.min, k.max) {
  # Extract all entries for H and return.
  H.list <- lapply(cpu.nmf.list, function(cpu.nmf) {
    H <- lapply(cpu.nmf, coef)
    return(H)
  })
  names(H.list) <- as.character(k.min:k.max)
  return(H.list)
}

#==============================================================================#
#             Criteria for optimal factorization rank - FUNCTIONS              #
#==============================================================================#
#' Compute basic statistics for Frobenius Errors
#'
#' @param nmf.exp
#'
#' @return
#' @export
#'
#' @examples
computeFrobErrorStats <- function(nmf.exp) {
  frob.errorMatrix <- as.matrix(FrobError(nmf.exp))
  min.frobError <- apply(frob.errorMatrix, 2, function(x) min(x, na.rm = T))
  sd.frobError <- apply(frob.errorMatrix, 2, function(x) sd(x, na.rm = T))
  mean.frobError <- colMeans(frob.errorMatrix, na.rm = T)
  cv.frobError <- sd.frobError / mean.frobError
  frobError.data <- DataFrame("k" = as.numeric(names(min.frobError)),
                              "min" = min.frobError,
                              "mean" = mean.frobError,
                              "sd" = sd.frobError,
                              "cv" = cv.frobError)
  nmf.exp <- setOptKStats(nmf.exp, frobError.data)
  return(nmf.exp)
}

#' Compute p-value with t-test for running K
#'
#' @param frobError.matrix
#'
#' @return
#' @export
#'
#' @examples
computePValue4FrobError <- function(frobError.matrix) {
  n.col <- ncol(frobError.matrix)
  p <- mapply(1:(n.col - 1), 2:n.col, FUN = function(i, j) {
    p <- t.test(frobError.matrix[, i], frobError.matrix[, j])
    return(p$p.value)
  })
  #p <- p.adjust(unlist(p), method = 'BH')
  return(-log10(unlist(p)))
}

#' Cosine similarity
#'
#' @param a
#' @param b
#'
#' @return
#' @export
#'
#' @examples
cosineSim <- function(a, b){
  return(sum(a * b) / ( sqrt(sum(a * a)) * sqrt(sum(b * b)) ))
}

#' Cosine distance
#'
#' @param a
#' @param b
#'
#' @return
#' @export
#'
#' @examples
cosineDist <- function(a, b){
  return(1 - cosineSim(a, b))
}

#' Create distance matrix with cosine similarity
#'
#' @param in.matrix
#' @param in.dimension
#'
#' @return
#' @export
#'
#' @examples
cosineDiss <- function(in.matrix, in.dimension=2){
  if(in.dimension == 1) in.matrix <- t(in.matrix)
  cosineDist.list <- lapply(1:ncol(in.matrix), function(i.outCol) {
    cosine.dists <- unlist(lapply(1:ncol(in.matrix), function(i.innCol) {
      cosineDist(in.matrix[, i.outCol], in.matrix[, i.innCol])
    }))
  })
  diss.matrix <- do.call(rbind, cosineDist.list)
  return(round(diss.matrix, digits = 14))
}

#' Create distance matrix with cosine similarity with matrix operations
#'
#' @param in.matrix
#' @param in.dimension
#'
#' @return
#' @export
#'
#' @examples
cosineDissMat <- function(in.matrix, in.dimension=2){
  if(in.dimension == 1) in.matrix <- t(in.matrix)
  squaredVectorSum <- apply(in.matrix, 2, function(m) { sqrt(sum(m * m)) })
  squaredVectorProduct <- squaredVectorSum %*% t(squaredVectorSum)
  squaredInputSum <- t(in.matrix) %*% in.matrix
  # sum(a*b) for any a,b in M
  diss.matrix <- 1 - squaredInputSum / squaredVectorProduct
  # CosineDistance = 1 - CosineSimilarity
  return(round(diss.matrix, digits = 14))
}

#' Compute Alexandrov Criterion --> Silhoutte Width
#'
#' @param nmf.exp
#'
#' @return
#'
#' @importFrom cluster pam
#' @export
#'
#' @examples
computeSilhoutteWidth <- function(nmf.exp) {
  sil.vec <- lapply(WMatrixList(nmf.exp), function(WMatrix.list) {
    nan.bool <- sapply(WMatrix.list, function(m) !any(is.nan(m)))
    concat.matrix <- do.call(cbind, WMatrix.list[which(nan.bool)])
    dist.matrix <- cosineDissMat(as.matrix(concat.matrix))
    my.pam <- pam(dist.matrix, k = ncol(WMatrix.list[[1]]),  diss = T)
    sil.sum <- sum(my.pam$silinfo$widths[, "sil_width"])
    sil.mean <- mean(my.pam$silinfo$widths[, "sil_width"])
    return(DataFrame(sumSilWidth = sil.sum,
                     meanSilWidth = sil.mean))
  })
  sil.vec <- do.call(rbind, sil.vec)
  nmf.exp <- setOptKStats(nmf.exp, cbind(OptKStats(nmf.exp), sil.vec))
  return(nmf.exp)
}

#' Compute Cophenetic correlation coefficient, TO BE IMPROVED
#'
#' @param nmf.exp
#'
#' @return
#' @export
#'
#' @examples
computeCopheneticCoeff <- function(nmf.exp) {
  coph.coeff <- lapply(WMatrixList(nmf.exp), function(WMatrix.list) {
    nan.bool <- sapply(WMatrix.list, function(m) !any(is.nan(m)))
    concat.matrix <- do.call(cbind, WMatrix.list[which(nan.bool)])
    #concat.matrix <- do.call(cbind, WMatrix.list)
    dist.matrix <- cosineDissMat(as.matrix(concat.matrix))
    my.hclust <- hclust(as.dist(dist.matrix))
    dist.cophenetic <- as.matrix(cophenetic(my.hclust))
    # take distance matrices without diagonal elements
    diag(dist.matrix) <- NA
    dist.matrix <- dist.matrix[which(!is.na(dist.matrix))]
    diag(dist.cophenetic) <- NA
    dist.cophenetic <- dist.cophenetic[which(!is.na(dist.cophenetic))]
    return(cor(cbind(dist.cophenetic, dist.matrix))[1, 2])
  })
  coph.coeff <- DataFrame(copheneticCoeff = unlist(coph.coeff))
  nmf.exp <- setOptKStats(nmf.exp, cbind(OptKStats(nmf.exp), coph.coeff))
  return(nmf.exp)
}

#' Compute amari type distance between two matrices
#'
#' @param matrix.A,matrix.B of the same dimensionality
#'
#' @return The amari type distance of matrix.A & matrix.B according
#'        to [Wu et. al, PNAS 2016]
#'
#' @references \url{http://www.pnas.org/content/113/16/4290.long}
#'
#' @export
#'
#' @examples
amariDistance <- function(matrix.A, matrix.B) {
  K <- dim(matrix.A)[2]
  C <- cor(matrix.A, matrix.B)
  return(1 - (sum(apply(C, 1, max)) + sum(apply(C, 2, max))) / (2 * K))
}

#' Compute Amari Distances from [Wu et. al, PNAS 2016]
#'
#' @param nmf.exp
#'
#' @return The average Amari-type error for each k
#'
#' @references \url{http://www.pnas.org/content/113/16/4290.long}
#'
#' @export
#'
#' @examples
computeAmariDistances <- function(nmf.exp){
  distance.averages <- lapply(WMatrixList(nmf.exp), function(matrices) {
    B <- length(matrices)
    distances.list <- unlist(lapply(1:(B - 1), function(b) {
      distances <- lapply((b + 1):B, function(b.hat) {
        amariDistance(matrices[[b]], matrices[[b.hat]])
      })
    }))
    return(mean(distances.list[!is.na(distances.list)]))
    # is.na to exclude corrupted matrices
  })
  distance.averages <- DataFrame(meanAmariDist = unlist(distance.averages))
  nmf.exp <- setOptKStats(nmf.exp, cbind(OptKStats(nmf.exp), distance.averages))
  return(nmf.exp)
}

local.minima <- function(x)
  ifelse(dplyr::lag(x) >= x & dplyr::lead(x) >= x, TRUE, FALSE)
local.maxima <- function(x)
  ifelse(dplyr::lag(x) <= x & dplyr::lead(x) <= x, TRUE, FALSE)


#' Title
#'
#' @param nmf.exp
#' @param verbose
#'
#' @return
#' @export
#'
#' @examples
proposeOptK <- function(nmf.exp, verbose = FALSE){
  OptKStats_DF <- OptKStats(nmf.exp)
  if(ncol(OptKStats_DF) & nrow(OptKStats_DF)){
    indCopheneticCoeff <- which(local.maxima(OptKStats_DF$copheneticCoeff))
    indMeanAmariDist <- which(local.minima(OptKStats_DF$meanAmariDist))
    return(list(
      intersectK = OptKStats_DF$k[intersect(indCopheneticCoeff,
                                            indMeanAmariDist)],
      unionK = OptKStats_DF$k[union(indCopheneticCoeff, indMeanAmariDist)]))
  } else {
    if(verbose) cat("proposeOptK::warning:OptKStats not yet computed.\n")
    return(NULL)
  }
}

#==============================================================================#
#                         H-MATRIX ANALYSIS FUNCTIONS                          #
#==============================================================================#
# Compute unbaised signature names by applying a row k-means
# and classify signature according to cluster and highest cluster mean.
#' Compute unsupervised signature names from colData and H-Matrix for given K
#'
#' @param nmf.exp
#' @param k.opt
#'
#' @return
#' @export
#'
#' @examples
getSignatureNames <- function(nmf.exp, k.opt) {
  H <- HMatrix(nmf.exp, k = k.opt)
  sig.names <- lapply(1:nrow(H), function(i) {
    k.cluster <- kmeans(as.numeric(H[i, ]), 2)
    n.kCluster <- which(k.cluster$centers == max(k.cluster$centers))
    samples.kCluster <- which(k.cluster$cluster == n.kCluster)
    meta.data <- colData(nmf.exp)[samples.kCluster, ]
    n.col <- ncol(meta.data)
    if (n.col > 1) {
      sigName.combs <- apply(as.data.frame(meta.data), 1, function(x){
        paste(x[2:n.col], collapse = " ")
      })
      sigName.combs <- sort(table(sigName.combs), decreasing = T)
      sig.name <- names(sigName.combs)[1]
      # TODO: Might be improvable by using exposures from H-Matrix.
      # Exposure proportion computation!?
      if(length(sigName.combs) > 1) {
        sig.prop <- round(sigName.combs / sum(sigName.combs), 2)
        sig.prop <- paste(names(sig.prop), sig.prop)
        sig.name <- paste(sig.prop, collapse = "\n")
      }
    } else {
      sig.name <- sprintf("Signature %s", i)
    }
    return(sig.name)
  })
  return(unlist(sig.names))
}


#==============================================================================#
#                         W-MATRIX ANALYSIS FUNCTIONS:                         #
#                             FEATURE SELECTION                                #
#==============================================================================#
#' Compute 'shannon' entropy per region.
#' High Entropy means highly specific for one signature.
#'
#' @param matrix
#'
#' @return
#'
#' @examples
computeEntropy <- function(matrix) {
  matrix.relativ <- t(apply(matrix, 1, function(x) x / sum(x)))
  matrix.entropy <- apply(matrix.relativ, 1, function(x) {
    p <- x * log2(length(x) * x)
    p[is.na(p)] <- 0
    h <- sum(p)
    return(h)
  })
  return(matrix.entropy)
}

#' Computes entropy for each feature in optimal K W-matrix
#'
#' @param nmf.exp
#'
#' @return
#' @export
#'
#' @examples
computeEntropy4OptK <- function(nmf.exp) {
  W <- WMatrix(nmf.exp, OptK(nmf.exp))
  W.entropy <- computeEntropy(W)
  nmf.exp <- setFeatureStats(nmf.exp, DataFrame("entropy" = W.entropy))
  return(nmf.exp)
}

#' Compute delta between each column (signature) per row
#'
#' @param matrix
#'
#' @return
#'
#' @examples
computeAbsDelta <- function(matrix) {
  delta.regions <- lapply(1:ncol(matrix), function(k) {
    delta.vec <- matrix[, k] - (rowSums(matrix[, -k]))
    return(delta.vec)
  })
  delta.regions <- do.call(cbind, delta.regions)
  return(delta.regions)
}

#' Computes absolut delta per feature for each signature given optimal K
#'
#' @param nmf.exp
#'
#' @return
#' @export
#'
#' @examples
computeAbsDelta4OptK <- function(nmf.exp) {
  W <- WMatrix(nmf.exp, OptK(nmf.exp))
  W.absDelta <- computeAbsDelta(W)
  nmf.exp <- setFeatureStats(nmf.exp, DataFrame("absDelta" = W.absDelta))
  return(nmf.exp)
}

#' Compute the coeficient of variation per row in a matrix.
#'
#' @param matrix
#'
#' @return
#'
#' @examples
computeCoefVar <- function(matrix) {
  apply(matrix, 1, function(r) sd(r) / mean(r))
}

#' Perform Kmeans on rows of a matrix to classify them into column (singature) combinations
#'
#' @param matrix
#'
#' @return
#'
#' @importFrom cluster silhouette
#' @export
#'
#' @examples
performRowKmeans <- function(matrix) {
  k.row <- apply(matrix, 1, function(x) {
    # Perform sigmoidal transformation to achieve better clustering
    x.trans <- sigmoidTransform(x)
    k.cluster <- kmeans(x.trans, 2)
    d <- dist(as.data.frame(x.trans))
    sil.mean <- mean(silhouette(k.cluster$cluster, d)[, 3])
    cluster.deltaMean <- mean(x[k.cluster$cluster == 1]) -
      mean(x[k.cluster$cluster == 2])
    return(list(centers = t(k.cluster$centers),
                silmean = sil.mean,
                explainedVar = k.cluster$betweenss / k.cluster$totss,
                oddsVar = sum(k.cluster$withinss) / k.cluster$betweenss,
                attribution = k.cluster$cluster,
                deltaMean = cluster.deltaMean))
  })
  return(k.row)
}

#' Compute Signature Combinations for W-matrix given optimal K
#'
#' @param nmf.exp
#' @param var.thres
#'
#' @return
#' @export
#'
#' @examples
computeFeatureStats <- function(nmf.exp, var.thres = 0.25) {
  W <- WMatrix(nmf.exp, OptK(nmf.exp))
  # Determine Region contributions
  k.row <- performRowKmeans(W)
  # Get kmeans stats
  k.explainedVar <- unlist(lapply(k.row, function(r) r$explainedVar))
  k.oddsVar <- unlist(lapply(k.row, function(r) r$oddsVar))
  # Extract mean silhouette width for each row.
  k.silmean <- unlist(lapply(k.row, function(r) r$silmean))
  # Compute delta between cluster centers (transformed data!)
  k.deltaCenter <-
    unlist(lapply(k.row, function(r) r$centers[1] - r$centers[2]))
  # Extract difference of cluster means
  k.deltaMean <- unlist(lapply(k.row, function(r) r$deltaMean))
  # Extract Signature combinations generated by k-means
  k.attribution <- lapply(k.row, function(r) abs(r$attribution))
  k.attribution <- do.call(rbind, k.attribution)
  k.ids <- apply(k.attribution, 1, function(r) paste(r, collapse = ""))
  # Determine regions which contribute to all signatures.
  k.varCoef <- computeCoefVar(W)
  all.signature <- which(k.varCoef < var.thres)
  k.ids[all.signature] <- gsub("2", "1", k.ids[all.signature])
  # Set FeatureStats
  feature.stats <- DataFrame("cluster" = k.ids,
                             "deltaCenters" = k.deltaCenter,
                             "deltaMean" = k.deltaMean,
                             "explainedVar" = k.explainedVar,
                             "oddsVar" = k.oddsVar,
                             "coefVar" = k.varCoef,
                             "meanSil" = k.silmean)
  # Map reverse cluster definitions to each other,
  # including sign change for delta means
  ids <- sort(unique(k.ids))[-1] ## this is not robust!!
  ## must be replaced by
  # equalPattern <- paste(rep("1", ncol(W)), collapse = "")
  # ids <- setdiff(sort(unique(k.ids)), equalPattern)
  ids1 <- ids[1:(length(ids) / 2)]
  ids2 <-  gsub("0", "2", gsub("2", "1", gsub("1", "0", ids1)))
  conv.id <- data.frame("id1" = ids1, "id2" = ids2)
  i.conv <- match(feature.stats$cluster, conv.id$id2)
  feature.stats$cluster[!is.na(i.conv)] <-
    as.character(conv.id$id1[i.conv[!is.na(i.conv)]])
  feature.stats$deltaMean[!is.na(i.conv)] <-
    (-1) * feature.stats$deltaMean[!is.na(i.conv)]
  feature.stats$deltaCenters[!is.na(i.conv)] <-
    (-1) * feature.stats$deltaCenters[!is.na(i.conv)]
  # Re-write cluster ids in a more useful binary code
  i <- which(feature.stats$deltaCenters > 0 &
               grepl(feature.stats$cluster, pattern = "2"))
  feature.stats$cluster[i] <- gsub("2", "0", feature.stats$cluster[i])
  i <- which(feature.stats$deltaCenters < 0 &
               grepl(feature.stats$cluster, pattern = "2"))
  feature.stats$cluster[i] <-
    gsub("2", "1", gsub("1", "0", feature.stats$cluster[i]))
  # Add featureStats to NMF experiment
  nmf.exp <- setFeatureStats(nmf.exp, feature.stats)
  return(nmf.exp)
}

#' Title
#'
#' @param k.ids
#'
#' @return
#' @export
#'
#' @examples
getIndex4SignatureCombs <- function(k.ids) {
  # Get for all signature combinations corresponding regions.
  ids <- sort(unique(k.ids))[-1]
  n.ids <- length(ids) + 1
  i.regions <- lapply(1:(n.ids / 2), function(i) {
    sub.id <- ids[c(i, n.ids - i)]
    i.region <- which(k.ids %in% sub.id)
    return(i.region)
  })
  names(i.regions) <- ids[1:(n.ids / 2)]
  # Get regions for all signature id.
  allSig.id <- sort(unique(k.ids))[1]
  i.regions[[allSig.id]] <- which(k.ids %in% allSig.id)
  return(i.regions)
}

#' Title
#'
#' @param W
#' @param i.regions
#'
#' @return
#' @export
#'
#' @examples
computeMeanDiff4SignatureCombs <- function(W, i.regions) {
  # Compute Row mean diff between Cluster 1 and 2.
  w.diffs <- lapply(1:length(i.regions), function(i) {
    w <- W[i.regions[[i]], ]
    if (!grepl(names(i.regions)[i], pattern = "2")) {
      w.diff <- rowMeans(w)
    } else {
      signature.comb <-
        as.numeric(unlist(strsplit(names(i.regions)[[i]], split = "")))
      w.mean1 <- rowMeans(w[, which(signature.comb == 1)])
      w.mean2 <- rowMeans(w[, which(signature.comb == 2)])
      w.diff <- w.mean1 - w.mean2
    }
    return(w.diff)
  })
  names(w.diffs) <- names(i.regions)
}

### Get number of peaks per signature combination and create a barplot.
#' Title
#'
#' @param i.regions
#' @param w.diffs
#'
#' @return
#' @export
#'
#' @examples
getSignatureCombCounts <- function(i.regions, w.diffs) {
  n.peaks <- lapply(names(i.regions), function(region.n) {
    n.peak1 <- sum(w.diffs[[region.n]] > 0)
    n.peak2 <- sum(w.diffs[[region.n]] < 0)
    n.peak <- c(n.peak1, n.peak2)
    n.peak <- melt(n.peak)
    n.peak$clusterId <- c("1", "2")
    n.peak$sigCombId <- region.n
    return(n.peak)
  })
  n.peaks <- do.call(rbind, n.peaks)
  return(n.peaks)
}


#' Normalize the signatures matrix (W)
#'
#' After column normalization of the matrix W, the inverse factors are
#' mutiplied with the rows of H in order to keep the matrix product W*H
#' constant.
#'
#' @param nmf.exp
#'
#' @return A data structure of type nmfExperiment
#'
#' @importFrom YAPSA normalize_df_per_dim
#' @export
#'
#' @examples
#'  NULL
#'
normalizeW <- function(nmf.exp){
  # account for WMatrixList and HMatrixList
  all_list <- lapply(seq_along(WMatrixList(nmf.exp)), function(k_ind){
    k_list <-
      lapply(seq_along(WMatrixList(nmf.exp)[[k_ind]]), function(init_ind){
        tempW <- WMatrixList(nmf.exp)[[k_ind]][[init_ind]]
        tempH <- HMatrixList(nmf.exp)[[k_ind]][[init_ind]]
        normFactor <- colSums(tempW)
        # catch errors associated with NaNs in W or H
        if (any(is.nan(normFactor))){
          return(list(W = tempW,
                      H = tempH))
        }else{
          newSigs <- as.matrix(normalize_df_per_dim(tempW, 2))
          newExpo <- tempH * normFactor
          #newV <- newSigs %*% newExpo
          #oldV <- tempW %*% tempH
          return(list(W = newSigs,
                      H = newExpo))
        }
      })
    names(k_list) <- names(WMatrixList(nmf.exp)[[k_ind]])
    return(k_list)
  })
  names(all_list) <- names(WMatrixList(nmf.exp))
  thisWMatrixList <- lapply(all_list, function(current_k_list){
    kWMatrixList <- lapply(current_k_list, function(current_entry){
      return(current_entry$W)
    })
  })
  nmf.exp <- setWMatrixList(nmf.exp, thisWMatrixList)
  thisHMatrixList <- lapply(all_list, function(current_k_list){
    kHMatrixList <- lapply(current_k_list, function(current_entry){
      return(current_entry$H)
    })
  })
  nmf.exp <- setHMatrixList(nmf.exp, thisHMatrixList)
  return(nmf.exp)
}

#' Normalize the signatures matrix (H)
#'
#' After row normalization of the matrix H, the inverse factors are
#' mutiplied with the columns of W in order to keep the matrix product W*H
#' constant.
#'
#' @param nmf.exp
#'
#' @return A data structure of type nmfExperiment
#'
#' @importFrom YAPSA normalize_df_per_dim
#' @export
#'
#' @examples
#'  NULL
#'
normalizeH <- function(nmf.exp){
  # account for WMatrixList and HMatrixList
  all_list <- lapply(seq_along(WMatrixList(nmf.exp)), function(k_ind){
    k_list <-
      lapply(seq_along(WMatrixList(nmf.exp)[[k_ind]]), function(init_ind){
        tempW <- WMatrixList(nmf.exp)[[k_ind]][[init_ind]]
        tempH <- HMatrixList(nmf.exp)[[k_ind]][[init_ind]]
        normFactor <- rowSums(tempH)
        newExpo <- as.matrix(normalize_df_per_dim(tempH, 1))
        newSigs <- tempW * normFactor
        return(list(W = newSigs,
                    H = newExpo))
      })
    names(k_list) <- names(WMatrixList(nmf.exp)[[k_ind]])
    return(k_list)
  })
  names(all_list) <- names(WMatrixList(nmf.exp))
  thisWMatrixList <- lapply(all_list, function(current_k_list){
    kWMatrixList <- lapply(current_k_list, function(current_entry){
      return(current_entry$W)
    })
  })
  nmf.exp <- setWMatrixList(nmf.exp, thisWMatrixList)
  thisHMatrixList <- lapply(all_list, function(current_k_list){
    kHMatrixList <- lapply(current_k_list, function(current_entry){
      return(current_entry$H)
    })
  })
  nmf.exp <- setHMatrixList(nmf.exp, thisHMatrixList)
  return(nmf.exp)
}

#' Regularize the signatures matrix (H)
#'
#' After row regularization of the matrix H, the inverse factors are
#' mutiplied with the columns of W in order to keep the matrix product W*H
#' constant.
#'
#' @param nmf.exp
#'
#' @return A data structure of type nmfExperiment
#'
#' @export
#'
#' @examples
#'  NULL
#'
regularizeH <- function(nmf.exp){
  # account for WMatrixList and HMatrixList
  all_list <- lapply(seq_along(WMatrixList(nmf.exp)), function(k_ind){
    k_list <-
      lapply(seq_along(WMatrixList(nmf.exp)[[k_ind]]), function(init_ind){
        tempW <- WMatrixList(nmf.exp)[[k_ind]][[init_ind]]
        tempH <- HMatrixList(nmf.exp)[[k_ind]][[init_ind]]
        normFactor <- rowMax(tempH)
        newExpo <- tempH / normFactor
        newSigs <- tempW * normFactor
        return(list(W = newSigs,
                    H = newExpo))
      })
    names(k_list) <- names(WMatrixList(nmf.exp)[[k_ind]])
    return(k_list)
  })
  names(all_list) <- names(WMatrixList(nmf.exp))
  thisWMatrixList <- lapply(all_list, function(current_k_list){
    kWMatrixList <- lapply(current_k_list, function(current_entry){
      return(current_entry$W)
    })
  })
  nmf.exp <- setWMatrixList(nmf.exp, thisWMatrixList)
  thisHMatrixList <- lapply(all_list, function(current_k_list){
    kHMatrixList <- lapply(current_k_list, function(current_entry){
      return(current_entry$H)
    })
  })
  nmf.exp <- setHMatrixList(nmf.exp, thisHMatrixList)
  return(nmf.exp)
}

#' Merge two objects of type nmfExperiment
#'
#' @param in_nmf1 First object of type nmfExperiment
#' @param in_nmf2 Second object of type nmfExperiment
#' @param rerunStats
#'  Boolean to indicate whether statistics should be computed on the merged
#'  object
#' @param verbose
#'
#' @return The merged object of type nmfExperiment
#' @export
#'
#' @examples
#'  NULL
#'
merge.nmf <- function(in_nmf1,
                      in_nmf2,
                      rerunStats = TRUE,
                      verbose = FALSE){
  error_msg <- "Bratwurst:::merge.nmf::error: input type mismatch.\n"
  ## check if the two stem from the same input
  if(all(names(in_nmf1) == names(in_nmf2)) &
     length(assays(in_nmf1)) == length(assays(in_nmf2))){
    if(verbose) cat("Bratwurst:::merge.nmf::verbose:Features of the two",
                    "input data structures match.\n")
    assayLogicVector <-
      unlist(lapply(seq_along(assays(in_nmf1)), function(current_ind){
        temp1 <- assays(in_nmf1)[[current_ind]]
        temp2 <- assays(in_nmf2)[[current_ind]]
        all(temp1 == temp2)
      }))
    if(all(assayLogicVector) &
       all(names(HMatrixList(in_nmf1)) == names(HMatrixList(in_nmf2))) &
       all(names(WMatrixList(in_nmf1)) == names(WMatrixList(in_nmf2)))){
      if(verbose) cat("Bratwurst:::merge.nmf::verbose:Assays of the two",
                      "input data structures match.\n")
      ## initialize
      if(verbose) cat("Bratwurst:::merge.nmf::verbose:Initialize.\n")
      merged.nmf.exp <- in_nmf1
      ## merge HMatrixList
      if(verbose) cat("Bratwurst:::merge.nmf::verbose:Concatenate",
                      "HMatrixList.\n")
      temp_list <-
        lapply(names(HMatrixList(in_nmf1)), function(currentRank){
          c(HMatrixList(in_nmf1, k = currentRank),
            HMatrixList(in_nmf2, k = currentRank))
        })
      names(temp_list) <- names(HMatrixList(in_nmf1))
      merged.nmf.exp <- setHMatrixList(merged.nmf.exp, temp_list)
      ## merge WMatrixList
      if(verbose) cat("Bratwurst:::merge.nmf::verbose:Concatenate",
                      "WMatrixList.\n")
      temp_list <-
        lapply(names(WMatrixList(in_nmf1)), function(currentRank){
          c(WMatrixList(in_nmf1, k = currentRank),
            WMatrixList(in_nmf2, k = currentRank))
        })
      names(temp_list) <- names(WMatrixList(in_nmf1))
      merged.nmf.exp <- setWMatrixList(merged.nmf.exp, temp_list)
      ## merge FrobError
      if(verbose) cat("Bratwurst:::merge.nmf::verbose:Concatenate",
                      "FrobError.\n")
      merged.nmf.exp@FrobError <- rbind(FrobError(in_nmf1), FrobError(in_nmf2))
      ## recalculate OptKStats if already available
      if(verbose) cat("Bratwurst:::merge.nmf::verbose:Recalculate error",
                      "statistics if necessary.\n")
      # if(sum(dim(OptKStats(in_nmf1))) + sum(dim(OptKStats(in_nmf2))) > 0){
      if(rerunStats){
        ## recalculate FrobErrorStats
        if(verbose) cat("Bratwurst:::merge.nmf::verbose:Recalculate",
                        "FrobErrorStats.\n")
        merged.nmf.exp <- computeFrobErrorStats(merged.nmf.exp)
        ## recalculate Alexandrov Criterion
        if(verbose) cat("Bratwurst:::merge.nmf::verbose:Recalculate",
                        "Alexandrov Criterion.\n")
        merged.nmf.exp <- computeSilhoutteWidth(merged.nmf.exp)
        # recalculate Cophenetic correlation coefficient
        if(verbose) cat("Bratwurst:::merge.nmf::verbose:Recalculate",
                        "Cophenetic correlation coefficient.\n")
        merged.nmf.exp <- computeCopheneticCoeff(merged.nmf.exp)
        # recalculate Amari type distance
        if(verbose) cat("Bratwurst:::merge.nmf::verbose:Recalculate",
                        "Amari type distance.\n")
        merged.nmf.exp <- computeAmariDistances(merged.nmf.exp)
      }
      return(merged.nmf.exp)
    } else {
      cat(error_msg)
      return(NULL)
    }
  } else {
    cat(error_msg)
    return(NULL)
  }
}

#' Compute signature specific features
#'
#' @param nmf.exp A NMF experiment object
#' @param rowDataId The index of the rowData(nmf.exp) data.frame that should be used for
#'  feature extraction. In case rowData(nmf.exp)[,1] is a GRanges or a related object like
#'  GenomicInteractions this parameter can be ignored
#'
#' @return nmf.exp with filles SignatureSpecificFeatures container
#'
#'
#' @export
#'
#' @examples
#'
computeSignatureSpecificFeatures <- function(nmf.exp, rowDataId = 3){
  if (length(OptK(nmf.exp)) == 0){
    stop("You need to first define an optimal k before being able to compute
           signature specific features!")
  } else {
    if (nrow(FeatureStats(nmf.exp)) == 0){
      message("Computing feature stats...")
      nmf.exp <- computeFeatureStats(nmf.exp)
    }
    fstats <- FeatureStats(nmf.exp)
    # identify unique cluster membership strings
    clusterMemberships <-
      sapply(unique(as.character(fstats$cluster)),
             function (x) lengths(regmatches(x, gregexpr("1", x))))
    sigSpecClusters <-
      sort(names(clusterMemberships[which(clusterMemberships == 1)]),
           decreasing = TRUE)

    if (class(rowData(nmf.exp)[, 1]) %in% c("GRanges", "GInteractions",
                                           "GenomicInteractions")){
      signatureSpecificFeatures <- lapply(sigSpecClusters, function(ssc){
        features <- rowData(nmf.exp)[, 1][which(fstats$cluster == ssc)]
        return(features)
      })
    }else{
      signatureSpecificFeatures <- lapply(sigSpecClusters, function(ssc){
        features <- rowData(nmf.exp)[, rowDataId][which(fstats$cluster == ssc)]
        return(features)
      })
    }
    names(signatureSpecificFeatures) <- sigSpecClusters
    nmf.exp@SignatureSpecificFeatures <- signatureSpecificFeatures
    return(nmf.exp)
  }
}
wurst-theke/bratwurst documentation built on May 17, 2019, 9:14 a.m.