BioCor: Functional similarities

Documented in combinadic duplicateIndices inverseList removeDup seq2mat

# combinadic ####
#' i-th combination of n elements taken from r
#'
#' Function similar to combn but for larger vectors. To avoid allocating a big
#' vector with all the combinations each one can be computed with this
#' function.
#' @param n Elements to extract the combination from
#' @param r Number of elements per combination
#' @param i ith combination
#' @return The combination ith of the elements
#' @rdname combinadic
#' @seealso [combn()]
#' @export
#' @examples
#' # Output of all combinations
#' combn(LETTERS[1:5], 2)
#' # Otuput of the second combination
#' combinadic(LETTERS[1:5], 2, 2)
#' @author Joshua Ulrich
#' @references
#' [StackOverflow answer
#' 4494469/2886003](http://stackoverflow.com/a/4494469/2886003)
combinadic <- function(n, r, i) {

  # http://msdn.microsoft.com/en-us/library/aa289166(VS.71).aspx
  # http://en.wikipedia.org/wiki/Combinadic
  n0 <- length(n)
  if (i < 1L || i > choose(n0, r)) {
    stop("'i' must be 0 < i <= n0!/(n0-r)!")
  }
  largestV <- function(n, r, i) {
    v <- n # Adjusted for one-based indexing
    while (choose(v, r) >= i) { # Adjusted for one-based indexing
      v <- v - 1L
    }
    return(v)
  }

  res <- rep(NA, r)
  for (j in 1L:r) {
    res[j] <- largestV(n0, r, i)
    i <- i - choose(res[j], r)
    n0 <- res[j]
    r <- r - 1L
  }
  res <- res + 1L
  res <- n[res]
  return(res)
}

# seq2mat ####
#' Transforms a vector to a symmetric matrix
#'
#' Fills a matrix of `ncol = length(x)` and `nrow = length(x)` with
#' the values in `dat` and setting the diagonal to 1.
#'
#' `dat` should be at least `choose(length(x), 2)` of length. It
#' assumes that the data provided comes from using the row and column id to
#' obtain it.
#' @param x names of columns and rows, used to define the size of the matrix
#' @param dat Data to fill with the matrix with except the diagonal.
#' @return A square matrix with the diagonal set to 1 and `dat` on the
#' upper and lower triangle with the columns ids and row ids from x.
#' @examples
#' seq2mat(LETTERS[1:5], 1:10)
#' seq2mat(LETTERS[1:5], seq(from = 0.1, to = 1, by = 0.1))
#' @export
#' @seealso [upper.tri()] and [lower.tri()]
#' @author Lluís Revilla
seq2mat <- function(x, dat) {
  if (length(dat) != choose(length(x), 2L)) {
    stop("Data is not enough big to populate the matrix")
  }
  out <- matrix(ncol = length(x), nrow = length(x))
  out[upper.tri(out)] <- unlist(dat, use.names = TRUE)
  out[lower.tri(out)] <- t(out)[lower.tri(t(out))]
  diag(out) <- 1L
  rownames(out) <- colnames(out) <- x
  return(out)
}

# duplicateIndices ####
#' Finds the indices of the duplicated events of a vector
#'
#' Finds the indices of duplicated elements in the vector given.
#'
#' For each duplication it can return a list or if all the duplication events
#' are of the same length it returns a matrix, where each column is duplicated.
#' @param vec Vector of identifiers presumably duplicated
#' @return The format is determined by the simplify2array
#' @export
#' @author Lluís Revilla
#' @seealso [removeDup()]
#' @examples
#' duplicateIndices(c("52", "52", "53", "55")) # One repeated element
#' duplicateIndices(c("52", "52", "53", "55", "55")) # Repeated elements
#' duplicateIndices(c("52", "55", "53", "55", "52")) # Mixed repeated elements
duplicateIndices <- function(vec) {
  if (!is.character(vec)) {
    stop("Expected a list of characters to find duplicates on it")
  }
  sapply(unique(vec[duplicated(vec)]), function(x) {
    b <- seq_len(length(vec))
    b[vec == x]
  }, simplify = FALSE)
}

# removeDup ####
#' Remove duplicated rows and columns
#'
#' Given the indices of the duplicated entries remove the columns and rows
#' until just one is left, it keeps the duplicated with the highest absolute
#' mean value.
#'
#' @param cor_mat List of matrices
#' @param dupli List of indices with duplicated entries
#' @return A matrix with only one of the columns and rows duplicated
#' @export
#' @author Lluís Revilla
#' @seealso [duplicateIndices()] to obtain the list of indices with
#' duplicated entries.
#' @examples
#' a <- seq2mat(c("52", "52", "53", "55"), runif(choose(4, 2)))
#' b <- seq2mat(c("52", "52", "53", "55"), runif(choose(4, 2)))
#' mat <- list("kegg" = a, "react" = b)
#' mat
#' dupli <- duplicateIndices(rownames(a))
#' remat <- removeDup(mat, dupli)
#' remat
removeDup <- function(cor_mat, dupli) {
  if (!all(sapply(cor_mat, isSymmetric))) {
    stop(
      "All the matrices of mat should be symmetric and with the same ",
      "column names and rownames"
    )
  }
  cor_mat <- Map(function(mat, x = dupli) {
    rem.colum <- sapply(x, function(y, m) {
      mean.column <- apply(m[, y], 2L, mean, na.rm = TRUE)
      i <- which.max(abs(mean.column))
      # Select those who don't bring more information
      setdiff(y, y[i])
    }, m = mat)

    mat[-rem.colum, -rem.colum]
  }, cor_mat)
  return(cor_mat)
}

#' Invert a list
#'
#' Calculate the pathways per gene of list
#'
#' @param x A list with genes as names and names of pathways as values of the
#' list
#' @return The number of pathways each gene has.
#' @author Lluís Revilla
#' @export
inverseList <- function(x) {
  stopifnot(length(names(x)) == length(x))
  stopifnot(all(sapply(x, function(x) {
    is.character(x) || is.na(x)
  })))
  genes <- unlist(x, use.names = FALSE)
  pathways <- rep(names(x), lengths(x))
  split(pathways, genes)
}