BioCor: Functional similarities

Documented in combineScores combineScoresPar

# combineScores ####
#' Combining values
#'
#' Combine several similarities into one using several methods.
#'
#' The input matrix can be a base matrix or a matrix from package Matrix.
#' The methods return:
#'  - **avg**: The average or mean value.
#'  - **max**: The max value.
#'  - **rcmax**: The max of the column means or row means.
#'  - **rcmax.avg**: The sum of the max values by rows and columns divided by
#' the number of columns and rows.
#'  - **BMA**: The same as `rcmax.avg`.
#'  - **reciprocal**: The double of the sum of the reciprocal maximal
#' similarities (above  a threshold) divided by the number of elements.
#' See equation 3 of the Tao *et al* 2007 article.
#' @param scores Matrix of scores to be combined
#' @param method one of `c("avg", "max", "rcmax", "rcmax.avg", "BMA",
#' "reciprocal")`, see Details.
#' @param round Should the resulting value be rounded to the third digit?
#' @param t Numeric value to filter scores below this value. Only used in the
#' reciprocal method.
#' @return A numeric value as described in details.
#' @note `combineScores` is a version of the function of the same name in
#' package GOSemSim ([GOSemSim::combineScores()]) with optional
#' rounding and some internal differences.
#' @export
#' @author Lluís Revilla based on Guangchuang Yu.
#' @references
#' Ying Tao, Lee Sam, Jianrong Li, Carol Friedman, Yves A. Lussier;
#' Information theory applied to the sparse gene ontology annotation network to
#' predict novel gene function. Bioinformatics 2007; 23 (13): i529-i538.
#' doi: 10.1093/bioinformatics/btm195
#' @examples
#' (d <- structure(c(
#'     0.4, 0.6, 0.222222222222222, 0.4, 0.4, 0, 0.25, 0.5,
#'     0.285714285714286
#' ),
#' .Dim = c(3L, 3L),
#' .Dimnames = list(c("a", "b", "c"), c("d", "e", "f"))
#' ))
#' e <- d
#' sapply(c("avg", "max", "rcmax", "rcmax.avg", "BMA", "reciprocal"),
#'     combineScores,
#'     scores = d
#' )
#' d[1, 2] <- NA
#' sapply(c("avg", "max", "rcmax", "rcmax.avg", "BMA", "reciprocal"),
#'     combineScores,
#'     scores = d
#' )
combineScores <- function(scores, method = c(
  "max", "avg", "rcmax", "rcmax.avg", "BMA",
  "reciprocal"
), round = FALSE, t = 0) {
  # Check input
  method <- match.arg(
    method,
    c(
      "avg", "max", "rcmax", "rcmax.avg", "BMA",
      "reciprocal"
    )
  )


  if (is.list(scores) && length(scores) == 1) {
    scores <- scores[[1]]
  }

  if (!is.Matrix(scores)) {
    stop("scores argument should be a matrix")
  }
  if (length(round) != 1 || !is.logical(round)) {
    stop("round argument is not logical")
  }

  if (any(dim(scores) == 0L)) {
    return(NA)
  }

  if (!is.numeric(t) || t < 0 || t > 1) {
    stop("t is for the reciprocal method, and should be between 0 and 1")
  }

  scores <- removeNA(scores)

  # Apply the methods
  if (method == "avg") {
    result <- mean(scores, na.rm = TRUE)
  } else if (method == "max") {
    result <- max(scores, na.rm = TRUE)
  } else if (is.Matrix(scores)) {
    if (method == "rcmax") {
      result <- rcmax(scores)
    } else if (method == "rcmax.avg" || method == "BMA") {
      result <- BMA(scores)
    } else if (method == "reciprocal") {
      result <- reciprocal(scores, t)
    }
  } else {
    warning(
      "Using max method because after removing NAs ",
      "there isn't a matrix to use."
    )
    result <- max(scores, na.rm = TRUE)
  }

  # Return the value
  rounder(result, round)
}

rounder <- function(result, round) {
  if (round) {
    return(round(result, digits = 3))
  } else {
    return(result)
  }
}
removeNA <- function(m) {
  # Remove NA
  if (any(is.na(m)) && is.Matrix(m)) {
    row.na.idx <- apply(m, 1, function(i) {
      all(is.na(i))
    })
    col.na.idx <- apply(m, 2, function(i) {
      all(is.na(i))
    })
    if (any(row.na.idx)) {
      m <- m[-which(row.na.idx), , drop = FALSE]
    }
    if (any(col.na.idx)) {
      m <- m[, -which(col.na.idx)]
    }
  }
  m
}

# Check Matrix classes and matrix
#' @importFrom methods is
is.Matrix <- function(m) {
  is.matrix(m) || is(m, "Matrix")
}

BMA <- function(scores) {
  if (length(dim(scores)) != 2) {
    stop("scores must be a matrix.")
  }
  sum(
    apply(scores, 1, max, na.rm = TRUE),
    apply(scores, 2, max, na.rm = TRUE)
  ) / sum(
    dim(scores)
  )
}
rcmax <- function(scores) {
  if (length(dim(scores)) != 2) {
    stop("scores must be a matrix.")
  }
  col <- mean(apply(scores, 1, max, na.rm = TRUE))
  row <- mean(apply(scores, 2, max, na.rm = TRUE))
  max(col, row, na.rm = TRUE)
}

reciprocal <- function(scores, t) {
  if (t < 0 || t > 1) {
    stop("t must be between 1 and 0")
  }
  if (length(dim(scores)) != 2) {
    stop("scores must be a matrix.")
  }
  # Find the ones that max row is also the max col
  fmax <- function(x) {
    x == max(x, na.rm = TRUE)
  }
  # Max in rows
  # Transpose to correct for the fact that apply fill by column
  rowsMax <- which(t(apply(scores, 1, fmax)))
  # Max in columns
  colsMax <- which(apply(scores, 2, fmax))
  rScores <- scores[intersect(rowsMax, colsMax)]
  # In case of a draw select the first 4
  # (it wouldn't matter as it should be all 1)
  rScores <- rScores[seq_len(min(ncol(scores), nrow(scores)))]
  rScores <- rScores[!is.na(rScores)]

  if (all(rScores < t)) {
    NA
  } else {
    2 * sum(rScores[rScores >= t]) / (sum(dim(scores)))
  }
}

#' `cobmineScoresPar` performs multiple `combineScores`. It allows
#' parallel computing using BiocParallel back-end.
#' @param subSets List of combinations as info in other functions.
#' @param BPPARAM BiocParallel back-end parameters.
#' By default (`NULL`) a `for` loop is used.
#' @param ... Other arguments passed to `combineScores`
#' @seealso [register][BiocParallel::register] in BiocParallel about the arguments
#' accepted by BPPARAM.
#' @rdname combineScores
#' @export
#' @import Matrix
#' @import BiocParallel
#' @importFrom utils combn
#' @examples
#' colnames(e) <- rownames(e)
#' combineScoresPar(e, list(a = c("a", "b"), b = c("b", "c")),
#'     method = "max"
#' )
combineScoresPar <- function(scores,
                             method,
                             subSets = NULL,
                             BPPARAM = NULL,
                             ...) {

  # Check scores
  if (is.null(subSets) || sum(dim(scores)) == 0) {
    return(combineScores(scores, method = method, ...))
    # To handle cases when it is already simplified
  } else if (length(subSets) == 1) {
    return(combineScores(scores, method = method, ...))
  } else {
    # To handle cases where subSets are not present in scores
    B <- matrix(NA,
                ncol = length(subSets), nrow = length(subSets),
                dimnames = list(names(subSets), names(subSets))
    )
    subSets <- keepSubSet(subSets, scores)
    if (length(subSets) == 0) {
      return(B)
    }
  }

  # all combinations of indices
  ij <- combn(seq_along(subSets), 2) # Use fun

  # add all i = j to the combination of indices
  ij <- matrix(c(ij, rep(seq_along(subSets), each = 2)), nrow = 2)

  if (is.null(BPPARAM)) { # If not a parallel background is provided
    # only one loop
    res <- numeric(ncol(ij)) # preallocate
    for (k in seq_len(ncol(ij))) {
      rowIds <- subSets[[ij[1, k]]]
      colIds <- subSets[[ij[2, k]]]
      if (anyNA(c(rowIds, colIds)) || is.null(colIds) || is.null(rowIds)) {
        res[k] <- NA
      } else {
        res[k] <- combineScores(scores[rowIds, colIds, drop = FALSE],
                                method,
                                ... = ...
        )
      }
    }
  } else {
    # Use the parallel background provided
    res <- bplapply(seq_len(ncol(ij)), function(x) {
      if (ncol(ij) < x) {
        message(print(ij))
      }
      rowIds <- subSets[[ij[1, x]]]
      colIds <- subSets[[ij[2, x]]]
      if (anyNA(c(rowIds, colIds)) || is.null(rowIds) || is.null(colIds)) {
        NA
      } else {
        combineScores(scores[rowIds, colIds, drop = FALSE],
                      method,
                      ... = ...
        )
      }
    }, BPPARAM = BPPARAM)
    res <- as.numeric(res)
  }

  A <- sparseMatrix(
    i = ij[1, ], j = ij[2, ],
    x = res, dims = rep(length(subSets), 2),
    symmetric = TRUE, index1 = TRUE,
    dimnames = list(names(subSets), names(subSets))
  )
  AintoB(as.matrix(A), B)
}


keepSubSet <- function(subSets, scores) {
  # Check the ids
  subId <- unique(unlist(subSets, use.names = FALSE))
  values <- unlist(dimnames(scores), use.names = FALSE)

  if (anyNA(subId)) {
    keep <- sapply(subSets, check_in, values = values)
    subSets[keep]
  } else {
    subSets
  }
}

check_in <- function(x, values) {
  if (all(is.na(x) | is.null(x))) {
    FALSE
  } else {
    all(x %in% values)
  }
}