R/cc_stats.R

Defines functions cc_stats

Documented in cc_stats

#' Provide statistics on the CCs size
#'
#' Provides the CC size distribution, which is the number of CCs including a
#' given number of protein members, and the proportion of single- vs
#' multi-protein CCs.
#' @param incM a \code{logical} \code{matrix} containing the incidence matrix
#' with its column and row names (respectively, protein and peptide identifiers)
#' names and 0 or 1 values indicating whether or not the peptide maps on the
#' corresponding protein.
#' @param cc.proteins a \code{list} of \code{vectors} (one for each connected
#' component), each enumerating the proteins members of a connected component.
#' @param reducedIncM a \code{logical} value indicating if the input matrix is
#' the original complete incidence matrix or a reduced version of it which only
#' contains proteins with at least one shared peptide and all peptides mapping
#' on such proteins (generated by the \code{\link{reduce_inc_matrix}} function.
#' @return a \code{list} of four outputs: i. a \code{vector} containing all
#' those proteins which belong to single-protein connected component (each
#' protein is a single-protein connected component); ii. an \code{integer}
#' representing the number of single-protein CCs; iii. an \code{integer}
#' representing the number of multi-protein CCs; iv. a \code{data.frame}
#' describing size distribution of connected components (number of connected
#' components per number of protein members)
#' @examples
#' # Read the tab-delimited file containing he proteome incidence matrix
#' incM_filename <- system.file("extdata"
#'                              , "incM_example"
#'                              , package = "net4pg"
#'                              , mustWork = TRUE)
#' rownames_filename <- system.file("extdata"
#'                                   , "peptideIDs_incM_example"
#'                                   , package = "net4pg"
#'                                   , mustWork = TRUE)
#' colnames_filename <- system.file("extdata"
#'                                  , "proteinIDs_incM_example"
#'                                  , package = "net4pg"
#'                                  , mustWork = TRUE)
#' incM <- read_inc_matrix(incM_filename = incM_filename
#'                  , colnames_filename = colnames_filename
#'                  , rownames_filename = rownames_filename)
#' # Only retain proteins with at least one shared peptide and all peptides
#' # mapping on such proteins.
#' incM_reduced <- reduce_inc_matrix(incM)
#' # Generate adjacency matrix describing protein-to-protein mappings
#' adjM <- get_adj_matrix(incM_reduced)
#' # Generate graph of protein-to-protein connections and calculate its
#' # connected components
#' multProteinCC <- get_cc(adjM)
#' # CCs size and percentage of single-vs multi-protein CCs
#' CCstatsOut <- cc_stats(incM = incM_reduced
#'                       , cc.proteins = multProteinCC$ccs
#'                       , reducedIncM = TRUE)
#'
#' @author Laura Fancello
#'
#' @export
#'

cc_stats <- function(incM, cc.proteins, reducedIncM) {

  # Sanity Checks  ----------------------------------------------------------
  ## Check input arguments
  if (is.null(incM)) {
    stop("argument 'incM' is missing, with no default")
  }
  if (!(methods::is(incM)[1] == "matrix")) {
    stop("argument 'incM' is not a matrix")
  }
  if (is.null(cc.proteins)) {
    stop("argument 'cc.proteins' is missing, with no default")
  }
  if (!(methods::is(cc.proteins)[1] == "list")) {
    stop("argument 'cc.proteins' is not a list")
  }
  if (is.null(reducedIncM)) {
    stop("argument 'reducedIncM' is missing, with no default")
  }
  if (!(methods::is(reducedIncM)[1] == "logical")) {
    stop("argument 'reducedIncM' is not a logical")
  }

  # Statistics on connected components size   --------------------------------
  ## Number of protein members for each multi-protein connected component
  N_proteins <- unlist(lapply(cc.proteins, length))
  N_prots_N_CCs <- as.data.frame(table(N_proteins))
  colnames(N_prots_N_CCs) <- c("N_proteins", "N_CCs")
  N_prots_N_CCs$N_proteins <- as.numeric(as.vector(N_prots_N_CCs$N_proteins))

  ## Number of single-protein connected components
  if (isTRUE(reducedIncM)) {

    `%notin%` <- Negate(`%in%`)
    indexes_singleProtCC <- which(colnames(incM) %notin% unlist(cc.proteins))
    N_singleProtCC <- length(indexes_singleProtCC)
    singleProtCC <- colnames(incM)[indexes_singleProtCC]
    N_multiProtCC <- length(cc.proteins)
    N_prots_N_CCs <- rbind(N_prots_N_CCs, c(1, N_singleProtCC))

  }else{
    N_singleProtCC <- sum(N_prots_N_CCs[N_prots_N_CCs$N_proteins == 1, ]$N_CCs)
    N_multiProtCC <- sum(N_prots_N_CCs[N_prots_N_CCs$N_proteins > 1, ]$N_CCs)
    N_prots_N_CCs <- rbind(N_prots_N_CCs, c(1, N_singleProtCC))
  }

  ## Number connected components with 1, 2, ..., 10, >10 proteins
  proteins <- as.data.frame(c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
  colnames(proteins) <- "N_proteins"
  NprotsDistrib <- merge(proteins, N_prots_N_CCs
                                 , by = "N_proteins"
                                 , all.x = TRUE)
  NprotsDistrib[is.na(NprotsDistrib)] <- 0
  over10prot <- c(">10", sum(N_prots_N_CCs[N_prots_N_CCs$N_proteins > 10, ]$N_CCs))
  NprotsDistrib <- rbind(NprotsDistrib, over10prot)


  ## Clean memory
  rm(proteins, N_proteins, indexes_singleProtCC)
  gc()

  ## Return output
  return(list(singleProtCC = singleProtCC
              , N_singleProtCC = N_singleProtCC
              , N_multiProtCC = N_multiProtCC
              , NproteinsDistribution = NprotsDistrib))

}
laurafancello/CCs4prot documentation built on July 1, 2022, 1:34 p.m.