#' Provide statistics on the CCs size
#'
#' Provides the CC size distribution, which is the number of CCs including a
#' given number of protein members, and the proportion of single- vs
#' multi-protein CCs.
#' @param incM a \code{logical} \code{matrix} containing the incidence matrix
#' with its column and row names (respectively, protein and peptide identifiers)
#' names and 0 or 1 values indicating whether or not the peptide maps on the
#' corresponding protein.
#' @param cc.proteins a \code{list} of \code{vectors} (one for each connected
#' component), each enumerating the proteins members of a connected component.
#' @param reducedIncM a \code{logical} value indicating if the input matrix is
#' the original complete incidence matrix or a reduced version of it which only
#' contains proteins with at least one shared peptide and all peptides mapping
#' on such proteins (generated by the \code{\link{reduce_inc_matrix}} function.
#' @return a \code{list} of four outputs: i. a \code{vector} containing all
#' those proteins which belong to single-protein connected component (each
#' protein is a single-protein connected component); ii. an \code{integer}
#' representing the number of single-protein CCs; iii. an \code{integer}
#' representing the number of multi-protein CCs; iv. a \code{data.frame}
#' describing size distribution of connected components (number of connected
#' components per number of protein members)
#' @examples
#' # Read the tab-delimited file containing he proteome incidence matrix
#' incM_filename <- system.file("extdata"
#' , "incM_example"
#' , package = "net4pg"
#' , mustWork = TRUE)
#' rownames_filename <- system.file("extdata"
#' , "peptideIDs_incM_example"
#' , package = "net4pg"
#' , mustWork = TRUE)
#' colnames_filename <- system.file("extdata"
#' , "proteinIDs_incM_example"
#' , package = "net4pg"
#' , mustWork = TRUE)
#' incM <- read_inc_matrix(incM_filename = incM_filename
#' , colnames_filename = colnames_filename
#' , rownames_filename = rownames_filename)
#' # Only retain proteins with at least one shared peptide and all peptides
#' # mapping on such proteins.
#' incM_reduced <- reduce_inc_matrix(incM)
#' # Generate adjacency matrix describing protein-to-protein mappings
#' adjM <- get_adj_matrix(incM_reduced)
#' # Generate graph of protein-to-protein connections and calculate its
#' # connected components
#' multProteinCC <- get_cc(adjM)
#' # CCs size and percentage of single-vs multi-protein CCs
#' CCstatsOut <- cc_stats(incM = incM_reduced
#' , cc.proteins = multProteinCC$ccs
#' , reducedIncM = TRUE)
#'
#' @author Laura Fancello
#'
#' @export
#'
cc_stats <- function(incM, cc.proteins, reducedIncM) {
# Sanity Checks ----------------------------------------------------------
## Check input arguments
if (is.null(incM)) {
stop("argument 'incM' is missing, with no default")
}
if (!(methods::is(incM)[1] == "matrix")) {
stop("argument 'incM' is not a matrix")
}
if (is.null(cc.proteins)) {
stop("argument 'cc.proteins' is missing, with no default")
}
if (!(methods::is(cc.proteins)[1] == "list")) {
stop("argument 'cc.proteins' is not a list")
}
if (is.null(reducedIncM)) {
stop("argument 'reducedIncM' is missing, with no default")
}
if (!(methods::is(reducedIncM)[1] == "logical")) {
stop("argument 'reducedIncM' is not a logical")
}
# Statistics on connected components size --------------------------------
## Number of protein members for each multi-protein connected component
N_proteins <- unlist(lapply(cc.proteins, length))
N_prots_N_CCs <- as.data.frame(table(N_proteins))
colnames(N_prots_N_CCs) <- c("N_proteins", "N_CCs")
N_prots_N_CCs$N_proteins <- as.numeric(as.vector(N_prots_N_CCs$N_proteins))
## Number of single-protein connected components
if (isTRUE(reducedIncM)) {
`%notin%` <- Negate(`%in%`)
indexes_singleProtCC <- which(colnames(incM) %notin% unlist(cc.proteins))
N_singleProtCC <- length(indexes_singleProtCC)
singleProtCC <- colnames(incM)[indexes_singleProtCC]
N_multiProtCC <- length(cc.proteins)
N_prots_N_CCs <- rbind(N_prots_N_CCs, c(1, N_singleProtCC))
}else{
N_singleProtCC <- sum(N_prots_N_CCs[N_prots_N_CCs$N_proteins == 1, ]$N_CCs)
N_multiProtCC <- sum(N_prots_N_CCs[N_prots_N_CCs$N_proteins > 1, ]$N_CCs)
N_prots_N_CCs <- rbind(N_prots_N_CCs, c(1, N_singleProtCC))
}
## Number connected components with 1, 2, ..., 10, >10 proteins
proteins <- as.data.frame(c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
colnames(proteins) <- "N_proteins"
NprotsDistrib <- merge(proteins, N_prots_N_CCs
, by = "N_proteins"
, all.x = TRUE)
NprotsDistrib[is.na(NprotsDistrib)] <- 0
over10prot <- c(">10", sum(N_prots_N_CCs[N_prots_N_CCs$N_proteins > 10, ]$N_CCs))
NprotsDistrib <- rbind(NprotsDistrib, over10prot)
## Clean memory
rm(proteins, N_proteins, indexes_singleProtCC)
gc()
## Return output
return(list(singleProtCC = singleProtCC
, N_singleProtCC = N_singleProtCC
, N_multiProtCC = N_multiProtCC
, NproteinsDistribution = NprotsDistrib))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.