#' Protein inference
#'
#' The \code{infer_prot} function finds the smallest set of UniProt accessions
#' (\code{UniProtAcc}) that maps to the largest number of amino acid sequences
#' (\code{cleanSeq}).
#'
#' @param x A \code{data.table} output from the \code{control_fdr} function.
#'
#' @return A \code{data.table} with all rows removed corresponding to the
#' UniProt accessions that do not belong to the set of accessions that map to
#' the largest number of \code{cleanSeq}s. The output from this function needs
#' to be kept for the rest of the \code{TopPICR} workflow as it will be used
#' by later functions.
#'
#' @import data.table
#'
#' @author Vlad Petyuk
#'
#' @export
#'
infer_prot <- function (x) {
# Keep three columns of the unique rows of x.
y <- unique(x[,c("cleanSeq","UniProtAcc","AnnType")])
y$AnnType <- ordered(y$AnnType, levels = c("SwissProt","VarSplic","TrEMBL"))
# Convert y to a data table.
setDT(y)
# Order the data table by annotation, accession, and then cleanSeq. This order
# will be carried through the call of the inference function.
setorder(y, AnnType, UniProtAcc, cleanSeq)
# Selects a set of accessions which maps to the most sequences. This is a
# different method from accessions_seq which first selects sequences that only
# map to one accession and then finds all sequences that those accessions map
# to.
res <- inference(y) # Occam's razor?
# Compare x and res row-wise. Keep any row (all columns) in x that matches any
# row in res. The row-wise comparison will be done with all columns that are
# found in both x and res (with res being the limiting factor).
x <- dplyr::semi_join(x, res)
return (x)
}
# infer_prot auxiliary functions -------------------------------------------------
# @author Vlad Petyuk
inference <- function (x) {
res <- list()
# Continue to run until there are no rows in x. At the end of the while loop
# the number of rows in x will be reduced.
while (nrow(x) > 0) {
# First []: Counts the number of times each accession appears in x and
# creates a column that contains the count for each UniProt accession.
# Second []: Finds the row with the accession that has the highest count
# (occurs the most number of times in x). $UniProtAcc: Extracts the UniProt
# accession name with the highest count.
top_prot <- x[, .N, by = UniProtAcc][which.max(N), , ]$UniProtAcc
# Extracts the rows in x whose UniProtAcc value matches top_prot.
top_peps <- subset(x, UniProtAcc == top_prot)
# Add the data frame (top_peps which is a subset of x) to the res list.
res <- c(res, list(top_peps))
# Remove any rows in x whose cleanSeq value matches any of the sequences in
# the top_peps data frame. In other words, any rows in x that contain the
# sequences mapping to the accession with the highest count are removed.
x <- subset(x, !(cleanSeq %in% top_peps[[1]]))
}
return(rbindlist(res,
use.names = FALSE,
fill = FALSE,
idcol = NULL))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.