R/Proteins-methods.R
In Pbase: Manipulating and exploring protein and proteomics data

Documented in aa acols avarLabels pcols pfeatures pranges pvarLabels

setMethod("Proteins", c("missing", "missing"),
          function(file, uniprotIds, ...){
              aa <- new("AAStringSet")
              aa@elementMetadata <- DataFrame()
              new("Proteins",
                  metadata = list(created = date()),
                  aa = aa)
          })

setMethod("Proteins",
          signature(file = "character", uniprotIds = "missing"),
          function(file, uniprotIds, ...) {
              aa <- .readAAStringSet(file, ...)
              metadata <- list(created = date())
              new("Proteins", metadata = metadata, aa = aa)
          })

setMethod("Proteins",
          signature(file = "missing", uniprotIds = "character"),
          function(file, uniprotIds, ...) {
              .toBeImplemented()
          })

############################################################
## Proteins for EnsDb, character, use uniprotIds in an UniprotidFilter.
## Proteins for EnsDb, missing, ... -> check if we've got filter specified,
##  otherwise just return everything.
## Add also an optional argument loadDomains = TRUE -> load the protein domains.

############################################################
## Proteins, EnsDb, missing
##
##' @title Fetch a Proteins object from an EnsDb database
##'
##' @description Fetch a \code{Proteins} object from an
##'     \code{\linkS4class[ensembldb]{EnsDb}} database.
##' 
##' @param file EnsDb database from which protein data should/can be
##'     retrieved.
##'
##' @param uniprotIds missing.
##'
##' @param loadProteinDomains \code{logical(1)} defining whether
##'     protein domains within the proteins' sequences should be
##'     loaded too. If \code{TRUE}, protein domains are loaded and
##'     added a \code{pranges} named \textit{ProteinDomains}.
##'
##' @param filter An object extending
##'     \code{\linkS4Object[AnnotationFilter]{AnnotationFilter}} an
##'     \code{\link[AnnotationFilter]{AnnotationFilterList}} object combining
##'     such objects or a \code{formula} representing a filter expression. See
##'     documentation on \code{\link[ensembldb]{proteins}} for more details and
##'     examples.
##'
##' @param columns Character vector specifying the columns that should be
##'     returned from the database. By default, all columns from the
##'     \emph{protein} database table are returned. Use the
##'     \code{\link[ensembldb]{listColumns}} function to get a list of all
##'     supported columns. Note that exon-related columns are not supported for
##'     the \code{Proteins} method.
##'
##' @param fetchLRG Logical indicating whether proteins for Locus Reference
##'     Genes (LRG) should be retrieved too. By default LRG genes will not be
##'     fetched as they do have a 1:n mapping between transcripts and proteins.
##'
##' @param ... Additional arguments to be passed to the
##'     \code{\link[ensembldb]{proteins}} method that is used to fetch the data.
##'
##' @noRd
setMethod("Proteins",
          signature(file = "EnsDb", uniprotIds = "missing"),
          function(file, uniprotIds, loadProteinDomains = TRUE,
                   filter = AnnotationFilterList(), columns = NULL,
                   fetchLRG = FALSE, ...) {
              ## Get the data from EnsDb.
              if (!hasProteinData(file))
                  stop("The provided 'EnsDb' does not contain protein annotations!")
              ## Check 'columns':
              ## o add all columns from the protein table.
              ## o ensure we have no column from the exon or tx2exon table.
              ## o load, if necessary, columns from the protein_domain table.
              columns <- unique(c(columns, listColumns(file, "protein")))
              not_allowed <- listColumns(file, c("exon", "tx2exon"))
              not_allowed <- not_allowed[not_allowed != "tx_id"]
              if (any(columns %in% not_allowed)) {
                  warning("For 'Proteins', fetching exon-related columns is not",
                          " allowed! Columns ",
                          paste0("'", columns[columns %in% not_allowed], "'",
                                 collapse = ", "), " have been removed.")
                  columns <- columns[!(columns %in% not_allowed)]
              }
              if (loadProteinDomains) {
                  columns <- unique(c(columns,
                                      listColumns(file, "protein_domain")))
              }
              if (!fetchLRG) {
                  if (is(filter, "formula"))
                      filter <- AnnotationFilter(filter)
                  ## Add a filter to fetch only transcripts starting with ENS
                  filter <- AnnotationFilterList(
                      TxIdFilter("ENS", condition = "startsWith"),
                      filter)
              }
              ## Now fetch the data:
              res <- ensembldb::proteins(file, filter = filter,
                                         columns = columns,
                                         return.type = "data.frame")
              if (nrow(res) == 0)
                  return(Proteins())
              ## Get the unique data for protein:
              ## For now we're removing the Uniprot IDs. Problem is the 1:n
              ## mapping from protein_id to uniprot_id. Solutions:
              ## 1) return a redundant list of proteins, i.e. replicate the
              ##    proteins, one for each Uniprot ID.
              ## 2) return a unique list of proteins adding the Uniprot IDs as
              ##    a list to the mcols of the AAStringSet (i.e. acols).
              pids <- unique(res$protein_id)
              prt_cn <- colnames(res)[!(colnames(res) %in%
                                        listColumns(file, c("protein_domain")))]
              prt <- unique(res[, c("protein_id", prt_cn)])
              aa <- AAStringSet(prt$protein_sequence)
              names(aa) <- prt$protein_id
              mcols(aa) <- prt[, !colnames(prt) %in% c("protein_id",
                                                       "protein_sequence"),
                               drop = FALSE]
              ## Create the Proteins object.
              pr <- new("Proteins",
                        aa = aa,
                        metadata = list(created = date(),
                                        ensembl_version = ensemblVersion(file)))
              ## Fetch protein domain data
              if (loadProteinDomains) {
                  prng <- IRangesList(list())
                  prt_dom_cols <- listColumns(file, "protein_domain")
                  prt_dom_mcols <- prt_dom_cols[!(prt_dom_cols %in%
                                                  c("protein_domain_id",
                                                    "prot_dom_start",
                                                    "prot_dom_end"))]
                  prt_dom <- unique(res[, prt_dom_cols, drop = FALSE])
                  ## Remove empty ones
                  prt_dom <- prt_dom[!is.na(prt_dom$prot_dom_start), ,
                                     drop = FALSE]
                  if (nrow(prt_dom) > 0) {
                      prng <- IRanges(start = prt_dom$prot_dom_start,
                                      end = prt_dom$prot_dom_end)
                      names(prng) <- prt_dom$protein_domain_id
                      mcols(prng) <- prt_dom[, prt_dom_mcols]
                      prng <- split(prng,
                                    f = factor(prt_dom$protein_id,
                                               levels = unique(prt_dom$protein_id)))
                  }
                  ## Fill empty ones.
                  if (!all(pids %in% prt_dom$protein_id)) {
                      miss_prt <- pids[!(pids %in% prt_dom$protein_id)]
                      tmp <- replicate(length(miss_prt),
                                       .emptyProtDomIRanges(cols = prt_dom_mcols))
                      names(tmp) <- miss_prt
                      prng <- c(prng, IRangesList(tmp))
                  }
                  ## Ensure correct ordering and ensure correct mapping e.g.
                  ## if we have 1:n mapping for protein_id to uniprot_id.
                  ## issue #35
                  prng <- prng[names(aa)]
                  mcols(pr@aa)$ProteinDomains <- prng
              }
              if (validObject(pr))
                  return(pr)
          })

## Simple helper function to create an IRanges with mcols.
.emptyProtDomIRanges <- function(cols) {
    res <- IRanges()
    tmp <- DataFrame(matrix(ncol = length(cols), nrow = 0))
    colnames(tmp) <- cols
    mcols(res) <- tmp
    return(res)
}

setMethod("[", "Proteins",
          function(x, i, j = "missing", ..., drop) {
              if (!missing(j) || length(list(...)) > 0L)
                  stop("invalid subsetting")
              if (missing(i) || (is.logical(i) && all(i)))
                  return(x)
              if (is.logical(i))
                  i <- which(i)
              if (is.character(i))
                  i <- match(i, seqnames(x))
              if (!is.numeric(i) || any(is.na(i)))
                  stop("invalid subsetting")
              if (any(i < 1) || any(i > length(x)))
                  stop("subscript out of bounds")
              x@aa <- x@aa[i]
              x
          })


## accessors
## setMethod("pfeatures", "Proteins",
##           function(x) extractAt(aa(x), unname(pranges(x))))

pfeatures <- function(x, pcol) {
    pcol <- .checkPcol(x, pcol)
    ## stopifnot(pcol %in% pvarLabels(x))
    extractAt(aa(x), unname(pranges(x)[[pcol]]))
}

## setMethod("pranges", "Proteins", function(x) x@pranges)
pranges <- function(x) {
    i <- .get_pranges_indices(x)
    mcols(x@aa)[i]
}

## setReplaceMethod("pranges",
##                  c("Proteins", "CompressedIRangesList"),
##                  function(object, value)
##                      replacePranges(object, value))

setReplaceMethod("acols",
                 c("Proteins", "DataFrame"),
                 function(object, value)
                     replaceAcols(object, value))

setMethod("length", "Proteins",
          function(x) length(x@aa))

setMethod("metadata", "Proteins",
          function(x) x@metadata)

## signature is (x, ..., value)
setReplaceMethod("metadata", "Proteins",
                 function(x, name, value) {
                     if (name == "created")
                         stop("Creation date can't be modified.")
                   x@metadata[[name]] <- value
                   x
               })

pcols <- function(x) {
    i <- .get_pranges_indices(x)
    mcols(x@aa)[i]
}

acols <- function(x) {
    i <- .get_pranges_indices(x)
    mcols(x@aa)[!i]
}



setMethod("seqnames","Proteins",
          function(x) names(aa(x)))

setMethod("names","Proteins",
          function(x) seqnames(x))

avarLabels <- function(x) {
    i <- .get_pranges_indices(x)
    names(mcols(x@aa))[!i]
}

pvarLabels <- function(x) {
    i <- .get_pranges_indices(x)
    names(mcols(x@aa))[i]
}

setMethod("[[", "Proteins",
          function(x, i, j = missing, ..., drop = TRUE) x@aa[[i]])

aa <- function(x) x@aa

## Methods
setMethod("addIdentificationData",
          c("Proteins", "character"),
          function(object, id, rmEmptyRanges = TRUE, par = Pparams()) {
              if ("Peptides" %in% names(object@aa@elementMetadata)) {
                  warning("Peptides pranges already present. Keeping as is.")
                  return(object)
              }
              .addIdentificationDataProteins(object, filenames = id,
                                             rmEmptyRanges = rmEmptyRanges,
                                             par = par)
          })

setMethod("addPeptideFragments",
          c("Proteins", "character"),
          function(object, filenames, rmEmptyRanges = TRUE, par = Pparams()) {
              .addPeptideFragmentsProteins(object, filenames = filenames,
                                           rmEmptyRanges = rmEmptyRanges,
                                           par = par)
          })

setMethod("cleave", "Proteins",
          function(x, enzym = "trypsin", missedCleavages = 0, ...) {
              nm <- paste0(enzym, "Cleaved")
              if (nm %in% avarLabels(x)) {
                  message(nm, " already exists. Leaving object as is.")
              } else {
                  rng <- cleavageRanges(x = x@aa, enzym = enzym,
                                        missedCleavages = missedCleavages,
                                        ...)                  
                  mcols(x@aa)[, nm] <- IRangesList(lapply(rng, function(r) {
                      mc <- missedCleavages[cumsum(start(r) == 1L)]
                      mcols(r) <- DataFrame(MissedCleavages = Rle(mc))
                      r
                  }))
              }
              x
          })

setMethod("plot",
          signature(x = "Proteins", y = "missing"),
          function(x, y, ...) .plotProteins(x, ...))

setMethod("pfilter",
          "Proteins",
          function(x, mass = NULL, len = NULL, ...) {
              .pfilterProteins(x, mass = mass, len = len, ...)
          })

setMethod("show", "Proteins",
          function (object) {
              topics <- c("S4 class type", "Class version", "Created",
                          "Number of Proteins")
              topics <- format(topics, justify = "left")
              n <- length(object)
              values <- c(class(object),
                          tail(as(classVersion(object), "character"), 1L),
                          object@metadata$created, n)
              cat(paste0(topics, ": ", values, collapse = "\n"), sep = "\n")
              if (length(object) > 0) {
                  sn <- seqnames(object)
                  ln <- length(object)
                  cat("Sequences:\n  ")
                  htcat(seqnames(object), n = 2)
                  cat("Protein ranges:")
                  pvl <- pvarLabels(object)
                  if (length(pvl)) {
                      cat("\n  ")
                      htcat(pvarLabels(object), n = 2)
                  } else cat(" None\n")
              }
          })


## New suggested methods:
## + width: return the width(x@aa)
## + names,Proteins: returns seqnames,Proteins
Any scripts or data that you put into this service are public.
Pbase documentation built on Oct. 31, 2019, 2:20 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
Pbase
Manipulating and exploring protein and proteomics data

R/Proteins-methods.R
In Pbase: Manipulating and exploring protein and proteomics data

Defines functions .emptyProtDomIRanges pfeatures pranges pcols acols avarLabels pvarLabels aa

Documented in aa acols avarLabels pcols pfeatures pranges pvarLabels

Try the Pbase package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

Pbase Manipulating and exploring protein and proteomics data

R/Proteins-methods.R In Pbase: Manipulating and exploring protein and proteomics data

Defines functions .emptyProtDomIRanges pfeatures pranges pcols acols avarLabels pvarLabels aa

Documented in aa acols avarLabels pcols pfeatures pranges pvarLabels

Try the Pbase package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

Pbase
Manipulating and exploring protein and proteomics data

R/Proteins-methods.R
In Pbase: Manipulating and exploring protein and proteomics data