# R/helper-functions.R

# Defines functions: mapMEROPSIDs, matchTermini, getSeqData

##***********************************************************************
##
##     proteasy helper functions for internal use
##
##***********************************************************************

#########################################################################
###
### Retrieve sequence data
##

# Retrieve amino-acid sequences for a set of protein accessions.
#
# Arguments:
#   method    "Rcpi" (fetch via Rcpi::getSeqFromUniProt) or "ensembldb"
#             (look up in a locally installed EnsDb annotation package).
#   protein   Vector of protein accessions; passed to Rcpi or used as the
#             UniprotFilter value for ensembldb.
#   organism  Only read by the "ensembldb" method. One of "Homo sapiens",
#             "Mus musculus" or "Rattus norvegicus".
#
# Returns a data.table with the columns `seq_name` and `sequence`.
# Stops with a descriptive error when `method` or `organism` is unsupported.
getSeqData <- function(method, protein, organism) {

    if (method == "Rcpi") {

        # Rcpi method: one download per distinct accession
        fasta <- Rcpi::getSeqFromUniProt(unique(protein), parallel = 5)

        # Each element is reduced to a (seq_name, sequence) pair. The inner
        # sub() keeps the field between the "|" delimiters of the entry name
        # (presumably the accession in a UniProt FASTA header -- confirm);
        # the outer sub() strips anything left from the first "|" onwards.
        p <- data.table::as.data.table(t(
            vapply(fasta, FUN.VALUE = data.table::data.table("x", "y"),
                   FUN = function(x) data.table::data.table(
                       seq_name = as.character(
                           sub("\\|.*", "",
                               sub(".*\\|(.*)\\|.*", "\\1",
                                   names(x)))),
                       sequence = as.character(x[[1]]))
            )))

    } else if (method == "ensembldb") {

        # ensembldb method: pick the annotation database for the organism.
        # switch() yields NULL when no alternative matches.
        edb <- switch(organism,
            "Homo sapiens" = EnsDb.Hsapiens.v86::EnsDb.Hsapiens.v86,
            "Mus musculus" = EnsDb.Mmusculus.v79::EnsDb.Mmusculus.v79,
            "Rattus norvegicus" = EnsDb.Rnorvegicus.v79::EnsDb.Rnorvegicus.v79)

        if (is.null(edb)) {
            stop("Unsupported organism: \"", organism, "\". ",
                 "Use \"Homo sapiens\", \"Mus musculus\" or ",
                 "\"Rattus norvegicus\".", call. = FALSE)
        }

        p <- data.table::as.data.table(
            ensembldb::proteins(edb,
                    filter = AnnotationFilter::UniprotFilter(protein),
                    columns = c("uniprot_id", "protein_sequence"),
                    return.type = "data.frame"))

        # Harmonise column names with the Rcpi branch
        p <- data.table::setnames(x = p,
                                  old = c("uniprot_id", "protein_sequence"),
                                  new = c("seq_name", "sequence"))

        # Keep the first row per accession and only the two output columns
        p <- p[!duplicated(p$seq_name), c("seq_name", "sequence")]

    } else {

        # Without this branch `p` would be undefined and the function would
        # fail with an unrelated "object 'p' not found" error.
        stop("Unsupported method: \"", method, "\". ",
             "Use \"Rcpi\" or \"ensembldb\".", call. = FALSE)

    }

    return(p)

}

#########################################################################
###
### Match N/C termini between user input and MEROPS db
##

# Match peptide termini in `input` against cleavage records in `mer`.
#
# Arguments:
#   input  Table with the columns `protein`, `start_pos` and `end_pos`.
#   mer    Table with the columns `Substrate (Uniprot)` and
#          `Residue number`.
#
# Both tables are converted with as.data.table() before keys are set.
#
# Returns a data.table holding the N-terminus matches followed by the
# C-terminus matches, with a `terminus` column ("N" or "C").
# Stops with "No matches found." when neither join yields a row.
matchTermini <- function(input, mer) {

    input <- data.table::as.data.table(input)
    mer <- data.table::as.data.table(mer)

    # `mer` needs to be keyed only once; both joins below reuse this key
    data.table::setkeyv(mer, c("Substrate (Uniprot)", "Residue number"))

    # Find N-terminus matches (protein + start position)
    data.table::setkeyv(input, c("protein", "start_pos"))
    N <- input[mer, nomatch = NULL]
    N$terminus <- "N"

    # Find C-terminus matches (protein + end position)
    data.table::setkeyv(input, c("protein", "end_pos"))
    C <- input[mer, nomatch = NULL]
    C$terminus <- "C"

    # Scalar condition, so use the short-circuiting `&&`
    if (nrow(N) == 0L && nrow(C) == 0L) {
        stop("No matches found.", call. = FALSE)
    }

    r <- data.table::rbindlist(list(N, C))

    return(r)

}


#########################################################################
###
### Map MEROPS ID to Uniprot
##

# Annotate matched cleavage records with entries from the MEROPS mapping.
#
# Arguments:
#   r           Table with the columns `Protease (MEROPS)` and
#               `Substrate organism`.
#   merops_map  Table with the columns `Protease (MEROPS)`,
#               `Protease organism` and `Protease status`.
#
# The mapping is limited to proteases whose organism occurs among the
# substrate organisms in `r`, joined to `r` on `Protease (MEROPS)` (rows
# without a partner are dropped), and the result is sorted by ascending
# `Protease status`.
mapMEROPSIDs <- function(r, merops_map) {

    # Local placeholders for symbols used via non-standard evaluation
    # (presumably to keep R CMD check quiet -- confirm)
    `Protease status` <- NULL
    `Protease organism` <- NULL
    .N <- NULL
    seq_name <- NULL

    join_col <- "Protease (MEROPS)"

    protease_map <- data.table::as.data.table(merops_map)
    hits <- data.table::as.data.table(r)

    # Only proteases from an organism that also appears as substrate organism
    protease_map <- protease_map[
        `Protease organism` %in% hits$`Substrate organism`
    ]

    # Keyed join: every record in `hits` is paired with all mapping rows
    # sharing its MEROPS identifier
    data.table::setkeyv(hits, join_col)
    data.table::setkeyv(protease_map, join_col)
    annotated <- protease_map[hits, nomatch = NULL, allow.cartesian = TRUE]

    annotated[order(`Protease status`, decreasing = FALSE)]

}
# martinry/proteasy documentation built on Feb. 5, 2024, 4:04 p.m.