# R/xcombr_db.R

#' @import dplyr
#' @importFrom tidyr unnest
#' @importFrom stringr str_detect str_match str_replace_all
#' @importFrom tibble as_tibble
#' @importFrom cleaver cleavageRanges cleave
#'
#' @include xlinkTbl.R

# Build a sequence database of crosslinked peptide pairs.
#
# Arguments:
#   x                    Sequences to process. Handed to cleaver's cleave() and
#                        cleavageRanges() when `digest` is TRUE; otherwise
#                        as.character(x) and names(x) are used directly.
#                        (Presumably an AAStringSet or a named character vector
#                        of protein sequences -- confirm against callers.)
#   xlSpecificity        Name of a row in `xlinkTbl` (matched ignoring case), or
#                        "custom" to use `customSpecificity`.
#   linkType             "inter", "intra" or "both" (matched ignoring case);
#                        which kinds of pairs are kept in the result.
#   minLength, maxLength Inclusive bounds on peptide width.
#   peptideTerminalLink  If TRUE, every peptide is wrapped in the lowercase
#                        markers "n"/"c" before the specificity regexes run.
#   proteinTerminalLink  If TRUE (and `peptideTerminalLink` is FALSE), only the
#                        first and last peptide of each protein get a marker.
#   customSpecificity    Character vector of two regular expressions, one per
#                        side of the crosslink; used when xlSpecificity is
#                        "custom".
#   digest               Single TRUE/FALSE: digest `x` with cleaver, or treat
#                        each element of `x` as one peptide.
#   ...                  Forwarded to cleave() and cleavageRanges().
#   max50                Checked by .checkArgs() but not otherwise used here.
#
# Returns:
#   An AAStringSet whose sequences are the two peptides pasted together and
#   whose names are "<accession> a=<seq1> <range1> <name1> cx A=<seq2> ...".
.xcombr_db <- function(x,
                       xlSpecificity = "Amine:Amine",
                       linkType = "both",
                       minLength = 6,
                       maxLength = 24,
                       peptideTerminalLink = FALSE,
                       proteinTerminalLink = TRUE,
                       customSpecificity = NULL,
                       digest = TRUE,
                       ...,
                       max50 = TRUE) {

    # Define valid arg values --------------------------------------------------
    linkTypeValues <- c("inter", "intra", "both")
    xlSpecificityValues <- c(xlinkTbl$name, "custom")

    # Check args ---------------------------------------------------------------
    .checkArgs(xlSpecificity,
               linkType,
               minLength,
               maxLength,
               peptideTerminalLink,
               proteinTerminalLink,
               customSpecificity,
               max50,
               # possible values for arguments:
               linkTypeValues,
               xlSpecificityValues)

    # `digest` drives an `if` below, so it must be a single non-missing logical.
    if (length(digest) != 1 || !is.logical(digest) || is.na(digest)) {
        stop("digest must be a logical value (TRUE/FALSE)")
    }

    # Validation ignores case, so normalise before the values are compared.
    linkType <- tolower(linkType)
    specKey <- tolower(xlSpecificity)

    # Retrieve regular expressions based on `xlSpecificity`
    if (specKey == "custom") {
        xlReg1 <- customSpecificity[1]
        xlReg2 <- customSpecificity[2]
    } else {
        specRow <- tolower(xlinkTbl$name) == specKey
        xlReg1 <- xlinkTbl$regex1[specRow]
        xlReg2 <- xlinkTbl$regex2[specRow]
    }

    # Perform digest using cleaver (but return a tibble)
    if (digest) {
        seqTbl <- as_tibble(as.data.frame(cleave(x, ..., unique = FALSE)))
        pos <- as_tibble(as.data.frame(cleavageRanges(x, ...)))

        seqTbl <- cbind(seqTbl, pos[, c("start", "end", "width")])

    } else { # or not: every input sequence is a single "peptide"
        seqTbl <- tibble(group = seq_along(x),
                         group_name = names(x),
                         value = as.character(x),
                         start = 1,
                         end = nchar(value),
                         width = nchar(value))

    }

    seqTbl <- seqTbl %>%
        rename(name = group_name,
               seq = value) %>%
        select(-group) %>%
        mutate(seq = toupper(seq))


    # Mark termini -------------------------------------------------------------
    # Sequences are upper case at this point, so the lowercase "n" / "c"
    # markers cannot collide with residue letters.
    if (peptideTerminalLink) {
        seqTbl <- seqTbl %>%
            mutate(seq = paste0("n", seq, "c"))
    } else if (proteinTerminalLink) {
        seqTbl <- seqTbl %>%
            group_by(name) %>%
            mutate(seq = ifelse(start == min(start), paste0("n", seq), seq),
                   seq = ifelse(end == max(end), paste0(seq, "c"), seq))
    }


    # Filter peptides based on args --------------------------------------------
    # `width` was computed before the markers were added, so the length limits
    # apply to the bare peptide.
    seqTbl <- seqTbl %>%
        ungroup() %>%
        mutate(range = paste0("{", start, "-", end, "}")) %>%
        filter(width >= minLength,
               width <= maxLength) %>%
        select(-start, -end, -width)

    # Peptides matching one side of the crosslink: markers are stripped after
    # matching and the ranges of identical peptides within a protein are
    # collapsed into one comma-separated string.
    peptidesMatching <- function(pattern) {
        seqTbl %>%
            filter(str_detect(seq, pattern)) %>%
            mutate(seq = str_replace_all(seq, "[nc]", "")) %>%
            group_by(name, seq) %>%
            summarize(range = paste(range, collapse = ",")) %>%
            ungroup()
    }

    # One table per side of the crosslink; the second side is only needed when
    # the two specificities differ.
    links <- peptidesMatching(xlReg1)

    if (xlReg1 != xlReg2) {
        links <- bind_rows(links, peptidesMatching(xlReg2))
    }


    # Create crosslinked pairs -------------------------------------------------
    combos <- as_tibble(expand.grid(seq1 = links$seq,
                                    seq2 = links$seq,
                                    stringsAsFactors = FALSE)) %>%
        left_join(links, by = c("seq1" = "seq")) %>%
        left_join(links, by = c("seq2" = "seq"), suffix = c("1", "2")) %>%
        # peptides listed more than once in `links` fan out in the joins
        distinct() %>%
        group_by(seq1, seq2, name1, name2) %>%
        mutate(shortName1 = str_match(name1, "(^\\S+)")[, 2],
               shortName2 = str_match(name2, "(^\\S+)")[, 2],
               accession = paste(sort(c(shortName1, shortName2)),
                                 collapse = "_"),
               type = ifelse(name1 == name2, "intra", "inter"),
               seq = paste0(seq1, seq2),
               description = paste0("a=", seq1, " ", range1, " ", name1,
                                    " cx ",
                                    "A=", seq2, " ", range2, " ", name2),
               header = paste0(accession, " ", description)) %>%
        ungroup()

    if (linkType != "both") {
        combos <- filter(combos, type == linkType)
    }

    combos <- select(combos, seq, header)

    # return AAStringSet -------------------------------------------------------
    seqs <- combos$seq
    names(seqs) <- combos$header

    AAStringSet(seqs)
}




# Argument Checking
#
# Validates the arguments of .xcombr_db() and stops with a descriptive error
# at the first invalid one.
#
# Arguments:
#   xlSpecificity        Single string; must match an element of
#                        `xlSpecificityValues`, ignoring case.
#   linkType             Single string; must match an element of
#                        `linkTypeValues`, ignoring case.
#   minLength, maxLength Single non-negative whole numbers with
#                        maxLength >= minLength.
#   peptideTerminalLink, proteinTerminalLink, max50, digest
#                        Single non-missing logical values.
#   customSpecificity    Character vector of length 2; only checked when
#                        `xlSpecificity` is "custom" (ignoring case).
#   linkTypeValues       Character vector of accepted `linkType` values.
#   xlSpecificityValues  Character vector of accepted `xlSpecificity` values.
#
# `digest` is last and has a default so that existing calls passing only the
# first ten arguments keep working.
#
# Returns NULL invisibly; called for its side effect of raising errors.
.checkArgs <- function(xlSpecificity,
                       linkType,
                       minLength,
                       maxLength,
                       peptideTerminalLink,
                       proteinTerminalLink,
                       customSpecificity,
                       max50,
                       linkTypeValues,
                       xlSpecificityValues,
                       digest = TRUE) {

    # TRUE for a single, non-missing logical value.
    isFlag <- function(v) {
        length(v) == 1 && is.logical(v) && !is.na(v)
    }

    # TRUE for a single, finite, non-negative whole number. The finiteness
    # test comes first so NA/NaN/Inf never reach the comparisons.
    isCount <- function(v) {
        length(v) == 1 && is.numeric(v) && is.finite(v) &&
            v == floor(v) && v >= 0
    }

    # xlSpecificity
    if (!is.character(xlSpecificity) || length(xlSpecificity) != 1) {
        stop("xlSpecificity must be a character vector of length 1")
    } else if (!(tolower(xlSpecificity) %in% tolower(xlSpecificityValues))) {
        stop("Unrecognized xlSpecificity. Refer to ?xcomb for options.")
    }

    # linkType
    if (!is.character(linkType) || length(linkType) != 1) {
        stop("linkType must be a character vector of length 1")
    } else if (!(tolower(linkType) %in% tolower(linkTypeValues))) {
        stop("Unrecognized linkType. Must be one of: ",
             paste(linkTypeValues, collapse = ", "), ".")
    }

    # minLength
    if (!isCount(minLength)) {
        stop("minLength must be a single positive integer value")
    }

    # maxLength
    if (!isCount(maxLength)) {
        stop("maxLength must be a single positive integer value")
    } else if (maxLength < minLength) {
        stop("maxLength must be greater than or equal to minLength")
    }

    # peptideTerminalLink
    if (!isFlag(peptideTerminalLink)) {
        stop("peptideTerminalLink must be a logical value (TRUE/FALSE)")
    }

    # proteinTerminalLink
    if (!isFlag(proteinTerminalLink)) {
        stop("proteinTerminalLink must be a logical value (TRUE/FALSE)")
    }

    # digest
    if (!isFlag(digest)) {
        stop("digest must be a logical value (TRUE/FALSE)")
    }

    # customSpecificity (only relevant for the "custom" specificity)
    if (tolower(xlSpecificity) == "custom") {
        if (is.null(customSpecificity)) {
            stop("customSpecificity is required when xlSpecificity = 'custom'.")
        } else if (!is.character(customSpecificity) ||
                   length(customSpecificity) != 2 ||
                   anyNA(customSpecificity)) {
            stop("customSpecificity must be a character vector of length 2.")
        }
    }

    # max50
    if (!isFlag(max50)) {
        stop("max50 must be a logical value (TRUE/FALSE).")
    }

    invisible(NULL)
}
# wfondrie/xCombR documentation built on May 15, 2019, 5:34 p.m.