
# Defines functions:
#   .get_non_AA_characters
#   .check_presense_of_flanking_AAs
#   .map_peptide_position

# Register names that are used as bare symbols (e.g. `cleanSeq`, `ProtSeq`,
# `First_AA` and the magrittr `.` placeholder in this file's pipelines) so
# that `R CMD check` does not report them as undefined global variables.
# Each name is listed once.
utils::globalVariables(c(".", "trimmedPeptide", "x",
                         "First_AA", "Last_AA",
                         "ProtSeq", "cleanSeq", "PepLoc", "ModShift",
                         "SiteLoc", "ModAAs",
                         "Site", "SiteCollapsed", "SiteCollapsedFirst"))

.get_non_AA_characters <- function(object, 
                                  column_name, 
                                  extra_allowed_chars = ""){
    # Returns any non-AA characters found in the (presumably peptide)
    # sequences stored in `object[[column_name]]`.
    #
    # Arguments:
    #   object              anything that supports `[[` extraction by name
    #   column_name         name of the element/column holding the sequences
    #   extra_allowed_chars character vector of additional single characters
    #                       that should not be reported (e.g. "." or "-")
    #
    # Value:
    #   character vector with the distinct characters that are neither in
    #   Biostrings::AA_STANDARD nor in `extra_allowed_chars`, in order of
    #   first appearance; character(0) if there are none.
    sequences <- unique(object[[column_name]])
    # Glue all sequences together and split into single characters.
    present_chars <- strsplit(paste0(sequences, collapse = ""),
                              split = "")[[1]]
    # setdiff() de-duplicates its result, so no separate unique() is needed.
    other_chars <- setdiff(present_chars, 
                           c(Biostrings::AA_STANDARD, extra_allowed_chars))
    return(other_chars)
}

.check_presense_of_flanking_AAs <- function(object, column_name){
    # TRUE if peptides are in X.XXXXXXX.X format, i.e. every distinct value
    # of `object[[column_name]]` contains
    # <one char> "." <one or more chars> "." <one char>.
    #
    # Arguments:
    #   object       anything that supports `[[` extraction by name
    #   column_name  name of the element/column holding the peptide strings
    #
    # Value:
    #   a single logical. The pattern is not anchored, so a match anywhere
    #   in the string counts. NA values do not match (grepl() gives FALSE).
    #   For an empty column the result is TRUE, because all() of an empty
    #   vector is TRUE.
    unique_peptides <- unique(object[[column_name]])
    flanking_AA_present <- all(grepl(".\\..+\\..", unique_peptides))
    return(flanking_AA_present)
}

.map_peptide_position <- function(object, fasta, accession_col = "accession"){
    # Locates every peptide within its protein sequence and stores the
    # positions in the `psms` slot of `object`.
    #
    # Arguments:
    #   object         object accepted by .make_clean_seq(), psms() and
    #                  apply_filter() that has a `psms` slot
    #                  (presumably an MSnID object -- confirm with callers)
    #   fasta          named collection of protein sequences; as.data.frame()
    #                  on it must yield a column `x` holding the sequences,
    #                  with the entry names as row names
    #   accession_col  name of the column in psms(object) with the protein ID
    #
    # Value:
    #   `object` with these columns added to its `psms` table:
    #     First_AA, Last_AA              list columns with the start/end of
    #                                    every match of the peptide
    #     First_AA_First, Last_AA_First  numeric start/end of the first match
    #     ProtLen                        length of the protein sequence
    #
    # Errors:
    #   stops if FASTA entry names are duplicated or if none of the protein
    #   IDs is present among the FASTA entry names.

    object <- .make_clean_seq(object, "peptide")
    x <- psms(object)
    # protein ID in `accession`
    # peptide sequence with flanking AAs in `peptide`

    # check for decoy entries: if there are decoy identifications but none of
    # their accessions is in the FASTA, append reversed sequences named
    # "XXX_<original name>"
    decoy_acc <- apply_filter(object, "isDecoy")[[accession_col]]
    if (length(decoy_acc) > 0 && !any(decoy_acc %in% names(fasta))) {
        fasta_rev <- reverse(fasta)
        names(fasta_rev) <- paste0("XXX_", names(fasta))
        fasta <- c(fasta, fasta_rev)
    }

    # check if fasta entry names are unique
    if (any(duplicated(names(fasta)))) {
        stop("FASTA entry names are not unique!\n")
    }

    # check if there is at least some agreement in IDs
    if (length(intersect(x[[accession_col]], names(fasta))) == 0) {
        stop("There is zero overlap in protein IDs and FASTA entry names!\n")
    }

    # merger of identifications and FASTA
    # one row per accession/peptide pair, so that the join back to the full
    # PSM table further down keeps exactly one row per PSM
    prot_pep <- x %>%
        select(!!accession_col, cleanSeq) %>%
        distinct()

    res <- fasta %>%
        as.data.frame() %>%
        rownames_to_column(accession_col) %>%
        dplyr::rename(ProtSeq = x) %>%
        mutate(ProtLen = str_length(ProtSeq)) %>%
        left_join(prot_pep, ., by = accession_col)

    # locating peptide within protein
    # .x is the protein sequence, .y the clean peptide sequence
    res <- res %>%
        mutate(First_AA = map2(ProtSeq, 
                               cleanSeq, 
                               ~ as.numeric(str_locate_all(.x, .y)[[1]][,1])),
               Last_AA = map2(First_AA, nchar(cleanSeq) - 1, `+`),
               First_AA_First = map(First_AA, ~ .[1]) %>% as.numeric(),
               Last_AA_First = map(Last_AA, ~ .[1]) %>% as.numeric())

    # drop Protein Sequences
    res <- res %>%
        select(-ProtSeq)

    # linking back to the main MSnID object by accession and cleanSeq
    # the linking is two-step to avoid problems with RAM
    res_full <- x %>%
        select(!!accession_col, cleanSeq) %>%
        left_join(res, by=c(accession_col, "cleanSeq"))

    # clean-up
    # VERSION 1
    columns_to_add <- c("First_AA", "Last_AA",
                        "First_AA_First", "Last_AA_First", "ProtLen")
    # assign
    object_psms <- copy(object@psms) # preventing side-effect on original object
    object_psms <- data.table(object_psms) # safety step to avoid "Invalid .internal.selfref"
    object_psms[, (columns_to_add) := res_full[,columns_to_add]]
    object@psms <- object_psms

    return(object)
}
