R/locatePTM.R

Defines functions annotSite locateMod .length_one .covered_set .location .location_start .num_match .rmConfounded .locateCheck locatePTM tidyFasta

Documented in annotSite locateMod locatePTM tidyFasta

#' Read and tidy a FASTA file
#'
#' \code{tidyFasta} reads and tidys FASTA file. Use this function as the first 
#' step in identifying modification sites.
#'
#' @param path A string of path to a FASTA file.
#'
#' @return A data.table with columns named \code{header}, \code{sequence},
#'   \code{uniprot_ac}, \code{uniprot_iso}, \code{entry_name}.
#'
#' @examples
#' tidyFasta(system.file("extdata", "O13297.fasta", package="MSstatsPTM"))
#'
#' @export
#' @importFrom Biostrings readAAStringSet
#' @importFrom data.table data.table is.data.table
#' @importFrom dplyr mutate group_by ungroup bind_rows
tidyFasta = function(path) {
  
  # Check input
  if (missing(path))
    stop(paste0("The input ", sQuote("path"), " is missing"))
  if (!is.character(path))
    stop("Provide the path to the FASTA file as a string")
  if (length(path) != 1)
    stop("Provide only one path to the FASTA file at a time")
  
  aa = readAAStringSet(path)
  aa = as.character(aa)  # named vector
  header = names(aa)
  
  pat_ac = paste0(
    "([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]",
    "([A-Z][A-Z0-9]{2}[0-9]){1,2})"
  )
  pat_iso = paste0(
    "([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]",
    "([A-Z][A-Z0-9]{2}[0-9]){1,2})([-]\\d{1,}){0,1}"
  )
  
  mch_head = regexpr(
    pattern = "([^\\s]*)(?=\\s)", text = header, perl = TRUE
  )
  shead = regmatches(header, m = mch_head)
  
  mch_uniprot = regexpr(pattern = pat_ac, text = shead)
  matched = mch_uniprot != (-1)
  ac = regmatches(shead, regexpr(pattern = pat_ac, text = shead))
  iso = regmatches(shead, regexpr(pattern = pat_iso, text = shead))
  
  trimmed = sub("^(sp|tr)(?=\\|)", "", shead, perl = TRUE)
  trimmed = sub(pat_iso, "", trimmed)
  entry = gsub("\\|", "", trimmed)
  
  data.table(
    header = header[matched], sequence = as.vector(aa[matched]),
    uniprot_ac = ac, uniprot_iso = iso, entry_name = entry[matched]
  )
}

#' Annotate modified sites with associated peptides
#'
#' \code{PTMlocate} annotates modified sites with associated peptides.
#'
#' @param peptide A string vector of peptide sequences. The peptide sequence
#'   does not include its preceding and following AAs.
#' @param uniprot A string vector of Uniprot identifiers of the peptides'
#'   originating proteins. UniProtKB entry isoform sequence is used.
#' @param fasta A data.table with FASTA information. Output of \code{tidyFasta}.
#' @param modResidue A string. Modifiable amino acid residues.
#' @param modSymbol A string. Symbol of a modified site.
#' @param rmConfound A logical. \code{TRUE} removes confounded
#'   unmodified sites, \code{FALSE} otherwise. Default is \code{FALSE}.
#'
#' @return A data frame with three columns: \code{uniprot_iso}, \code{peptide},
#'   \code{site}.
#'
#' @examples
#' fasta = tidyFasta(system.file("extdata", "O13297.fasta", package="MSstatsPTM"))
#' locatePTM("DRVSYIHNDSC*TR", "O13297", fasta, "C", "\\*")
#'
#' @export
locatePTM = function(peptide, uniprot, fasta, modResidue, modSymbol,
                      rmConfound=FALSE) {
  
  if (!.locateCheck(peptide, uniprot, fasta, modResidue, modSymbol))
    stop("Input checking fails!")
  
  peptide_seq = data.table(uniprot_iso = uniprot, peptide = peptide)
  peptide_seq$is_mod = grepl(modSymbol, peptide_seq$peptide)
  peptide_seq$peptide_unmod = gsub(modSymbol, "", peptide_seq$peptide)
  
  # Locate modifiable sites
  col_seq = c("uniprot_iso", "sequence")
  sub_fasta = fasta[fasta$uniprot_iso %in% uniprot, ..col_seq]
  loc = gregexpr(modResidue, sub_fasta$sequence, fixed = TRUE)
  sub_fasta$idx_site_full = lapply(loc, .location_start)
  
  # Locate peptides (use extended AAs for specific matching when possible)
  peptide_fasta = merge(peptide_seq, sub_fasta, all.x = TRUE, 
                         by = "uniprot_iso")
  loc = Map(function(p, s) gregexpr(p, s, fixed = TRUE)[[1]],
             as.list(peptide_fasta$peptide_unmod), as.list(peptide_fasta$sequence))
  n = vapply(loc, .num_match, FUN.VALUE = integer(1))
  peptide_fasta = peptide_fasta[n == 1L, ]
  loc = loc[n == 1L]
  peptide_fasta$idx_peptide = lapply(loc, .location)
  peptide_fasta$aa_start = vapply(loc, .location_start, numeric(1))
  
  # Pattern of modified sites
  mod_pattern = paste0(modResidue, modSymbol)
  
  # Locate modifiable, modified sites (AA residues) associated with peptides
  peptide_fasta$idx_site = Map(.covered_set,
                                peptide_fasta$idx_site_full, 
                                peptide_fasta$idx_peptide)
  peptide_fasta$idx_mod = Map(
    function(p, a) locateMod(p, a, residueSymbol = mod_pattern),
    as.list(peptide_fasta$peptide), as.list(peptide_fasta$aa_start)
  )
  peptide_fasta$mod_aa = Map(
    function(s, i) if (length(i) == 0) character() else substring(s, i, i),
    as.list(peptide_fasta$sequence), peptide_fasta$idx_mod
  )
  # Annotate modified sites
  peptide_fasta$len_site = lapply(
    peptide_fasta$idx_site_full, function(x) nchar(x[length(x)])
  )
  peptide_fasta$site = Map(annotSite,
                            peptide_fasta$idx_mod, peptide_fasta$mod_aa, 
                            peptide_fasta$len_site)
  peptide_fasta$site = as.character(peptide_fasta$site)
  col_fasta = c("uniprot_iso", "peptide", "peptide_unmod", "is_mod",
                 "idx_site", "idx_mod", "site")
  peptide_fasta = peptide_fasta[, ..col_fasta]
  
  .rmConfounded(peptide_fasta, rmConfound)
}

#' @noRd
.locateCheck = function(peptide, uniprot, fasta, modResidue, modSymbol) {
  if (missing(peptide))
    stop("Input peptide is missing!")
  if (missing(uniprot))
    stop("Input uniprot is missing!")
  if (missing(fasta))
    stop("Input fasta is missing!")
  if (missing(modResidue))
    stop("Input modResidue is missing!")
  if (missing(modSymbol))
    stop("Input modSymbol is missing!")
  if (!is.character(peptide))
    stop("Please provide peptide sequence as character in peptide!")
  if (!is.character(uniprot))
    stop("Please provide Uniprot protein ID as character in uniprot!")
  if (length(peptide) != length(uniprot))
    stop("peptide and uniprot must be of the same length")
  if (!is.data.table(fasta))
    stop("Please provide the FASTA information in a data table!")
  if (!all(c("uniprot_iso", "sequence") %in% names(fasta)))
    stop("Uniprot_iso or sequence is missing from FASTA data table!")
  TRUE
}

#' @noRd
#' @importFrom dplyr group_by ungroup bind_rows
.rmConfounded = function(peptideFasta, rmConfound) {
  # Handle confounded unmodified sites
  col_res = c("uniprot_iso", "peptide", "site")
  if (rmConfound) {
    by_prot = group_by(peptideFasta, .data$uniprot_iso)
    by_prot = mutate(
      by_prot, idx_mod_full = list(unique(unlist(.data$idx_mod)))
    )
    by_prot = ungroup(by_prot)
    by_prot = by_prot[!by_prot$is_mod, ]
    cnfnd = vector("logical", nrow(by_prot))
    for (i in seq_along(cnfnd)) {
      cnfnds = by_prot$idx_site[[i]] %in% by_prot$idx_mod_full[[i]]
      cnfnd[i] = any(cnfnds)
    }
    s_unmod = by_prot[!cnfnd, col_res]
    
    n1 = vapply(peptideFasta$idx_mod, .length_one, FUN.VALUE = logical(1))
    s_mod = peptideFasta[peptideFasta$is_mod & n1, ..col_res]
    res = bind_rows(s_mod, s_unmod)
  } else {
    n1 = vapply(peptideFasta$idx_mod, .length_one, FUN.VALUE = logical(1))
    res = peptideFasta[!peptideFasta$is_mod | n1, ..col_res]
  }
  res
}

#' @noRd
.num_match = function(x) {
  ifelse(
    is.na(x) || identical(as.vector(x), -1L),
    0L, length(attr(x, "match.length"))
  )
}

#' @noRd
.location_start = function(x) {
  start = as.vector(x)
  if (identical(start, -1L)) {
    return(integer())
  }
  start
}

#' @noRd
.location = function(x) {
  start = as.vector(x)
  if (identical(start, -1L)) {
    return(cbind(start = integer(), end = integer()))
  }
  end = as.vector(x) + attr(x, "match.length") - 1
  
  cbind(start = start, end = end)
}

#' @noRd
.covered_set = function(x, y) {
  x[x >= y[1] & x <= y[2]]
}

#' @noRd
.length_one = function(x) length(x) == 1

#' Locate modified sites with a peptide
#'
#' \code{locateMod} locates modified sites with a peptide.
#'
#' @param peptide A string. Peptide sequence.
#' @param aaStart An integer. Starting index of the peptide.
#' @param residueSymbol A string. Modification residue and denoted symbol.
#'
#' @return A string.
#'
#' @examples
#' locateMod("P*EP*TIDE", 3, "\\*")
#'
#' @export
locateMod = function(peptide, aaStart, residueSymbol) {
  x = gregexpr(residueSymbol, peptide)[[1]]
  start = as.vector(x)
  if (identical(start, -1L)) {
    return(integer())
  }
  start - seq_along(start) + aaStart
}

#' Annotate modification site
#'
#' \code{annotSite} annotates modified sites as their residues and locations.
#'
#' @param aaIndex An integer vector. Location of the sites.
#' @param residue A string vector. Amino acid residue.
#' @param lenIndex An integer. Default is \code{NULL}
#'
#' @return A string.
#'
#' @examples
#' annotSite(10, "K")
#' annotSite(10, "K", 3L)
#'
#' @export
annotSite = function(aaIndex, residue, lenIndex=NULL) {
  if (length(aaIndex) == 0) {
    return("None")
  } else {
    if (length(residue) != 1 && length(aaIndex) != length(residue))
      stop("Lengths of aaIndex and residue don't match!")
    if (!is.null(lenIndex)) {
      if (!is.integer(lenIndex))
        stop("lenIndex should be an integer!")
      if (max(nchar(aaIndex)) > lenIndex)
        stop("lenIndex doesn't reserve enough space for aaIndex!")
      
      aa_idx_pad = formatC(
        aaIndex, width = lenIndex, format = "d", flag = "0"
      )
      site = paste0(residue, aa_idx_pad, collapse = "-")
    } else {
      site = paste0(residue, aaIndex, collapse = "-")
    }
  }
  site
}
Vitek-Lab/MSstatsPTM documentation built on April 21, 2024, 6:45 a.m.