#' Create an input file for KSP
#'
#' Creates an input file to use for prediction with [KSP](https://doi.org/10.1186/s12864-020-06895-2).
#' Wrapper around [readr::write_tsv()].
#'
#' @param data Data frame in long format, containing gene names and sequence windows.
#' @param path Path to write the file to, including file name and extension.
#' @param name_col Name of the column that contains gene names.
#' @param seq_col Name of the column that contains sequence windows.
#'
#' @return Returns path that file was written to, invisibly.
#' @export
#'
#' @examples
#' kinsub_path <- system.file('extdata', 'Kinase_Substrate_Dataset_head', package = 'phosphocie')
#' kinsub <- read_kinsub(kinsub_path)
#' tmp <- tempfile()
#'
#' write_ksp_input(kinsub, tmp, name_col = 'gene', seq_col = 'fragment_15')
write_ksp_input <- function(data, path, name_col, seq_col) {
if (any(!c(name_col, seq_col) %in% colnames(data))) {
rlang::abort(glue::glue("Please ensure your data contains the supplied column names. \nSupplied names: {paste(name_col, seq_col, sep = ', ')}\nDetected colnames: {paste(colnames(data), collapse = ', ')}"))
}
data %>%
dplyr::select(dplyr::all_of(c(name_col, seq_col))) %>%
readr::write_tsv(path, col_names = FALSE)
print(glue::glue('file created at {path}'))
return(invisible(path))
}
#' Create an input file for KSP's PWM mode
#'
#' Creates an input file to use for prediction with
#' [KSP](https://doi.org/10.1186/s12864-020-06895-2)'s position-weight matrix-based scoring tool.
#' Wrapper around [readr::write_tsv()].
#'
#' @param data Data frame in long format, containing gene names and sequence windows.
#' @param path Path to write the file to, including file name and extension.
#' @param name_col Name of the column that contains gene names.
#' @param seq_col Name of the column that contains sequence windows.
#'
#' @return Returns path that file was written to, invisibly.
#' @export
#'
#' @examples
#' kinsub_path <- system.file('extdata', 'Kinase_Substrate_Dataset_head', package = 'phosphocie')
#' kinsub <- read_kinsub(kinsub_path)
#' tmp <- tempfile()
#'
#' write_pwm_input(kinsub, tmp, name_col = 'gene', seq_col = 'fragment_15')
write_pwm_input <- function(data, path, name_col, seq_col) {
if (any(!c(name_col, seq_col) %in% colnames(data))) {
rlang::abort(glue::glue("Please ensure your data contains the supplied column names. \nSupplied names: {paste(name_col, seq_col, sep = ', ')}\nDetected colnames: {paste(colnames(data), collapse = ', ')}"))
}
data %>%
dplyr::select(dplyr::all_of(c(name_col, seq_col))) %>%
dplyr::bind_cols(placeholder = ' ', .) %>%
readr::write_tsv(path, col_names = FALSE)
print(glue::glue('file created at {path}'))
return(invisible(path))
}
#' Retrieve full protein sequences from UniProt
#'
#' From a vector of uniprot accession IDs, retrieves FASTAs for each protein from
#' the [EMBL Proteins API](https://www.ebi.ac.uk/proteins/api/doc/index.html) and
#' concatenates these into one massive FASTA file.
#' Can serve as NetPhorest input, but [build_fastas()] is the recommended alternative
#' to predict for known sites rather than entire proteins.
#'
#' @param uniprot_acc Character vector of UniProt accession IDs. Supports isoform IDs like `Q02297-6`.
#' @param path Path to write the file to, including file name and extension.
#'
#' @details
#' * Filters out duplicated IDs and invalid IDs.
#' * Requests in batches of 100, which is the max for the API.
#' * If any batch fails, the process is stopped and the file is left as is, containing all
#' successful batches up to then.
#' * Between each batch request, waits for 0.75 seconds to stay below rate limits.
#'
#' @return Returns path of the output FASTA, invisibly.
#' @export
#'
#' @examples
#' kinsub_path <- system.file('extdata', 'Kinase_Substrate_Dataset_head', package = 'phosphocie')
#' kinsub <- read_kinsub(kinsub_path)
#' tmp <- tempfile()
#'
#' \dontrun{
#' retrieve_fastas(kinsub$acc_id, tmp)
#' }
retrieve_fastas <- function(uniprot_acc, path) {
# Filter uniprot IDs
uniprot_acc <- unique(uniprot_acc)
uniprot_regex <- '[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}'
if (any(!stringr::str_detect(uniprot_acc, uniprot_regex))) {
warning(glue::glue("Accession IDs with invalid uniprot ID formats detected. These will be ignored. {paste(uniprot_acc[!str_detect(uniprot_acc, uniprot_regex)], collapse = ',')}"))
uniprot_acc <- uniprot_acc[stringr::str_detect(uniprot_acc, uniprot_regex)]
}
if (any(nchar(uniprot_acc) > 10)) {
warning(glue::glue("Accession IDs with more than 10 characters detected, which is not allowed. These will be ignored. {paste(uniprot_acc[nchar(uniprot_acc) > 10])}" ))
uniprot_acc <- uniprot_acc[nchar(uniprot_acc) <= 10]
}
# Loop over and retrieve for every batch of 100 IDs
sections <- split(uniprot_acc, ceiling(seq_along(uniprot_acc)/100))
counter <- 0
open_file <- file(path, open = 'w+')
for (section in sections) {
counter <- counter + 1
print(glue::glue('Requesting section {counter}/{length(sections)}'))
requestURL <- glue::glue("https://www.ebi.ac.uk/proteins/api/proteins?offset=0&size=-1&accession={paste(section, collapse = ',')}")
r <- httr::GET(requestURL, httr::accept("text/x-fasta"))
if (httr::http_error(r)) {
close(open_file)
print(glue::glue("Problem with section {counter}. Accession IDs of current section: \n {paste(section, collapse = ',')}"))
httr::stop_for_status(r)
}
writeLines(httr::content(r, as = 'text', encoding = "UTF-8"), open_file, sep = '')
Sys.sleep(0.75)
}
close(open_file)
print(glue::glue('Successfully retrieved fasta, saved at {path}'))
return(invisible(path))
}
#' Create a FASTA file with sequence fragments
#'
#' From (potential) sites and their surrounding amino acids, create a FASTA-conforming file.
#' Requires a column of unique header values for each site.
#'
#'
#' @param data Data frame in long format, containing unique names and sequence windows.
#' @param path Path to write the file to, including file name and extension.
#' @param name_col Name of the column that contains metadata about the sequence.
#' Used as a part of header_pattern by default
#' If missing, uses header_pattern to construct unique IDs.
#' @param seq_col Name of the column that contains the sequence windows.
#' @param header_pattern Pattern for use in [glue::glue()], used to construct a new unique header column
#' based on current columns. Default uses name_col and appends 5 aa window
#' around site for unique identification by [read_netphorest()].
#' Any whitespace will be replaced by underscores.
#' If set to NULL, `header_pattern` will just be '{name_col}'.
#'
#' @return Returns the input data with new headers included as fasta_id, invisibly.
#' @export
#'
#' @examples
#' kinsub_path <- system.file('extdata', 'kinase_substrate_dataset_head', package = 'phosphocie')
#' kinsub <- read_kinsub(kinsub_path)
#' tmp <- tempfile()
#'
#' build_fastas(kinsub, tmp, name_col = 'unique_id', seq_col = 'fragment_15')
#'
#' build_fastas(kinsub, tmp, seq_col = 'fragment_15', header_pattern = '{acc_id}|{gene}|{substrate}|{residue}{position}')
build_fastas <- function(data, path, name_col, seq_col, header_pattern = '{.data[[name_col]]}|{get_middle_fragment(.data[[seq_col]], 7)}') {
if (is.null(header_pattern)) {
header_pattern <- '{name_col}'
}
if (missing(name_col)) {
if (stringr::str_detect(header_pattern, 'name_col')) {
rlang::abort('If name_col is not supplied, header_pattern cannot refer to name_col and must be changed.')
}
}
# Create header column
data <- dplyr::mutate(data, fasta_id = glue::glue(header_pattern))
if (any(stringr::str_detect(data$fasta_id, '\\s'))) {
data$fasta_id <- stringr::str_replace_all(data$fasta_id, '\\s', '_')
warning('Any whitespaces in the fasta headers have been replaced by underscores.')
}
name_col <- 'fasta_id'
if (anyDuplicated(data[[name_col]]) > 0) {
rlang::abort('Name column created with header_pattern is not unique, please supply a different pattern.')
}
# Check column names and data
if (any(!c(name_col, seq_col) %in% colnames(data))) {
rlang::abort(glue::glue("Please ensure your data contains the supplied column names. \nSupplied names: {paste(name_col, seq_col, sep = ', ')}\nDetected colnames: {paste(colnames(data), collapse = ', ')}"))
}
if (any(nchar(data[[seq_col]]) > 70)) {
rlang::abort("Peptide of 70+ AA detected. Please check your data.")
}
# Remove any filling underscores and other junk by extracting first alphanumeric group
data[[seq_col]] <- stringr::str_extract(data[[seq_col]], '[:alpha:]+')
# Walk over name and sequence at once and write to file
open_file <- file(path, open = 'w+')
purrr::pwalk(
dplyr::select(data, dplyr::all_of(c(name_col, seq_col))),
~writeLines(
text = c(
paste0('>', ..1), # Write start line
toupper(..2) # Write sequence line, cut off any accidental
),
con = open_file)
)
close(open_file)
print(glue::glue('Successfully built fasta, saved at {path}'))
return(invisible(data))
}
#' Extract middle fragment from each string
#'
#' Returns an uppercase slice of size `size` from each string supplied, centered
#' on the middle position of the string. Useful to slice out middles from
#' sequence fragments.
#'
#' @param string_vec A character vector of strings, all at least size `size`.
#' @param size The size of the returned middle fragments, as an uneven integer.
#'
#' @return An all-uppercase character vector, of the same length as the `string_vec` input.
#' @export
#'
get_middle_fragment <- function(string_vec, size) {
if (size %% 2 != 1) {
rlang::abort('Please provide an uneven size.')
}
side_size <- (size-1)/2
mid_positions <- ceiling(nchar(string_vec)/2)
toupper(stringr::str_sub(string_vec, mid_positions-side_size, mid_positions+side_size))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.