#' Query Phobius web server.
#'
#' Phobius web server is a combined transmembrane topology and signal peptide
#' (N-sp) predictor. Currently only "normal prediction" of signal peptides is
#' supported by the function.
#'
#' @aliases get_phobius get_phobius.default get_phobius.character
#' get_phobius.data.frame get_phobius.list
#' @param data A data frame with protein amino acid sequences as strings in one
#' column and corresponding id's in another. Alternatively a path to a .fasta
#' file with protein sequences. Alternatively a list with elements of class
#' "SeqFastaAA" resulting from \code{\link[seqinr]{read.fasta}} call.
#' Should be left blank if vectors are provided to sequence and id arguments.
#' @param sequence A vector of strings representing protein amino acid sequences,
#' or the appropriate column name if a data.frame is supplied to data argument.
#' If .fasta file path, or list with elements of class "SeqFastaAA" provided to
#' data, this should be left blank.
#' @param id A vector of strings representing protein identifiers, or the
#' appropriate column name if a data.frame is supplied to data argument. If
#' .fasta file path, or list with elements of class "SeqFastaAA" provided to
#' data, this should be left blank.
#' @param progress Boolean, whether to show the progress bar, at default set to
#' FALSE.
#' @param ... currently no additional arguments are accepted apart the ones
#' documented bellow.
#'
#' @return A data frame with columns:
#' \describe{
#' \item{Name}{Character, name of the submitted sequence.}
#' \item{tm}{Integer, the number of predicted transmembrane segments.}
#' \item{sp}{Character, Y/0 indicator if a signal peptide was predicted or not.}
#' \item{prediction}{Character string, predicted topology of the protein.}
#' \item{cut_site}{Integer, first amino acid after removal of the signal peptide}
#' \item{is.phobius}{Logical, did Phobius predict the presence of a signal
#' peptide}}
#'
#' @details
#' The topology (prediction column of the result) is given as the position of
#' the transmembrane helices separated by 'i' if the loop is on the cytoplasmic
#' or 'o' if it is on the non-cytoplasmic side. A signal peptide is given by the
#' position of its h-region separated by a n and a c, and the position of the
#' last amino acid in the signal peptide and the first of the mature protein
#' separated by a /.
#'
#' @note This function creates temporary files in the working directory.
#'
#' @source \url{https://phobius.sbc.su.se/}
#'
#' @references Kall O. Krogh A. Sonnhammer E. L. L. (2004) A Combined
#' Transmembrane Topology and Signal Peptide Prediction Method. Journal
#' of Molecular Biology 338(5): 1027-1036
#' @import seqinr
#' @import httr
#' @import stringr
#' @import xml2
#' @importFrom methods is
get_phobius <- function(data, ...) {
if (missing(data) || is.null(data)) {
get_phobius.default(...)
} else {
UseMethod("get_phobius")
}
}
#' @rdname get_phobius
#' @method get_phobius character
#' @export
get_phobius.character <- function(data, progress = FALSE, ...) {
if (missing(progress)) {
progress <- FALSE
}
if (length(progress) > 1) {
progress <- FALSE
warning("progress should be of length 1, setting to default:
progress = FALSE",
call. = FALSE
)
}
if (!is.logical(progress)) {
progress <- as.logical(progress)
warning("progress is not logical, converting using 'as.logical'",
call. = FALSE
)
}
if (is.na(progress)) {
progress <- FALSE
warning("progress was set to NA, setting to default: progress = FALSE",
call. = FALSE
)
}
if (length(data) > 1) {
stop("one fasta file per function call can be supplied",
call. = FALSE
)
}
if (file.exists(data)) {
file_name <- data
} else {
stop("cannot find file in the specified path",
call. = FALSE
)
}
file_list <- split_fasta(
path_in = file_name,
path_out = "temp_phob_",
num_seq = 500
)
len <- length(file_list)
if (grepl("temp_", file_name)) {
unlink(file_name)
}
if (progress) {
pb <- utils::txtProgressBar(
min = 0,
max = len,
style = 3
)
}
url <- "https://phobius.sbc.su.se/cgi-bin/predict.pl"
collected_res <- vector("list", len)
for (i in seq_along(len)) {
file_up <- httr::upload_file(file_list[i])
res <- httr::POST(
url = url,
encode = "multipart",
body = list(
`protfile` = file_up,
`format` = "short"
)
)
res <- httr::content(res,
as = "parsed"
)
res <- xml2::xml_text(
res,
"//pre"
)
res <- unlist(strsplit(
as.character(res),
"\n"
))
res <- res[(grep(
"SEQENCE",
res
) + 1):(grep(
"_uacct",
res
) - 1)]
res <- strsplit(
res,
" +"
)
res <- do.call(
rbind,
res
)
res <- as.data.frame(res,
stringsAsFactors = FALSE
)
colnames(res) <- c(
"Name",
"tm",
"sp",
"prediction"
)
collected_res[[i]] <- res
unlink(file_list[i])
if (progress) {
utils::setTxtProgressBar(pb, i)
}
}
if (progress) {
close(pb)
}
collected_res <- do.call(
rbind,
collected_res
)
collected_res$cut_site <- stringr::str_extract(
collected_res$prediction,
"(?<=/)\\d+"
)
collected_res$is.phobius <- collected_res$sp == "Y"
return(collected_res)
}
#' @rdname get_phobius
#' @method get_phobius data.frame
#' @export
get_phobius.data.frame <- function(data, sequence, id, ...) {
if (missing(sequence)) {
stop("the column name with the sequences must be specified",
call. = FALSE
)
}
if (missing(id)) {
stop("the column name with the sequence id's must be specified",
call. = FALSE
)
}
id <- as.character(substitute(id))
sequence <- as.character(substitute(sequence))
if (length(id) != 1L) {
stop("only one column name for 'id' must be specifed",
call. = FALSE
)
}
if (length(sequence) != 1L) {
stop("only one column name for 'sequence' must be specifed",
call. = FALSE
)
}
id <- if (id %in% colnames(data)) {
data[[id]]
} else {
stop("specified 'id' not found in data",
call. = FALSE
)
}
id <- as.character(id)
sequence <- if (sequence %in% colnames(data)) {
data[[sequence]]
} else {
stop("specified 'sequence' not found in data",
call. = FALSE
)
}
sequence <- toupper(as.character(sequence))
sequence <- sub(
"\\*$",
"",
sequence
)
aa_regex <- "[^ARNDCQEGHILKMFPSTWYV]"
if (any(grepl(aa_regex, sequence))) {
warning(paste("sequences: ",
paste(id[grepl(
aa_regex,
sequence
)],
collapse = ", "
),
" contain symbols not corresponding to amino acids",
sep = ""
),
call. = FALSE
)
}
file_name <- paste("temp_",
gsub(
"^X",
"",
make.names(Sys.time())
),
".fasta",
sep = ""
)
seqinr::write.fasta(
sequence = strsplit(sequence, ""),
name = id,
file = file_name
)
res <- get_phobius.character(
file_name,
...
)
return(res)
}
#' @rdname get_phobius
#' @method get_phobius list
#' @export
get_phobius.list <- function(data, ...) {
file_name <- paste("temp_",
gsub(
"^X",
"",
make.names(Sys.time())
),
".fasta",
sep = ""
)
if (is(data[[1]],"SeqFastaAA")) {
dat <- lapply(data,
paste0,
collapse = ""
)
id <- names(dat)
sequence <- toupper(as.character(unlist(dat)))
sequence <- sub(
"\\*$",
"",
sequence
)
aa_regex <- "[^ARNDCQEGHILKMFPSTWYV]"
if (any(grepl(aa_regex, sequence))) {
warning(paste("sequences: ",
paste(id[grepl(
aa_regex,
sequence
)],
collapse = ", "
),
" contain symbols not corresponding to amino acids",
sep = ""
),
call. = FALSE
)
}
seqinr::write.fasta(
sequence = strsplit(sequence, ""),
name = id,
file = file_name
)
} else {
stop("only lists containing objects of class SeqFastaAA are supported")
}
res <- get_phobius.character(
file_name,
...
)
return(res)
}
#' @rdname get_phobius
#' @method get_phobius default
#' @export
get_phobius.default <- function(data = NULL,
sequence,
id,
...) {
if (missing(sequence)) {
stop("protein sequence must be provided to obtain predictions",
call. = FALSE
)
}
if (missing(id)) {
stop("protein id must be provided to obtain predictions",
call. = FALSE
)
}
id <- as.character(id)
sequence <- toupper(as.character(sequence))
if (length(sequence) != length(id)) {
stop("id and sequence vectors are not of same length",
call. = FALSE
)
}
sequence <- sub(
"\\*$",
"",
sequence
)
aa_regex <- "[^ARNDCQEGHILKMFPSTWYV]"
if (any(grepl(aa_regex, sequence))) {
warning(paste("sequences: ",
paste(id[grepl(
aa_regex,
sequence
)],
collapse = ", "
),
" contain symbols not corresponding to amino acids",
sep = ""
),
call. = FALSE
)
}
file_name <- paste("temp_",
gsub(
"^X",
"",
make.names(Sys.time())
),
".fasta",
sep = ""
)
seqinr::write.fasta(
sequence = strsplit(sequence, ""),
name = id,
file = file_name
)
res <- get_phobius.character(
file_name,
...
)
return(res)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.