#' Retrieve protein sequences and data from GenBank and Uniprot
#'
#' This function is used to retrieve data from
#' Genbank's protein database for
#' given protein IDs. If an ID is not available from Genbank the function will
#' try to retrieve it from Uniprot.
#'
#' Queries are processed one by one (rather than in batch) to enable treatment
#' of individual inconsistencies (e.g., wrong UIDs, queries that return a
#' different identifier, etc.). This makes this routine substantially slower,
#' but considerably more robust to errors.
#'
#' @param uids A character vector with protein IDs, or a _data.table_ object
#' of class *LBCE_dt*, generated by [get_LBCE()].
#' @param save_folder path to folder for saving the results.
#'
#' @return A list object containing a data frame with the extracted proteins
#' plus a vector of IDs that were not successfully retrieved.
#'
#' @author Felipe Campelo (\email{f.campelo@@aston.ac.uk})
#'
#' @export
#'
get_proteins <- function(uids, save_folder = NULL){
# ========================================================================== #
# Sanity checks and initial definitions
t0 <- Sys.time()
assertthat::assert_that(is.null(save_folder) | (is.character(save_folder)),
is.null(save_folder) | length(save_folder) == 1)
if(is.character(uids)){
assertthat::assert_that(length(uids) >= 1)
} else {
assertthat::assert_that("LBCE_dt" %in% class(uids))
uids <- uids$protein_id
}
# Check save folder and create file names
if(!is.null(save_folder)) {
if(!dir.exists(save_folder)) dir.create(save_folder)
ymd <- gsub("-", "", Sys.Date())
df_file <- paste0(normalizePath(save_folder), "/00_proteins_", ymd, ".rds")
errfile <- paste0(normalizePath(save_folder),
"/00_prots_not_retrieved_", ymd, ".rds")
tmpf <- tempfile(fileext = ".rds", tmpdir = save_folder)
}
errlist <- seq_along(uids)
reslist <- vector("list", length = length(uids))
nerr <- Inf
# First try retrieving from NCBI/protein
cat("Trying to retrieve", length(reslist), "proteins",
"\nStarted at", as.character(t0),
"\nThis may take a while...")
while(length(errlist) < nerr && length(errlist) > 0){
nerr <- length(errlist)
cat("\nTrying to retrieve", length(errlist),
"entries from NCBI (db = protein)")
cc <- 0
for (idx in errlist){
# Try fetching data
tryCatch({
x <- reutils::efetch(uid = uids[idx],
db = "protein",
rettype = "fasta",
retmode = "xml")
reslist[idx] <- XML::xmlToList(x$get_content())
},
warning = function(c) {errk <<- TRUE},
error = function(c) {errk <<- TRUE},
finally = NULL)
if(!is.null(reslist[[idx]]) &
!("ERROR" %in% names(reslist[[idx]]))){
reslist[[idx]]$UID <- uids[idx]
reslist[[idx]]$DB <- "NCBI protein"
}
# Print progress bar
mypb(i = cc, max_i = length(errlist), t0 = t0, npos = 30)
cc <- cc + 1
# save tmp results (if needed)
if(!is.null(save_folder) && !(cc %% 100)){
saveRDS(object = list(reslist = reslist, errlist = errlist,
idx = idx, uids = uids),
file = tmpf)
}
}
errlist <- which(sapply(reslist, function(x) {is.null(x$UID)}))
}
# save tmp results (if needed)
if(!is.null(save_folder)){
saveRDS(object = list(reslist = reslist, errlist = errlist,
idx = idx, uids = uids),
file = tmpf)
}
if (length(errlist) > 0){
# Try retrieving remaining ids from Uniprot
nerr <- Inf
while(length(errlist) < nerr && length(errlist) > 0){
nerr <- length(errlist)
cat("\nTrying to retrieve", length(errlist), "entries from Uniprot")
cc <- 0
for (idx in errlist){
# Try fetching data
errk <- FALSE
tryCatch({
myurl <- paste0("https://www.uniprot.org/uniprot/",
uids[idx], ".fasta")
x <- utils::read.csv(myurl, header = FALSE, sep = ";")
},
warning = function(c) {errk <<- TRUE},
error = function(c) {errk <<- TRUE},
finally = NULL)
if(!errk){
seq <- paste0(x[-1, 1], collapse = "")
#>db|UniqueIdentifier|EntryName ProteinName OS=OrganismName OX=OrganismIdentifier [GN=GeneName ]PE=ProteinExistence SV=SequenceVersion
mystr <- strsplit(paste(x[1, ], collapse = ";"), "|", fixed = TRUE)[[1]][3]
reslist[[idx]]$TSeq_seqtype <- "protein"
reslist[[idx]]$TSeq_accver <- NA
reslist[[idx]]$TSeq_taxid <- strsplit(strsplit(mystr, " OX=")[[1]][2],
" ")[[1]][1]
reslist[[idx]]$TSeq_orgname <- strsplit(strsplit(mystr, " OS=")[[1]][2],
" OX=")[[1]][1]
reslist[[idx]]$TSeq_defline <- strsplit(mystr, " OS=")[[1]][1]
reslist[[idx]]$TSeq_length <- nchar(seq)
reslist[[idx]]$TSeq_sequence <- seq
reslist[[idx]]$UID <- uids[idx]
dbstr <- strsplit(paste(x[1, ], collapse = ";"), "|", fixed = TRUE)[[1]][1]
reslist[[idx]]$DB <- ifelse(dbstr == ">tr",
yes = "UniProtKB/TrEMBL",
no = ifelse(dbstr == ">sp",
yes = "UniProtKB/Swiss-Prot",
no = "Uniprot/other"))
}
# Print progress bar
mypb(i = cc, max_i = length(errlist), t0 = t0, npos = 30)
cc <- cc + 1
}
errlist <- which(sapply(reslist, function(x) {is.null(x$UID)}))
}
}
if(length(errlist) > 0) reslist <- reslist[-errlist]
errlist <- uids[errlist]
df <- data.table::rbindlist(reslist,
use.names = TRUE,
fill = TRUE)
class(df) <- c(class(df), "protein_dt")
# Save results to file
if(!is.null(save_folder)){
saveRDS(object = df, file = df_file)
saveRDS(object = errlist, file = errfile)
if(file.exists(tmpf)) file.remove(tmpf)
}
cat("\nDone!\n", nrow(df), "proteins retrieved.\n",
length(errlist), "retrieval errors.")
invisible(df)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.