R/getseq.R

Defines functions get.seq prot2codon id.mapping id.features species.mapping

Documented in get.seq id.features id.mapping prot2codon species.mapping

## ---------- getseq.R ------------ ##
#                                    #
#     get.seq                        #
#     prot2codon                     #
#     id.mapping                     #
#     id.features                    #
#     species.mapping                #
#                                    #
## -------------------------------- ##


## ---------------------------------------------------------------- ##
#      get.seq <- function(id, db = 'uniprot', as.string = TRUE)     #
## ---------------------------------------------------------------- ##
#' Import a Protein Sequence from a Database
#' @description Imports a protein sequence from a selected database.
#' @usage get.seq(id, db = 'uniprot', as.string = TRUE)
#' @param id the identifier of the protein of interest.
#' @param db a character string specifying the desired database; it must be one of 'uniprot', 'metosite', 'ncbi','pdb', 'kegg-aa', 'kegg-nt'.
#' @param as.string logical, if TRUE the imported sequence will be returned as a character string.
#' @details MetOSite and NCBI use the same type of protein ID than UniProt. However, if the chosen database is PDB, the identifier should be the 4-character unique identifier characteristic of PDB, followed by colon and the chain of interest. For instance, '2OCC:B' means we are interested in the sequence of chain B from the structure 2OCC. KEGG used its own IDs (see examples).
#' @return Returns a protein (or nucleotide) sequence either as a character vector or a as a character string.
#' @author Juan Carlos Aledo
#' @examples get.seq('P01009')
#' @examples \dontrun{get.seq("hsa:5265", db = "kegg-aa")}
#' @examples \dontrun{get.seq("1u8f:P", db = "pdb")}
#' @importFrom httr GET
#' @importFrom httr content
#' @export

get.seq <- function(id, db = 'uniprot', as.string = TRUE){

  db <- tolower(db)

  if (db == 'uniprot'){
    baseUrl <- "http://uniprot.org/uniprot/"
    call <- paste(baseUrl, id, ".fasta", sep = "")

  } else if (db == 'metosite'){
    baseUrl <- 'https://metosite.uma.es/api/proteins/scan/'
    call <- paste(baseUrl, id, sep = "")

  } else if (db == 'ncbi'){
    baseUrl <- "https://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?db=protein"
    call <- paste(baseUrl, "&val=", id, "&report=fasta", sep = "")

  } else if (db == 'pdb'){
    id <- strsplit(id, split=':')[[1]]
    baseUrl <- "https://www.rcsb.org/pdb/download/viewFastaFiles.do?structureIdList="
    call <- paste(baseUrl, toupper(id[1]), "&compressionType=uncompressed", sep = "")

  } else if (db == 'kegg-aa' | db == 'kegg-nt'){
    t <- paste(strsplit(db, split = '-')[[1]][2], 'seq', sep = '')
    if (requireNamespace('KEGGREST', quietly = TRUE)){
      text <- as.character(KEGGREST::keggGet(id, t))
    } else {
      stop("Please, install the KEGGREST pakage to get this functionality")
    }
    call <- NULL

  } else{
    stop('You should indicate a proper DB')
  }

  ## -------- Client <-> Server Communication -------- ##
  if (!is.null(call)){
    resp <- try(httr::GET(call), silent = TRUE)
    if (inherits(resp, "try-error")) {
      text <- "LOST CONNECTION"
    } else {
      text <- httr::content(resp, "text", encoding = "utf-8")
    }
    retry = 0
    while ((grepl("^LOST CONNECTION", text) || httr::http_error(resp) ||
            grepl("^ERROR", text) || grepl("Nothing has been found",
                                           text)) && retry < 3) {
      retry = retry + 1
      Sys.sleep(5)
      resp <- try(httr::GET(url), silent = TRUE)
      if (inherits(resp, "try-error")) {
        text <- "LOST CONNECTION"
      }
      else {
        text <- httr::content(resp, "text", encoding = "utf-8")
      }
    }
  }


  ## -------------- Parsing the response --------------- ##
  if (text != "LOST CONNECTION"){
    if (db == 'uniprot'){
      seq <- strsplit(text, split = "\\n")[[1]][-1]
      seq <- paste(seq, collapse = "")
    } else if (db == 'metosite'){
      if (requireNamespace("jsonlite", quietly = TRUE)){
        data <- jsonlite::fromJSON(text, flatten = TRUE)
        seq <- data$prot_seq
      } else {
        warning("If you install the package 'jsonlite' this ugly output would be nicely parsed")
        return(text)
      }
    } else if (db == 'pdb'){
        ID <- toupper(id)
        ID <- paste(ID, collapse = ":")
        if (nchar(ID) == 4){ # No chain selector
          seq <- strsplit(text, split = "\n")[[1]]
          seq <- paste(seq[!grepl("^>", seq)], collapse = "")
        } else if (nchar(ID) == 6){ # chain needs to be selected
          seq <- strsplit(text, split = ">")[[1]][-1]
          seq <- seq[grepl(ID, seq)]
          seq <- strsplit(seq, "\n")[[1]][-1]
          seq <- paste(seq, collapse = "")
        } else {
          stop("The PDB ID must be formated properly")
        }
    } else if (db == 'ncbi'){
      seq <- strsplit(text, "\n")[[1]]
      seq <- paste(seq[-1], collapse = "")
    } else if (db == 'kegg-aa' | db == 'kegg-nt'){
      seq <- text
    }

    if (is.null(seq) || is.na(seq)){
      output <- paste("The entry", id, "is not found in the", toupper(db), "database")
    } else {
      output <- seq
      if (!as.string){
        output <- strsplit(seq, split ="")
      }
    }
  } else {
    output <- text
  }

  attr(output, "ID") <- id
  attr(output, "DB") <- db
  return(output)
}


## ---------------------------------------------------------------- ##
#       prot2codon <- function(prot, chain = "", laxity = TRUE)      #
## ---------------------------------------------------------------- ##
#' Find the Coding Triplets for a Given Protein
#' @description Finds the codons corresponding to a given protein.
#' @usage prot2codon(prot, chain = "", laxity = TRUE)
#' @param prot is either a UniProt or PDB id, or the path to a pdb file.
#' @param chain when prot corresponds to a pdb, the chain of interest must be provided.
#' @param laxity logical, if FALSE the program stop when a mismatch between the protein and the gene sequences is detected. Otherwise the program doesn't stop and at the end points out the mismatches.
#' @return Returns a dataframe with as many rows as residues has the protein.
#' @author Juan Carlos Aledo
#' @examples prot2codon('P01009')
#' @examples \dontrun{prot2codon(prot = '1ATU', chain = 'A')}
#' @importFrom bio3d read.pdb
#' @importFrom bio3d aa321
#' @export

prot2codon <- function(prot, chain = "", laxity = TRUE){

  ## ---------- Primary estructure of the protein ----------- ##
  if (regexpr("/",prot)[[1]] != -1){ # input is path to a local pdb file
    mypdb <- bio3d::read.pdb(prot)$atom
    seq <- bio3d::aa321(mypdb$resid[which(mypdb$elety == 'CA' &
                                            mypdb$chain == chain)])
    t <- strsplit(prot, split = "/")[[1]]
    t <- t[length(t)]
    t <- substring(t, 1,4)
    source <- 'pdb'

  } else if (nchar(prot) == 4){ # input is PDB ID
    id <- paste(prot, chain, sep = ":")
    seq <- get.seq(id, db = 'pdb', as.string = FALSE)[[1]]

    t <- prot
    source <- 'pdb'

  } else { # input should be a uniprot ID
    seq <- get.seq(prot, as.string = FALSE)[[1]]
    t <- prot
    source <- 'uniprot'
  }

  ## ------------------- Output structure -------------------- ##
  output <- as.data.frame(matrix(rep(NA, length(seq)*5), ncol = 5))
  names(output) <- c('id', 'chain', 'pos', 'aa', 'codon')
  output$id <- prot
  if (chain == ""){
    output$chain <- NA
  } else {
    output$chain <- chain
  }
  output$pos <- 1:length(seq)
  output$aa <- seq

  ## ----------------- Finding the codons ------------------ ##
  organism <- species.mapping(t, db = source)

  if (source == 'pdb'){
    t <- pdb2uniprot(t, chain = chain)
  }

  kegg_id <- id.mapping(t, from = 'uniprot', to = 'kegg')[1]
  if (kegg_id[1] == "Sorry, no KEGG ID could be found!"){
    output <- "Sorry, we couldn't get a DNA seq from KEGG"
    return(output)
  }
  dna <- get.seq(kegg_id, 'kegg-nt')
  codon <- gsub("(.{3})", "\\1 ", dna)
  codon <- strsplit(codon, split = " ")[[1]]
  if (requireNamespace('seqinr', quietly = TRUE)){
    translated <- seqinr::translate(seqinr::s2c(dna))
  } else {
    stop("The package seqinr must be installed to use this function")
  }

  sequences <- c(paste(seq, collapse = ""),
                 paste(translated, collapse = ""))
  aln <- msa(sequences)

  k <- 1
  j <- 1
  for (i in 1:dim(aln$ali)[2]){
    if (aln$ali[1,i] == aln$ali[2,i]){
      output$codon[k] <- codon[j]
      k <- k + 1
      j <- j + 1
    } else if (aln$ali[1,i] == '-'){
      j <- j +1
    } else if (aln$ali[2,i] == '-'){
      output$codon[k] <- NA
      k <- k + 1
    } else { # mismatch
      output$codon[k] <- codon[j]
      k <- k + 1
      j <- j + 1
    }
  }

  ## ------------ Cheking the translation -------------------- ##
  triplet <- c("GCU", "GCC", "GCA", "GCG", "CGU", "CGC", "CGA", "CGG", "AGA", "AGG", "AAU", "AAC",
               "GAU", "GAC", "UGU", "UGC", "CAA", "CAG", "GAA", "GAG", "GGU", "GGC", "GGA", "GGG",
               "CAU", "CAC", "AUU", "AUC", "AUA", "UUA", "UUG", "CUU", "CUC", "CUA", "CUG", "AAA",
               "AAG", "AUG", "UUU", "UUC", "CCU", "CCC", "CCA", "CCG", "UCU", "UCC", "UCA", "UCG",
               "AGU", "AGC", "ACU", "ACC", "ACA", "ACG", "UGG", "UAU", "UAC", "GUU", "GUC", "GUA",
               "GUG", "UAA", "UGA", "UAG")
  aa <- c("A","A","A","A","R","R","R","R","R","R","N","N","D","D","C","C","Q","Q","E","E",
          "G","G","G","G","H","H","I","I","I","L","L","L","L","L","L","K","K","M","F","F",
          "P","P","P","P","S","S","S","S","S","S","T","T","T","T","W","Y","Y","V","V","V",
          "V","Stop","Stop","Stop")
  names(triplet) <- aa
  triplet <- gsub("U", "T", triplet)

  output$check <- NA
  for (i in 1:nrow(output)){
    if (!is.na(output$codon[i])){
      t <- as.character(output$codon[i])
      if (!laxity){
        if (output$aa[i] != names(triplet)[which(triplet == t)]){
          stop(paste("Problem at ", i))
        }
      }
      output$check[i] <- (output$aa[i] == names(triplet)[which(triplet == t)])
    }

  }
  if (!laxity){
    if (sum(output$check) != nrow(output)) { stop("Translation problem")}
  }
  return(output)
}

## ---------------------------------------------------------------- ##
#               id.mapping <- function(id, from, to)                 #
## ---------------------------------------------------------------- ##
#' Identifier Mapping
#' @description Mapping between protein identifiers.
#' @usage id.mapping(id, from, to)
#' @param id the identifier to be converted.
#' @param from the type for the identifier of origin; it must be one of 'uniprot', 'pdb', or 'kegg'.
#' @param to the type for the identifier of destination; it must be one of 'uniprot', 'pdb', or 'kegg'.
#' @return Returns a character string corresponding to the requested identifier.
#' @author Juan Carlos Aledo
#' @examples id.mapping('P01009', from = 'uniprot', to = 'pdb')
#' @seealso pdb2uniprot(), uniprot2pdb()
#' @importFrom httr GET
#' @importFrom httr content
#' @importFrom httr add_headers
#' @export

id.mapping <- function(id, from, to){

  ## ----- From KEGG to UniProt ------------- ##
  kegg.uniprot <- function(id){
    if (requireNamespace("KEGGREST", quietly = TRUE)){
      output <- KEGGREST::keggConv('uniprot', id)
    } else {
      stop("The package KEGGREST must be installed to get this mapping")
    }
    output <- substr(output, 4, nchar(output))
    return(output)
  }
  ## ----- From UniProt to KEGG ------------- ##
  uniprot.kegg <- function(id){
    if (requireNamespace("KEGGREST", quietly = TRUE)){
      sp <- species.mapping(id)
      organisms <- NULL
      load(url("https://github.com/jcaledo/kegg_species/blob/master/organisms.Rda?raw=true"))
      org <- organisms$organism[which(regexpr(sp, organisms$species) != -1)[1]]
      output <- KEGGREST::keggConv(org, paste('uniprot:', id, sep = ""))
    } else {
      stop("The package KEGGREST must be installed to get this mapping")
    }
    return(output)
  }

  uniprot_url <- "http://www.uniprot.org/uploadlists/"
  my_headers <- httr::add_headers('User-Agent' = paste('R', 'metosite@uma.es'))

  ## ----- From UniProt to PDB ------------- ##
  uniprot.pdb <- function(id){
    params <- list(from = 'ACC',
                   to = 'PDB_ID',
                   format = 'tab',
                   query = id)
    request <- try(httr::GET(uniprot_url, query = params, my_headers),
                   silent = TRUE)
    if (inherits(request, "try-error")) {
      ans <- "LOST CONNECTION"
    } else {
      ans <- httr::content(request, 'text', encoding = "ISO-8859-1")
    }
    retry = 0
    while ((grepl("^LOST CONNECTION", ans) || httr::http_error(request) ||
            grepl("^ERROR", ans) || grepl("Nothing has been found",
                                          ans)) && retry < 3) {
      retry = retry + 1
      Sys.sleep(5)
      request <- try(httr::GET(uniprot_url, query = params, my_headers),
                     silent = TRUE)
      if (inherits(request, "try-error")) {
        ans <- "LOST CONNECTION"
      }
      else {
        ans <- httr::content(request, 'text', encoding = "ISO-8859-1")
      }
    }

    pos <- gregexpr(id, ans)[[1]]
    output <- character(length(pos))
    for (i in seq_len(length(pos))){
      n_id <- (substr(ans, pos[i]+nchar(id)+1, pos[i]+nchar(id)+4))
      output[i] <- n_id
    }
    return(output)
  }

  ## ----- From PDB to UniProt ------------- ##
  pdb.uniprot <- function(id){
    params <- list(from = 'PDB_ID',
                   to = 'ACC',
                   format = 'tab',
                   query = id)
    request <- try(httr::GET(uniprot_url, query = params, my_headers),
                   silent = TRUE)
    if (inherits(request, "try-error")) {
      ans <- "LOST CONNECTION"
    } else {
      ans <- httr::content(request, 'text', encoding = "ISO-8859-1")
    }
    retry = 0
    while ((grepl("^LOST CONNECTION", ans) || httr::http_error(request) ||
            grepl("^ERROR", ans) || grepl("Nothing has been found",
                                          ans)) && retry < 3) {
      retry = retry + 1
      Sys.sleep(5)
      request <- try(httr::GET(uniprot_url, query = params, my_headers),
                     silent = TRUE)
      if (inherits(request, "try-error")) {
        ans <- "LOST CONNECTION"
      }
      else {
        ans <- httr::content(request, 'text', encoding = "ISO-8859-1")
      }
    }


    temp <- strsplit(ans, split = "\n")[[1]][-1]
    output <- character(length(temp))
    for (i in seq_len(length(temp))){
      t <- strsplit(temp[i], split = "\t")[[1]][2]
      output[i] <- t

    }
    return(output)
  }

  ## -------- Mapping using the above functions --------- ##
  if (from == 'uniprot' & to == 'pdb'){
    output <- uniprot.pdb(id)
    if (output[1] == "" | output[1] == "To\n"){
      return("No PDB found")
    } else {
      return(output)
    }
  } else if (from == 'uniprot' & to == 'kegg'){
    output <- uniprot.kegg(id)
    if (identical(output, character(0))){
      output <- 'Sorry, no KEGG ID could be found!'
    }
    return(output)
  } else if (from == 'kegg' & to == 'uniprot' ){
    output <- kegg.uniprot(id)
    return(output)
  } else if (from == 'kegg' & to == 'pdb'){
    output <- kegg.uniprot(id)
    output <- uniprot.pdb(output[1])
    return(output)
  } else if (from == 'pdb' & to == 'uniprot'){
    output <- pdb.uniprot(id)
    return(output)
  } else if (from == 'pdb' & to == 'kegg'){
    output <- pdb.uniprot(id)
    output <- lapply(output, uniprot.kegg)
    if (identical(output, character(0))){
      output <- 'Sorry, no KEGG ID could be found!'
    }
    attr(output, 'from') <- from
    attr(output, 'to') <- to
    return(output)
  }
}


## ---------------------------------------------------------------- ##
#           id.features <- function(id, features = "")               #
## ---------------------------------------------------------------- ##
#' Features Related to the Protein Entry
#' @description Obtains features related to the provided id.
#' @usage id.features(id, features = "")
#' @param id the UniProt identifier of the protein of interest.
#' @param features a string identifying the features (comma separated) to be recovered.
#' @details By default the the function provides info regarding the following features: id, reviewed, entry name and organism. If wished, this list of features can be expanded using the argument 'features'. There is a larga list of features that can be retrieved. You can look up your relevant feature's name in the full list of UniProtKB found at https://www.uniprot.org/help/uniprotkb_column_names.
#' @return Returns a named list with the requested features.
#' @author Juan Carlos Aledo
#' @examples \dontrun{id.features('P04406', features = 'ec,keywords,database(PDB)')}
#' @seealso species.mapping()
#' @importFrom httr GET
#' @importFrom httr content
#' @importFrom httr add_headers
#' @export

id.features <- function(id, features = "" ){

  ## -------- Building the query ---------- ##
  col <- 'id,reviewed,entry name,organism'
  if (features != ""){
    features <- gsub(', ', ',', features)
    col <- paste(col, features, sep = ',')
  }
  base_url <- "http://www.uniprot.org/uploadlists/"
  my_headers <- httr::add_headers('User-Agent' = paste('R', 'metosite@uma.es'))
  params <- list(from = 'ACC+ID',
                 to = 'ACC',
                 format = 'tab',
                 columns = col,
                 query = id)

  ## --------- Sending the query ----------- ##
  request <- try(httr::GET(base_url, query = params, my_headers),
                 silent = TRUE)
  if (inherits(request, "try-error")) {
    text <- "LOST CONNECTION"
  } else {
    cont <- httr::content(request, "text", encoding = "ISO-8859-1")
  }
  retry = 0
  while ((grepl("^LOST CONNECTION", cont) || httr::http_error(request) ||
          grepl("^ERROR", cont) || grepl("Nothing has been found",
                                         cont)) && retry < 3) {
    retry = retry + 1
    Sys.sleep(5)
    request <- try(httr::GET(base_url), silent = TRUE)
    if (inherits(request, "try-error")) {
      cont <- "LOST CONNECTION"
    }
    else {
      cont <- httr::content(request, "text", encoding = "ISO-8859-1")
    }
  }

  ## ------- Parsing the response ---------- ##
  l <- strsplit(cont, split = "\n")[[1]]
  headers <- strsplit(l, split = "\t")[[1]]
  headers <- gsub(" ", "_", headers)
  output <- as.list(strsplit(l[2], split = "\t")[[1]])
  names(output) <- gsub(" ", "_", headers)
  return(output)
}


## ---------------------------------------------------------------- ##
#           species.mapping <- function(id, db = 'uniprot')          #
## ---------------------------------------------------------------- ##
#' Map Protein ID to Species
#' @description Maps a protein ID to its corresponding organism.
#' @usage species.mapping(id, db = 'uniprot')
#' @param id the identifier of the protein of interest.
#' @param db a character string specifying the corresponding database. Currently, only 'uniprot' or 'pdb' are valid options.
#' @return Returns a character string identifying the organism to which the given protein belong.
#' @author Juan Carlos Aledo
#' @examples species.mapping('P01009')
#' @seealso id.features()
#' @importFrom httr GET
#' @importFrom httr content
#' @export

species.mapping <- function(id, db = 'uniprot'){

  db <- tolower(db)
  if (!(db %in% c("uniprot", "pdb"))){
    stop("Option db should be one of uniprot or pdb")
  }
  if (db == 'pdb'){
    id <- id.mapping(id, from = 'pdb', to = 'uniprot')[1]
  }

  url <- paste("https://www.uniprot.org/uniprot/", id, ".fasta", sep = "")
  resp <- try(httr::GET(url), silent = TRUE)
  if (inherits(resp, "try-error")) {
    text <- "LOST CONNECTION"
  } else {
    text <- httr::content(resp, "text", encoding = "utf-8")
  }
  retry = 0
  while ((grepl("^LOST CONNECTION", text) || httr::http_error(resp) ||
          grepl("^ERROR", text) || grepl("Nothing has been found",
                                         text)) && retry < 3) {
    retry = retry + 1
    Sys.sleep(5)
    resp <- try(httr::GET(url), silent = TRUE)
    if (inherits(resp, "try-error")) {
      text <- "LOST CONNECTION"
    }
    else {
      text <- httr::content(resp, "text", encoding = "utf-8")
    }
  }

  start <- gregexpr("OS=", text)[[1]]
  stop <- gregexpr("OX=", text)[[1]]
  species <- substr(text, start+3, stop-2)

  return(species)
}
jcaledo/ptm_0.1.1 documentation built on April 4, 2020, 3:48 a.m.