# Source file: R/PDBinfGet.R
#
# Defines functions: idTranslater, uniprot2pdb, PDBinfGet
#
# Documented in: PDBinfGet

#' Get all available PDB_ID information of the genelist you input
#'
#' @description Collects the PDB cross-references of every input identifier
#' through the EMBL-EBI Proteins API
#' (https://www.ebi.ac.uk/proteins/api/proteins/). You only need to supply a
#' vector of gene identifiers. Identifiers that are not already
#' \code{"uniprot_gn_id"} are translated through BioMart first.
#' @param genelist a vector of gene(s)
#' @param show.length determines whether the length is displayed, default FALSE
#' @param id.fromType input id type (must be the same as the type of the input
#' genelist). Four frequently-used types are
#' "hgnc_symbol", "entrezgene_id", "ensembl_gene_id", "uniprot_gn_id".
#' When it is "uniprot_gn_id" no translation is performed.
#' @param id.biomart biomart name used when ids need to be translated,
#' see also \code{\link[biomaRt]{useMart}}
#' @param id.dataset biomart dataset used when ids need to be translated,
#' see also \code{\link[biomaRt]{getBM}}
#' @param id.toType output id type of the translation step
#'
#' @return a data frame with one row per PDB entry found, holding the columns
#' \code{queryid}, \code{Uniprot_ID}, \code{PDB_ID}, \code{method},
#' \code{chains} and \code{resolution}; when \code{show.length = TRUE} the
#' columns \code{positions}, \code{from}, \code{to} and \code{length} are
#' added. An empty data frame is returned when nothing is found.
#' A summary of the queries without any PDB_ID is printed to the console.
#' @import stringr httr jsonlite tidyverse
#' biomaRt progress tidyr
#' @importFrom crayon blue underline
#' @export PDBinfGet
#' @usage
#' PDBinfGet(genelist,
#'           show.length = FALSE,
#'           id.fromType = "hgnc_symbol",
#'           id.biomart  = "ENSEMBL_MART_ENSEMBL",
#'           id.dataset  = "hsapiens_gene_ensembl",
#'           id.toType   = "uniprot_gn_id")
#' @author Ji Yang
#'
#' @examples
#' #input type is "hgnc_symbol":
#' PDBinfGet(genelist    = vertex_sample[1:5,1],
#'           show.length = FALSE,
#'           id.fromType = "hgnc_symbol",
#'           id.biomart  = "ENSEMBL_MART_ENSEMBL",
#'           id.dataset  = "hsapiens_gene_ensembl",
#'           id.toType   = "uniprot_gn_id")
#' #input type is "uniprot_gn_id":
#' PDBinfGet(genelist    = "P21580",
#'           show.length = FALSE,
#'           id.fromType = "uniprot_gn_id",
#'           id.biomart  = "ENSEMBL_MART_ENSEMBL",
#'           id.dataset  = "hsapiens_gene_ensembl",
#'           id.toType   = "uniprot_gn_id")
#'
PDBinfGet <- function(genelist,
                      show.length = FALSE,
                      id.fromType = "hgnc_symbol",
                      id.biomart  = "ENSEMBL_MART_ENSEMBL",
                      id.dataset  = "hsapiens_gene_ensembl",
                      id.toType   = "uniprot_gn_id") {
  # "uniprot_gn_id" input is queried directly; anything else is translated
  from_uniprot <- id.fromType == "uniprot_gn_id"
  if (from_uniprot) {
    cat("input type:uniprot_gn_id, doesn't have to be converted", "\n")
  }

  # one tick per submitted query id, whichever path is taken
  pb <- progress_bar$new(
    format = "completed percent[:bar] :percent time remaining: :eta",
    total  = length(genelist), clear = FALSE, width = 100)

  hits <- list()           # one data frame per Uniprot id that has PDB entries
  pdb_null <- character()  # query ids for which no PDB entry was found

  for (queryid in genelist) {
    pb$tick()
    if (from_uniprot) {
      unipid <- queryid
    } else {
      unipid <- idTranslater(genelist = queryid,
                             biomart  = id.biomart,
                             dataset  = id.dataset,
                             fromType = id.fromType,
                             toType   = id.toType)
    }

    found <- FALSE
    for (id in unipid) {
      pdb_inf <- uniprot2pdb(id, showlength = show.length)
      if (length(pdb_inf) == 0) {
        next
      }
      # label the rows of THIS lookup only, then store them; labelling the
      # accumulated table instead would add a column on every iteration
      hits[[length(hits) + 1L]] <- cbind(queryid = queryid, pdb_inf)
      found <- TRUE
    }
    # a query is "null" when none of its Uniprot ids yielded a PDB entry
    # (this also covers queries that could not be translated at all)
    if (!found) {
      pdb_null <- c(pdb_null, queryid)
    }
  }

  # bind once at the end instead of growing a data frame inside the loop
  if (length(hits) > 0) {
    pdb_all <- do.call(rbind, hits)
  } else {
    pdb_all <- data.frame()
  }

  num_null <- length(pdb_null)
  cat(blue(paste("you submitted", length(genelist), "queryid(s),",
                 num_null, "queryid(s) have no PDB_ID now.", "\n")))
  if (num_null != 0) {
    cat(paste("queryid(s) with no PDB_ID are:",
              underline(paste(pdb_null, collapse = ", "))))
  }
  return(pdb_all)
}

# Translates one Uniprot_ID to its PDB_IDs and collects the properties of
# each PDB entry from the EBI Proteins API.
#
# Uniprot_ID: a single UniProt accession, appended to the API URL.
# showlength: when TRUE, split `chains` into chain names and positions and
#             add the columns `positions`, `from`, `to` and `length`.
#
# Returns a data frame with the columns Uniprot_ID, PDB_ID, method, chains,
# resolution (plus the length columns when requested), one row per PDB
# cross-reference. Returns an empty data.frame() when the request fails or
# the entry has no PDB cross-reference.
uniprot2pdb <- function(Uniprot_ID,
                        showlength = FALSE) {
  url <- paste0("https://www.ebi.ac.uk/proteins/api/proteins/", Uniprot_ID)
  response <- GET(url)
  # a failed request (unknown accession, server error) has no usable
  # cross-references; report it and treat it as "no PDB entry"
  if (http_error(response)) {
    warning("request for ", Uniprot_ID, " failed with HTTP status ",
            status_code(response), call. = FALSE)
    return(data.frame())
  }
  payload <- fromJSON(httr::content(response, type = "text",
                                    encoding = "UTF-8"))
  xrefs <- payload$dbReferences
  if (!is.data.frame(xrefs) || !("type" %in% names(xrefs))) {
    return(data.frame())
  }
  # %in% never yields NA, so rows with a missing type are simply skipped
  is_pdb <- xrefs$type %in% "PDB"
  if (!any(is_pdb)) {
    return(data.frame())
  }

  #build the result column by column; a property that the API did not
  #report for this entry becomes NA instead of raising
  #"undefined columns selected"
  pdb_properties <- xrefs$properties
  pdb_inf <- data.frame(PDB_ID = xrefs$id[is_pdb], stringsAsFactors = FALSE)
  for (field in c("method", "chains", "resolution")) {
    if (is.data.frame(pdb_properties) && field %in% names(pdb_properties)) {
      pdb_inf[[field]] <- pdb_properties[[field]][is_pdb]
    } else {
      pdb_inf[[field]] <- NA_character_
    }
  }

  #determines whether the length is displayed
  if (isTRUE(showlength)) {
    #calculated length of structures: "<chains>=<from>-<to>"
    # NOTE(review): assumes a single "=" / "-" per `chains` value; values
    # with several segments presumably yield NA lengths - confirm.
    pdb_inf <- separate(data = pdb_inf,
                        col  = chains,
                        into = c("chains", "positions"),
                        sep  = "=")
    pdb_inf <- separate(data = pdb_inf,
                        col  = positions,
                        into = c("from", "to"),
                        sep  = "-",
                        remove = FALSE)
    pdb_inf$length <- as.numeric(pdb_inf$to) - as.numeric(pdb_inf$from) + 1
  }
  #prefix every row with the Uniprot_ID it belongs to
  pdb_inf <- cbind(Uniprot_ID = Uniprot_ID, pdb_inf)
  return(pdb_inf)
}

# Translates gene identifiers from one id type to another through BioMart.
#
# genelist: vector of identifiers of type `fromType`.
# biomart:  name of the BioMart database passed to useMart(); listMarts()
#           lists the available ones (e.g. ENSEMBL_MART_ENSEMBL,
#           ENSEMBL_MART_MOUSE, ENSEMBL_MART_SNP, ENSEMBL_MART_FUNCGEN).
# dataset:  dataset inside that mart; listDatasets(mart) lists them.
# fromType: attribute used as the filter (the id type of `genelist`).
# toType:   attribute to return.
#
# Returns the unique, non-empty `toType` identifiers matching `genelist`.
idTranslater <- function(genelist,
                         biomart  = "ENSEMBL_MART_ENSEMBL",
                         dataset  = "hsapiens_gene_ensembl",
                         fromType = "hgnc_symbol",
                         toType   = "uniprot_gn_id") {
  # `biomart` can arrive as a promise that cannot be evaluated (PDBinfGet
  # historically defaulted it to the bare symbol ENSEMBL_MART_ENSEMBL);
  # fall back to the standard Ensembl mart rather than failing on it.
  mart_name <- tryCatch(biomart,
                        error = function(e) "ENSEMBL_MART_ENSEMBL")
  ensembl <- useMart(mart_name, dataset = dataset)
  # query only the two attributes that are needed, filtering on the input
  # id type instead of always assuming "hgnc_symbol"
  id_table <- getBM(attributes = unique(c(fromType, toType)),
                    filters    = fromType,
                    values     = genelist,
                    mart       = ensembl)
  matched <- id_table[[fromType]] %in% genelist
  unipid <- unique(id_table[[toType]][matched])
  # drop NA / "" placeholders returned for unmapped ids so that callers do
  # not try to look up an empty identifier
  unipid <- unipid[!is.na(unipid) & nzchar(unipid)]
  return(unipid)
}
# windforclouds/PSV2N documentation built on Dec. 23, 2021, 5:15 p.m.