R/readUniProtExport.R
In wrProteo: Proteomics Data Analysis Functions

Documented in readUniProtExport

#' Read protein annotation as exported from UniProt batch-conversion
#'
#' This function allows reading and importing protein-ID conversion results from \href{https://www.uniprot.org/id-mapping/}{UniProt}.
#' To do so, first copy/paste your query IDs into \href{https://www.uniprot.org/id-mapping/}{UniProt} 'Retrieve/ID mapping' field called '1. Provide your identifiers' (or upload as file), verify '2. Select options'.
#' In a typical case of 'enst000xxx' IDs  you may leave default settings, ie 'Ensemble Transcript' as input and 'UniProt KB' as output. Then, 'Submit' your search and retreive results via 
#' 'Download', you need to specify a 'Tab-separated' format ! If you download as 'Compressed' you need to decompress the .gz file before running the function \code{readUCSCtable} 
#' In addition, a file with UCSC annotation (Ensrnot accessions and chromosomic locations, obtained using \code{\link{readUCSCtable}}) can be integrated.
#' @details
#' In a typicall use case, first chromosomic location annotation is extracted from UCSC for the species of interest and imported to R using  \code{\link{readUCSCtable}} . 
#' However, the tables provided by UCSC don't contain Uniprot IDs. Thus, an additional (batch-)conversion step needs to get added. 
#' For this reason \code{\link{readUCSCtable}} allows writing a file with Ensemble transcript IDs which can be converted tu UniProt IDs at the site of  \href{https://www.uniprot.org/id-mapping/}{UniProt}. 
#' Then, UniProt annotation (downloaded as tab-separated) can be imported and combined with the genomic annotation using this function.  
#' @param UniProtFileNa (character) name (and path) of file exported from Uniprot (tabulated text file inlcuding headers) 
#' @param deUcsc (data.frame) object produced by \code{readUCSCtable} to be combined with data from \code{UniProtFileNa}
#' @param targRegion (character or list) optional marking of chromosomal locations to be part of a given chromosomal target region, 
#'   may be given as character like \code{chr11:1-135,086,622} or as \code{list} with a first component characterizing the chromosome and a integer-vector with start- and end- sites 
#' @param useUniPrCol (character) optional declaration which colums from UniProt exported file should be used/imported (default 'EnsID','Entry','Entry.name','Status','Protein.names','Gene.names','Length').
#' @param silent (logical) suppress messages
#' @param debug (logical) display additional messages for debugging
#' @param callFrom (character) allow easier tracking of message(s) produced
#' @return This function returns a data.frame (with columns $EnsID, $Entry, $Entry.name, $Status, $Protein.names, $Gene.names, $Length; if \code{deUcsc} is integrated plus: $chr, $type, $start, $end, $score, $strand, $Ensrnot, $avPos) 
#' @seealso \code{\link{readUCSCtable}}
#' @examples
#' path1 <- system.file("extdata",package="wrProteo")
#' deUniProtFi <- file.path(path1,"deUniProt_hg38chr11extr.tab")
#' deUniPr1a <- readUniProtExport(deUniProtFi) 
#' str(deUniPr1a)
#' 
#' ## Workflow starting with UCSC annotation (gtf) files :
#' gtfFi <- file.path(path1,"UCSC_hg38_chr11extr.gtf.gz")
#' UcscAnnot1 <- readUCSCtable(gtfFi)
#' ## Results of conversion at UniProt are already available (file "deUniProt_hg38chr11extr.tab")
#' myTargRegion <- list("chr1", pos=c(198110001,198570000))
#' myTargRegion2 <-"chr11:1-135,086,622"      # works equally well
#' deUniPr1 <- readUniProtExport(deUniProtFi,deUcsc=UcscAnnot1,
#'   targRegion=myTargRegion)
#' ## Now UniProt IDs and genomic locations are both available :
#' str(deUniPr1)
#' @export
readUniProtExport <- function(UniProtFileNa, deUcsc=NULL, targRegion=NULL, useUniPrCol=NULL, silent=FALSE, debug=FALSE, callFrom=NULL) {         
  ## read annotation exported from https://www.uniprot.org/id-mapping/ (was https://www.uniprot.org/uploadlists/) upload  Ensemble Transcript => UniprotKB => export 
  ## targRegion : list('chr1',pos=c(198110001,198570000)) or 'chr11:1-135,086,622'
  fxNa <- wrMisc::.composeCallName(callFrom,newNa="readUniProtExport")
  if(!isTRUE(silent)) silent <- FALSE
  if(isTRUE(debug)) silent <- FALSE else debug <- FALSE
  if(length(UniProtFileNa) >1) UniProtFileNa <- UniProtFileNa[1] else {if(length(UniProtFileNa) < 1) stop(" argument 'UniProtFileNa' seems empty")}
  chFi <- file.exists(UniProtFileNa)
  if(!chFi) stop(" file '",UniProtFileNa,"' not found !")
  chExt <- length(grep("\\.gz$", UniProtFileNa, fixed=FALSE, perl=FALSE)) >0  
  chPa <- try(find.package("utils"),silent=TRUE)
  if(inherits(chPa, "try-error")) stop("Package 'utils' not found ! Please install first")   
  ## main  
  deUniProt <- try(utils::read.delim(UniProtFileNa,stringsAsFactors=FALSE), silent=TRUE)
  errMsg1 <- " seems not to be in UniProt 'tab-separated' format (does not contain sufficent number of columns) !"
  if(inherits(deUniProt, "try-error")) { 
    deUniProt <- try(wrMisc::readVarColumns(if(chExt) unz(UniProtFileNa) else UniProtFileNa,callFrom=fxNa), silent=TRUE)
    if(inherits(deUniProt, "try-error")) stop("Can't read file '",UniProtFileNa,"' - please check format !") else {
      if(!silent) message(fxNa,"Managed to read file using readVarColumns()") }
    if(ncol(deUniProt) <9) stop("file ",UniProtFileNa,errMsg1)  
    colnames(deUniProt)[1:9] <- c("EnsTraID","xx","UniprotID",colnames(deUniProt)[c(2:7)])  # initial colnames by readVarColumns are shifted    
  } 
  if(ncol(deUniProt) <7) stop("file ",UniProtFileNa,errMsg1)     # check if (in)sufficient numer of columns
  if(nrow(deUniProt) <2 && !silent) message(fxNa," CAUTION, file '",UniProtFileNa,"' contains only ",nrow(deUniProt)," lines !") 
  ## correct colnames
  chCol <- c(grep("yourlist.",colnames(deUniProt)[1]) >0, grep("isomap.",colnames(deUniProt)[2]) >0, "Entry" %in% colnames(deUniProt))
  if(chCol[1]) colnames(deUniProt)[1] <- "EnsTraID"
  if(chCol[2]) colnames(deUniProt)[2] <- "xx"                 # this column contains almost no information
  colnames(deUniProt)[3] <- "UniProtID"  
  ## combine with data initially/previously read from Ucsc
  multID <- NULL
  colnames(deUniProt) <- sub(" ",".",colnames(deUniProt))
  if(length(useUniPrCol) <1) useUniPrCol <- c("EnsTraID","UniProtID","Entry.name","Status","Protein.names","Gene.names","Length")
  useUniPrCo <- wrMisc::extrColsDeX(deUniProt, useUniPrCol, doExtractCols=FALSE, callFrom=fxNa, silent=silent)
  ##  treat multi-Ensemble entries : need to replicate lines of table for multiple concatenated (eg ENSRNOT00000031808,ENSRNOT00000093745)
  splitExtendConcat <- function(mat,useCol=1,sep=",",sep2="[[:digit:]],[[:alpha:]]+"){
    ## extend matrix or data.frame by additional lines if column 'useCol' contains multiple concatenated terms (content of other columns will be duplicated)
    ## 'sep' used with strsplit() and grep() to identify lines and split, also used to construct (generic) term for keeping just first 
    ## 'sep2' optional custom pattern used with grep() to identify lines; will be used instead of 'generic' sep to identify entries to split lateron 
    ## main
    chMult <- grep(if(length(sep2) >0) sep2 else sep, mat[,useCol], fixed=FALSE, perl=FALSE)
    if(length(chMult) >0)  {
      ##
      spl1 <- strsplit(mat[chMult,useCol],sep, fixed=FALSE, perl=FALSE)
      spl2 <- unlist(lapply(spl1, function(x) x[-1]), use.names=FALSE)
      toLine <- rep(chMult, sapply(spl1,length) -1)
      mat[,useCol] <- sub(paste(sep,"[[:print:]]*$",sep=""),"",mat[,useCol], fixed=FALSE, perl=FALSE)
      mat2 <-  cbind(spl2,mat[c(toLine),-1])
      colnames(mat2)[1] <- colnames(mat)[1]
      mat <- rbind(mat,mat2)
    }
    mat }
  deUniProt <- splitExtendConcat(deUniProt, sep=",", sep2="[[:digit:]],[[:upper:]]+")   
  if(length(deUcsc) >0) {
    chGeneId <- which(colnames(deUcsc) =="gene_id")
    if(length(chGeneId) <1) stop("Invalid file-content: The file '",UniProtFileNa,"' does not conatain a column 'gene_id' ! Please check the input file")
    deUcsc[,"gene_id"] <- sub("\\.[[:digit:]]+$","",deUcsc[,"gene_id"])
    useUcCol <- wrMisc::naOmit(match(c("gene_id","chr","start","end","strand","frame"),colnames(deUcsc)))
    deUcsc <- wrMisc::convMatr2df(deUcsc[,useUcCol], addIniNa=FALSE, callFrom=fxNa,silent=silent)    
    matchUniprInUcsc <- match(deUniProt[,1], deUcsc[,"gene_id"])
    if(sum(!is.na(matchUniprInUcsc)) <4) { 
      if(!silent) message(fxNa," low yield matching ",wrMisc::pasteC(deUniProt[1:3,1],quoteC="'")," and ",
        wrMisc::pasteC(deUcsc[1:3,"gene_id"],quoteC="'"), " convert all to lower case and remove version numbers ('xxx.2') for better matching")    
      matchUniprInUcsc <- match(sub("\\.[[:digit:]]+$","", tolower(deUniProt[,1])), sub("\\.[[:digit:]]+$","", tolower(deUcsc[,"gene_id"])))
      if(sum(!is.na(matchUniprInUcsc)) <4) warning(fxNa," Matching failed : Very few or no matches between UniProtFile and deUcsc !")} 
    if(!silent) message(fxNa," intergrating genomic information for ",length(matchUniprInUcsc)," entries (",sum(is.na(matchUniprInUcsc))," not found)")
    ## add chrom Loc to deUniProt => combined DB
    combAllChrDB <- cbind(deUniProt[,useUniPrCo], deUcsc[matchUniprInUcsc,])       ## add Ensrnot   c(1,3:5,7,10)
    if(!silent) message(fxNa," ",nrow(combAllChrDB)," IDs in output")
    combAllChrDB <- cbind(combAllChrDB,avPos=if(all(c("start","end") %in% colnames(combAllChrDB))) {
      round(rowMeans(combAllChrDB[,c("start","end")])) } else NA)    # add mean gene-position for easier sorting
    ## mark if genimic positions in targer region
    if(!all(c("chr","start") %in% colnames(combAllChrDB))) targRegion <- NULL
    if(length(targRegion) >0) if(is.character(targRegion) && length(targRegion) ==1) {
      targRegion <- unlist(strsplit(targRegion,":"))
      targRegion <- list(targRegion[1],gsub(",","",unlist(strsplit(targRegion[2],"-")))) }
    combAllChrDB <- cbind(combAllChrDB,inTarg=if(length(targRegion) >0) {
      combAllChrDB[,"chr"]==targRegion[[1]] & as.integer(combAllChrDB[,"start"]) >targRegion[[2]][1] & as.integer(combAllChrDB[,"end"]) <targRegion[[2]][2]} else NA)
  } else combAllChrDB <- deUniProt[,useUniPrCo]
  ## convert factor-columns to character
  chFa <- rep(NA,ncol(combAllChrDB))
  for(i in 1:ncol(combAllChrDB)) chFa[i] <- is.factor(combAllChrDB[,i])
  if(any(chFa)) for(i in which(chFa)) combAllChrDB[,i] <- as.character(combAllChrDB[,i])
  chEnsID <- "gene_id" %in% colnames(combAllChrDB)
  if(chEnsID)  combAllChrDB <- combAllChrDB[,-1*which(colnames(combAllChrDB)=="gene_id")]
  combAllChrDB }