R/addGeneIDs.R

Defines functions addGeneIDs

Documented in addGeneIDs

#' Add common IDs to annotated peaks such as gene symbol, entrez ID, 
#' ensemble gene id and refseq id.
#' @description Add common IDs to annotated peaks such as gene symbol, 
#' entrez ID, ensemble gene id and refseq id leveraging organism annotation 
#' dataset. For example, org.Hs.eg.db is the dataset from orgs.Hs.eg.db 
#' package for human, while org.Mm.eg.db is the dataset from the org.Mm.eg.db
#' package for mouse.
#' @param annotatedPeak GRanges or a vector of feature IDs.
#' @param orgAnn organism annotation dataset such as org.Hs.eg.db.
#' @param IDs2Add a vector of annotation identifiers to be added
#' @param feature_id_type type of ID to be annotated, default is 
#' ensembl_gene_id
#' @param silence TRUE or FALSE. If TRUE, will not show unmapped entrez id
#' for feature ids.
#' @param mart mart object, see \link[biomaRt:useMart]{useMart} of biomaRt
#' package for details
#' @details One of orgAnn and mart should be assigned. 
#' \itemize{    
#'   \item If orgAnn is given, parameter feature_id_type should be 
#'   ensemble_gene_id, entrez_id, gene_symbol, gene_alias or refseq_id.
#'   And parameter IDs2Add can be set to any combination of identifiers 
#'   such as "accnum", "ensembl", "ensemblprot", "ensembltrans", "entrez_id",
#'   "enzyme", "genename", "pfam", "pmid", "prosite", "refseq", "symbol", 
#'   "unigene" and "uniprot". Some IDs are unique to an organism, 
#'   such as "omim" for org.Hs.eg.db and "mgi" for org.Mm.eg.db.
#'   
#'   Here is the definition of different IDs : 
#'     \itemize{
#'       \item accnum: GenBank accession numbers
#'       \item ensembl: Ensembl gene accession numbers
#'       \item ensemblprot: Ensembl protein accession numbers
#'       \item ensembltrans: Ensembl transcript accession numbers
#'       \item entrez_id: entrez gene identifiers
#'       \item enzyme: EC numbers
#'       \item genename: gene name
#'       \item pfam: Pfam identifiers
#'       \item pmid: PubMed identifiers
#'       \item prosite: PROSITE identifiers
#'       \item refseq: RefSeq identifiers
#'       \item symbol: gene abbreviations
#'       \item unigene: UniGene cluster identifiers
#'       \item uniprot: Uniprot accession numbers
#'       \item omim: OMIM(Mendelian Inheritance in Man) identifiers
#'       \item mgi: Jackson Laboratory MGI gene accession numbers
#'     }
#'   
#'   \item If mart is used instead of orgAnn, for valid parameter 
#'   feature_id_type and IDs2Add parameters, please refer to 
#'   \link[biomaRt:getBM]{getBM} in bioMart package. 
#'   Parameter feature_id_type should be one valid filter name listed by 
#'   \link[biomaRt:listFilters]{listFilters(mart)} such as ensemble_gene_id.
#'   And parameter IDs2Add should be one or more valid attributes name listed 
#'   by \link[biomaRt:listAttributes]{listAttributes(mart)} such as 
#'   external_gene_id, entrezgene, wikigene_name, or mirbase_transcript_name.
#'   
#' }
#' @return GRanges if the input is a GRanges or dataframe if input is a vector.
#' @references http://www.bioconductor.org/packages/release/data/annotation/
#' @author Jianhong Ou, Lihua Julie Zhu
#' @seealso \link[biomaRt:getBM]{getBM}, AnnotationDb
#' @export
#' @importFrom AnnotationDbi mget
#' @importFrom biomaRt getBM
#' @importFrom utils installed.packages
#' @examples
#' data(annotatedPeak)
#' library(org.Hs.eg.db)
#' addGeneIDs(annotatedPeak[1:6,],orgAnn="org.Hs.eg.db",
#'            IDs2Add=c("symbol","omim"))
#' ##addGeneIDs(annotatedPeak$feature[1:6],orgAnn="org.Hs.eg.db",
#' ##           IDs2Add=c("symbol","genename"))
#' if(interactive()){
#'   mart <- useMart("ENSEMBL_MART_ENSEMBL",host="www.ensembl.org",
#'                   dataset="hsapiens_gene_ensembl")
#'   ##mart <- useMart(biomart="ensembl",dataset="hsapiens_gene_ensembl")
#'   addGeneIDs(annotatedPeak[1:6,], mart=mart,
#'              IDs2Add=c("hgnc_symbol","entrezgene"))
#' }
#' @keywords misc

addGeneIDs<-function(annotatedPeak, orgAnn, IDs2Add=c("symbol"), 
                     feature_id_type="ensembl_gene_id",
                     silence=TRUE, 
                     mart){
    if (missing(annotatedPeak))
    {
        stop("Missing required argument annotatedPeak!",
             call.=FALSE)
    }
    if(missing(orgAnn) & missing(mart)){
        stop('no annotation database selected',
             call.=FALSE)
    }
    
    if(is(annotatedPeak, "GRanges")){
      feature_ids <- unique(annotatedPeak$feature)
    }else{
      if(is.character(annotatedPeak)){
        feature_ids <- unique(annotatedPeak)
      }else{
        stop("annotatedPeak needs to be GRanges type with 
                                 feature variable holding the feature id or a 
             character vector holding the IDs of the features 
             used to annotate the peaks!",call.=FALSE)
      }
    }
  
    feature_ids <- feature_ids[!is.na(feature_ids)]
    feature_ids <- feature_ids[feature_ids!=""]
    if (length(feature_ids) == 0)
    {
        stop("There is no feature column in annotatedPeak or 
             annotatedPeak has size 0!",call.=FALSE)
        
    }
    if(!missing(orgAnn)){
        if(is(orgAnn, "OrgDb")){
            orgAnn <- deparse(substitute(orgAnn))
        }
        if(!is(orgAnn, "character")){
            stop("orgAnn must be a character.")
        }
        if(!grepl(".eg.db",orgAnn,ignore.case=TRUE)){
            stop('Annotation database must be *.eg.db',call.=FALSE)
        }
        is.installed <- function(orgpkg) 
            is.element(orgpkg, installed.packages()[,1])
        if(!is.installed(orgAnn)){
            BiocManager::install(pkgs=orgAnn, 
                     update=FALSE, 
                     ask=FALSE)
        }
        if(!library(orgAnn, 
                    character.only=TRUE, 
                    logical.return=TRUE)){
            if(!silence) 
                message("No valid gene mapping package as 
                        argument orgAnn is passed in!")
            stop("Please refer 
                 http://www.bioconductor.org/packages/release/data/annotation/ 
                 for available org.xx.eg.db packages")
        }
        #	require(orgAnn,character.only = TRUE)
        orgAnn<-sub("\\.db$","",orgAnn,ignore.case=TRUE)
        #get Entrez ID::entrezIDs
        if(feature_id_type=="entrez_id"){
            m_ent<-as.data.frame(feature_ids, stringsAsFactors=FALSE)
            colnames(m_ent)<-c("entrez_id")
        }else{
            prefix<-switch(feature_id_type,
                           gene_alias       = "ALIAS",
                           gene_symbol      = "SYMBOL",
                           ensembl_gene_id  = "ENSEMBL",
                           refseq_id        = "REFSEQ",
                           "UNKNOWN"
            )
            if(prefix=="UNKNOWN"){
                stop("Currently only the following type of IDs are supported: 
entrez_id, gene_alias, ensembl_gene_id, refseq_id and gene_symbol!",
                     call.=FALSE)
            }
            tryCatch(env<-get(paste(orgAnn,prefix,"2EG",sep="")),
                     error = function(e){
                         stop(paste("Annotation database ",
                                    orgAnn,
                                    "2EG does not exist!\n
                                    \tPlease try to load annotation 
                                    database by library(",
                                    orgAnn,".db)",sep=""),call.=FALSE)
            })
            entrez <- AnnotationDbi::mget(feature_ids,env,ifnotfound=NA)
            gene_ids <- names(entrez)
            m_ent <- do.call(rbind,lapply(gene_ids,function(.ele){
                r = entrez[[.ele]]
                if(!is.na(r[1])) cbind(rep(.ele,length(r)),r)
                else {
                    if(!silence) message(paste("entrez id for '", 
                                               .ele, "' not found\n", 
                                               sep = ""))
                    c(.ele, NA)
                }
            }))
            m_ent<-as.data.frame(m_ent, stringsAsFactors=FALSE)
            m_ent<-m_ent[!is.na(m_ent[,1]), , drop=FALSE]
            colnames(m_ent)<-c(feature_id_type, "entrez_id")
            }
        entrezIDs<-as.character(m_ent$entrez_id)
        entrezIDs<-unique(entrezIDs)
        entrezIDs<-entrezIDs[!is.na(entrezIDs)]
        if(length(entrezIDs)==0){
            stop("No entrez identifier can be mapped by input data based on 
                 the feature_id_type.\nPlease consider to use correct 
                 feature_id_type, orgAnn or annotatedPeak\n",
                 call.=FALSE);
        }
        IDs2Add <- unique(IDs2Add)
        IDs2Add <- IDs2Add[IDs2Add!=feature_id_type]
        IDs <- unique(entrezIDs[!is.na(entrezIDs)])
        for(IDtoAdd in IDs2Add){
            x<-NULL
            if(!silence) message(paste("Adding",IDtoAdd,"... "))
            if(IDtoAdd!="entrez_id"){
                orgDB<-NULL
                tryCatch(orgDB<-get(paste(orgAnn,toupper(IDtoAdd),sep="")),
                         error = function(e){
                             if(!silence){
                                 message(paste("The IDs2Add you input, \"", 
                                               IDtoAdd, 
                                               "\", is not supported!\n",
                                               sep=""))
                    }
                })
                if(is.null(orgDB)){
                    IDs2Add<-IDs2Add[IDs2Add!=IDtoAdd]
                    next
                }
                if(!is(orgDB, "AnnDbBimap") & !is(orgDB, "IpiAnnDbMap")){
                    if(!silence){
                        message(paste("The IDs2Add you input, \"", 
                                      IDtoAdd, 
                                      "\", is not supported!\n",
                                      sep=""))
                    }
                    IDs2Add<-IDs2Add[IDs2Add!=IDtoAdd]
                    next
                }
                x <- AnnotationDbi::mget(IDs, orgDB,ifnotfound=NA)
                x <- sapply(x,base::paste,collapse=";")
                x <- as.data.frame(x, stringsAsFactors=FALSE)
                m_ent <- merge(m_ent, x, 
                               by.x="entrez_id",
                               by.y="row.names",
                               all.x=TRUE)
                colnames(m_ent)[length(colnames(m_ent))]<-IDtoAdd
            }
            if(!silence) message("done\n")
        }
        m_ent<-m_ent[, c(feature_id_type,IDs2Add), drop=FALSE]
}else{
    if(missing(mart) || !is(mart, "Mart")){
        stop('No valid mart object is passed in!',call.=FALSE)
    }
    IDs2Add<-unique(IDs2Add)
    IDs2Add<-IDs2Add[IDs2Add!=feature_id_type]
    tryCatch(m_ent<-
                 getBM(attributes=c(feature_id_type,IDs2Add),
                       filters = feature_id_type, 
                       values = feature_ids, mart=mart),
             error = function(e){
        stop(paste("Get error when calling getBM:", e, sep="\n"),
             call.=FALSE)
    })
    if(any(colnames(m_ent)!=c(feature_id_type, IDs2Add))) 
        colnames(m_ent) <- c(feature_id_type, IDs2Add)
}
if(!silence) message("prepare output ... ")
#dealing with multiple entrez_id for single feature_id
if(ncol(m_ent)==1) stop("None of IDs could be appended. Please double check IDs2Add.")
duplicated_ids <-
    m_ent[duplicated(m_ent[,feature_id_type]), feature_id_type]
if(length(duplicated_ids)>0){
    m_ent.duplicated <- m_ent[m_ent[,feature_id_type] %in% duplicated_ids,]
    m_ent.duplicated <- condenseMatrixByColnames(as.matrix(m_ent.duplicated),
                                                 feature_id_type)
    m_ent<-m_ent[!(m_ent[,feature_id_type] %in% duplicated_ids),]
    m_ent<-rbind(m_ent,m_ent.duplicated)
}
if (is(annotatedPeak, "GRanges")){
    #rearrange m_ent by annotatedPeak$feature
    #data.frame is very important for order...
    orderlist <- data.frame(annotatedPeak$feature)
    orderlist <- cbind(1:nrow(orderlist),orderlist)
    colnames(orderlist) <- c("orderid___",feature_id_type)
    m_ent <- merge(orderlist, m_ent, by=feature_id_type, all.x=TRUE)
    m_ent <- m_ent[order(m_ent[,"orderid___"]), 
                   c(feature_id_type, IDs2Add)]
    for(IDtoAdd in IDs2Add){
        mcols(annotatedPeak)[,IDtoAdd] <- m_ent[,IDtoAdd]
    }
}else{
    annotatedPeak <- m_ent
}
if(!silence) message("done\n")
annotatedPeak
}
LihuaJulieZhu/ChIPpeakAnno documentation built on Aug. 5, 2020, 12:02 a.m.