R/getMappedEntrezIDs.R
In missMethyl: Analysing Illumina HumanMethylation BeadChip Data

Documented in getMappedEntrezIDs

#' Get mapped Entrez Gene IDs from CpG probe names
#' 
#' Given a set of CpG probe names and optionally all the CpG sites tested, this
#' function outputs a list containing the mapped Entrez Gene IDs as well as the
#' numbers of probes per gene, and a vector indicating significance.
#' 
#' This function is used by the gene set testing functions \code{gometh} and
#' \code{gsameth}. It maps the significant CpG probe names to Entrez Gene IDs,
#' as well as all the CpG sites tested. It also calculates the numbers of
#' probes for gene. Input CpGs are able to be restricted by genomic features 
#' using the \code{genomic.features} argument.
#' 
#' Genes associated with each CpG site are obtained from the annotation package
#' \code{IlluminaHumanMethylation450kanno.ilmn12.hg19} if the array type is
#' "450K". For the EPIC array, the annotation package
#' \code{IlluminaHumanMethylationEPICanno.ilm10b4.hg19} is used. To use a
#' different annotation package, please supply it using the \code{anno}
#' argument.
#' 
#' @param sig.cpg Character vector of significant CpG sites used for testing
#' gene set enrichment.
#' @param all.cpg Character vector of all CpG sites tested. Defaults to all CpG
#' sites on the array.
#' @param array.type The Illumina methylation array used. Options are "450K" or
#' "EPIC".
#' @param anno Optional. A \code{DataFrame} object containing the complete
#' array annotation as generated by the \code{\link{minfi}}
#' \code{\link{getAnnotation}} function. Speeds up execution, if provided.
#' @param genomic.features Character vector or scalar indicating whether the 
#' gene set enrichment analysis should be restricted to CpGs from specific 
#' genomic locations. Options are "ALL", "TSS200","TSS1500","Body","1stExon",
#' "3'UTR","5'UTR","ExonBnd"; and the user can select any combination. Defaults
#' to "ALL".
#' @return A list with the following elements \item{sig.eg}{ mapped Entrez Gene
#' IDs for the significant probes } \item{universe}{ mapped Entrez Gene IDs for
#' all probes on the array, or for all the CpG probes tested.  } \item{freq}{
#' table output with numbers of probes associated with each gene } 
#' \item{equiv}{ table output with equivalent numbers of probes associated
#' with each gene taking into account multi-gene bias} \item{de}{ a
#' vector of ones and zeroes of the same length of universe indicating which
#' genes in the universe are significantly differentially methylated.  }
#' \item{fract.counts}{ a dataframe with 2 columns corresponding to the Entrez
#' Gene IDS for the significant probes and the associated weight to account for
#' multi-gene probes.  }
#' @author Belinda Phipson
#' @seealso \code{\link{gometh},\link{gsameth}}
#' @examples
#' 
#' \dontrun{ # to avoid timeout on Bioconductor build
#' library(IlluminaHumanMethylation450kanno.ilmn12.hg19)
#' library(org.Hs.eg.db)
#' library(limma)
#' ann <- getAnnotation(IlluminaHumanMethylation450kanno.ilmn12.hg19)
#' 
#' # Randomly select 1000 CpGs to be significantly differentially methylated
#' sigcpgs <- sample(rownames(ann),1000,replace=FALSE)
#' 
#' # All CpG sites tested
#' allcpgs <- rownames(ann)
#' 
#' mappedEz <- getMappedEntrezIDs(sigcpgs,allcpgs,array.type="450K")
#' names(mappedEz)
#' # Entrez IDs of the significant genes
#' mappedEz$sig.eg[1:10]
#' # Entrez IDs for the universe
#' mappedEz$universe[1:10]
#' # Number of CpGs per gene
#' mappedEz$freq[1:10]
#' # Equivalent numbers of CpGs measured per gene
#' mappedEz$equiv[1:10]
#' A vector of 0s and 1s indicating which genes in the universe are significant
#' mappedEz$de[1:10]
#' }
#' 
#' @export getMappedEntrezIDs
getMappedEntrezIDs <- function(sig.cpg, all.cpg=NULL, 
                               array.type=c("450K","EPIC"), anno=NULL, 
                               genomic.features = c("ALL", "TSS200","TSS1500",
                                                    "Body","1stExon","3'UTR",
                                                    "5'UTR","ExonBnd"))
    # From a list of CpG sites, obtain the Entrez Gene IDs that are used for 
    # testing pathway enrichment
    # Belinda Phipson & Jovana Maksimovic
    # 10 February 2016
    # Updated 12 May 2020 to allow restricting sig.cpg by genomic features
{
    # check input
    sig.cpg <- as.character(sig.cpg)
    sig.cpg <- sig.cpg[!is.na(sig.cpg)]
    
    array.type <- match.arg(toupper(array.type), c("450K","EPIC"))    
    genomic.features <- match.arg(genomic.features, c("ALL", "TSS200","TSS1500",
                                                      "Body", "1stExon","3'UTR",
                                                      "5'UTR","ExonBnd"), 
                                  several.ok = TRUE)
    
    if(length(genomic.features) > 1 & any(grepl("ALL", genomic.features))){
        message("All input CpGs are used for testing.") 
        genomic.features <- "ALL"   
    } 
    
    if(array.type == "450K" & any(grepl("ExonBnd", genomic.features))){
        stop("'ExonBnd' is not an annotated feature on 450K arrays,
           please remove it from your genomic.feature parameter
           specification.") 
    }
    
    # Get annotaton in appropriate format
    if(is.null(anno)){
        flat.u <- .getFlatAnnotation(array.type)
    } else {
        flat.u <- .getFlatAnnotation(array.type,anno)
    }
    
    if(is.null(all.cpg)) {
        all.cpg <- unique(flat.u$cpg)
    } else {
        all.cpg <- as.character(all.cpg)
        all.cpg <- all.cpg[!is.na(all.cpg)]
        all.cpg <- unique(all.cpg)
    }
    
    # remove CpGs from annotation that are not in all.cpg
    m_all <- match(flat.u$cpg, all.cpg)
    flat.u = flat.u[!is.na(m_all),]
    
    # map CpG sites to entrez gene id's
    sig.cpg <- unique(sig.cpg)
    
    m1 <- match(flat.u$cpg,sig.cpg)
    
    #eg.sig <- flat.u$entrezid[!is.na(m1)]
    if(any(grepl("ALL", genomic.features))){
        eg.sig <- flat.u$entrezid[!is.na(m1)]
    } else {
        # select only genes with sig. CpGs mapping to certain genomic features
        eg.sig <- flat.u$entrezid[!is.na(m1) & flat.u$group %in% genomic.features]
    }
    
    eg.sig <- unique(eg.sig)
    if(length(eg.sig)==0) {
        stop("There are no genes annotated to the significant CpGs")
    }
    
    m2 <- match(flat.u$cpg,all.cpg)
    eg.all <- flat.u$entrezid[!is.na(m2)]
    
    freq_genes <- table(eg.all)
    eg.universe <- names(freq_genes)
    
    test.de <- as.integer(eg.universe %in% eg.sig)
    
    sorted.eg.sig <- eg.universe[test.de==1]
    
    multimap <- data.frame(table(flat.u$cpg))
    multimap$Var1 <- as.character(multimap$Var1)
    m3 <- match(flat.u$cpg, multimap$Var1)
    flat.u$multimap <- multimap$Freq[m3]
    
    flat.u$inv.multimap <- 1/flat.u$multimap
    
    equivN <- tapply(flat.u$inv.multimap,flat.u$entrezid,sum)
    mm <- match(eg.universe,names(equivN))
    equivN <- equivN[mm]
    
    #sig.flat <- flat.u[!is.na(m1),]
    if(any(grepl("ALL", genomic.features))){
        sig.flat <- flat.u[!is.na(m1),]
    } else {
        # select only CpGs that map to certain genomic features
        sig.flat <- flat.u[!is.na(m1) & flat.u$group %in% genomic.features, ]
    }
    
    fract <- data.frame(weight=pmin(tapply(1/sig.flat$multimap,
                                           sig.flat$entrezid,sum),
                                    1))
    
    m4 <- match(sorted.eg.sig,rownames(fract))
    fract.counts <- fract$weight[m4]
    
    out <- list(sig.eg = sorted.eg.sig, universe = eg.universe, 
                freq = freq_genes, equiv =  equivN, de = test.de, 
                fract.counts = data.frame(sigid=sorted.eg.sig,frac=fract.counts))
    out
}