#' Get mapped Entrez Gene IDs from CpG probe names
#'
#' Given a set of CpG probe names and optionally all the CpG sites tested, this
#' function outputs a list containing the mapped Entrez Gene IDs as well as the
#' numbers of probes per gene, and a vector indicating significance.
#'
#' This function is used by the gene set testing functions \code{gometh} and
#' \code{gsameth}. It maps the significant CpG probe names to Entrez Gene IDs,
#' as well as all the CpG sites tested. It also calculates the numbers of
#' probes for gene. Input CpGs are able to be restricted by genomic features
#' using the \code{genomic.features} argument.
#'
#' Genes associated with each CpG site are obtained from the annotation package
#' \code{IlluminaHumanMethylation450kanno.ilmn12.hg19} if the array type is
#' "450K". For the EPIC array, the annotation package
#' \code{IlluminaHumanMethylationEPICanno.ilm10b4.hg19} is used. To use a
#' different annotation package, please supply it using the \code{anno}
#' argument.
#'
#' @param sig.cpg Character vector of significant CpG sites used for testing
#' gene set enrichment.
#' @param all.cpg Character vector of all CpG sites tested. Defaults to all CpG
#' sites on the array.
#' @param array.type The Illumina methylation array used. Options are "450K" or
#' "EPIC".
#' @param anno Optional. A \code{DataFrame} object containing the complete
#' array annotation as generated by the \code{\link{minfi}}
#' \code{\link{getAnnotation}} function. Speeds up execution, if provided.
#' @param genomic.features Character vector or scalar indicating whether the
#' gene set enrichment analysis should be restricted to CpGs from specific
#' genomic locations. Options are "ALL", "TSS200","TSS1500","Body","1stExon",
#' "3'UTR","5'UTR","ExonBnd"; and the user can select any combination. Defaults
#' to "ALL".
#' @return A list with the following elements \item{sig.eg}{ mapped Entrez Gene
#' IDs for the significant probes } \item{universe}{ mapped Entrez Gene IDs for
#' all probes on the array, or for all the CpG probes tested. } \item{freq}{
#' table output with numbers of probes associated with each gene }
#' \item{equiv}{ table output with equivalent numbers of probes associated
#' with each gene taking into account multi-gene bias} \item{de}{ a
#' vector of ones and zeroes of the same length of universe indicating which
#' genes in the universe are significantly differentially methylated. }
#' \item{fract.counts}{ a dataframe with 2 columns corresponding to the Entrez
#' Gene IDS for the significant probes and the associated weight to account for
#' multi-gene probes. }
#' @author Belinda Phipson
#' @seealso \code{\link{gometh},\link{gsameth}}
#' @examples
#'
#' \dontrun{ # to avoid timeout on Bioconductor build
#' library(IlluminaHumanMethylation450kanno.ilmn12.hg19)
#' library(org.Hs.eg.db)
#' library(limma)
#' ann <- getAnnotation(IlluminaHumanMethylation450kanno.ilmn12.hg19)
#'
#' # Randomly select 1000 CpGs to be significantly differentially methylated
#' sigcpgs <- sample(rownames(ann),1000,replace=FALSE)
#'
#' # All CpG sites tested
#' allcpgs <- rownames(ann)
#'
#' mappedEz <- getMappedEntrezIDs(sigcpgs,allcpgs,array.type="450K")
#' names(mappedEz)
#' # Entrez IDs of the significant genes
#' mappedEz$sig.eg[1:10]
#' # Entrez IDs for the universe
#' mappedEz$universe[1:10]
#' # Number of CpGs per gene
#' mappedEz$freq[1:10]
#' # Equivalent numbers of CpGs measured per gene
#' mappedEz$equiv[1:10]
#' A vector of 0s and 1s indicating which genes in the universe are significant
#' mappedEz$de[1:10]
#' }
#'
#' @export getMappedEntrezIDs
getMappedEntrezIDs <- function(sig.cpg, all.cpg=NULL,
array.type=c("450K","EPIC"), anno=NULL,
genomic.features = c("ALL", "TSS200","TSS1500",
"Body","1stExon","3'UTR",
"5'UTR","ExonBnd"))
# From a list of CpG sites, obtain the Entrez Gene IDs that are used for
# testing pathway enrichment
# Belinda Phipson & Jovana Maksimovic
# 10 February 2016
# Updated 12 May 2020 to allow restricting sig.cpg by genomic features
{
# check input
sig.cpg <- as.character(sig.cpg)
sig.cpg <- sig.cpg[!is.na(sig.cpg)]
array.type <- match.arg(toupper(array.type), c("450K","EPIC"))
genomic.features <- match.arg(genomic.features, c("ALL", "TSS200","TSS1500",
"Body", "1stExon","3'UTR",
"5'UTR","ExonBnd"),
several.ok = TRUE)
if(length(genomic.features) > 1 & any(grepl("ALL", genomic.features))){
message("All input CpGs are used for testing.")
genomic.features <- "ALL"
}
if(array.type == "450K" & any(grepl("ExonBnd", genomic.features))){
stop("'ExonBnd' is not an annotated feature on 450K arrays,
please remove it from your genomic.feature parameter
specification.")
}
# Get annotaton in appropriate format
if(is.null(anno)){
flat.u <- .getFlatAnnotation(array.type)
} else {
flat.u <- .getFlatAnnotation(array.type,anno)
}
if(is.null(all.cpg)) {
all.cpg <- unique(flat.u$cpg)
} else {
all.cpg <- as.character(all.cpg)
all.cpg <- all.cpg[!is.na(all.cpg)]
all.cpg <- unique(all.cpg)
}
# remove CpGs from annotation that are not in all.cpg
m_all <- match(flat.u$cpg, all.cpg)
flat.u = flat.u[!is.na(m_all),]
# map CpG sites to entrez gene id's
sig.cpg <- unique(sig.cpg)
m1 <- match(flat.u$cpg,sig.cpg)
#eg.sig <- flat.u$entrezid[!is.na(m1)]
if(any(grepl("ALL", genomic.features))){
eg.sig <- flat.u$entrezid[!is.na(m1)]
} else {
# select only genes with sig. CpGs mapping to certain genomic features
eg.sig <- flat.u$entrezid[!is.na(m1) & flat.u$group %in% genomic.features]
}
eg.sig <- unique(eg.sig)
if(length(eg.sig)==0) {
stop("There are no genes annotated to the significant CpGs")
}
m2 <- match(flat.u$cpg,all.cpg)
eg.all <- flat.u$entrezid[!is.na(m2)]
freq_genes <- table(eg.all)
eg.universe <- names(freq_genes)
test.de <- as.integer(eg.universe %in% eg.sig)
sorted.eg.sig <- eg.universe[test.de==1]
multimap <- data.frame(table(flat.u$cpg))
multimap$Var1 <- as.character(multimap$Var1)
m3 <- match(flat.u$cpg, multimap$Var1)
flat.u$multimap <- multimap$Freq[m3]
flat.u$inv.multimap <- 1/flat.u$multimap
equivN <- tapply(flat.u$inv.multimap,flat.u$entrezid,sum)
mm <- match(eg.universe,names(equivN))
equivN <- equivN[mm]
#sig.flat <- flat.u[!is.na(m1),]
if(any(grepl("ALL", genomic.features))){
sig.flat <- flat.u[!is.na(m1),]
} else {
# select only CpGs that map to certain genomic features
sig.flat <- flat.u[!is.na(m1) & flat.u$group %in% genomic.features, ]
}
fract <- data.frame(weight=pmin(tapply(1/sig.flat$multimap,
sig.flat$entrezid,sum),
1))
m4 <- match(sorted.eg.sig,rownames(fract))
fract.counts <- fract$weight[m4]
out <- list(sig.eg = sorted.eg.sig, universe = eg.universe,
freq = freq_genes, equiv = equivN, de = test.de,
fract.counts = data.frame(sigid=sorted.eg.sig,frac=fract.counts))
out
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.