R/goregion.R
In missMethyl: Analysing Illumina HumanMethylation BeadChip Data

Documented in goregion

#' Gene ontology testing of DMRs for Ilumina methylation array data
#' 
#' Tests gene ontology or KEGG pathway enrichment for differentially methylated
#' regions (DMRs) identified from Illumina's Infinium HumanMethylation450 or
#' MethylationEPIC array, taking into account the differing number of probes
#' per gene present on the array.
#' 
#' This function takes a \code{GRanges} object of DMR coordinates, maps them to
#' CpG sites on the array and then to Entrez Gene IDs, and tests for GO term or
#' KEGG pathway enrichment using Wallenius' noncentral hypergeometric test, 
#' taking into account the number of CpG sites per gene on the 450K/EPIC array.  
#' If \code{prior.prob} is set to FALSE, then prior probabilities are not used 
#' and it is assumed that each gene is equally likely to have a significant CpG 
#' site associated with it. Please not that we have tested \code{goregion} and 
#' \code{gsaregion} extensively using the \code{DMRCate} package to identify 
#' differentially methylated regions (Peters, et al., 2015).
#' 
#' The testing now also takes into account that some CpGs map to multiple genes. 
#' For a small number of gene families, this previously caused their associated 
#' GO categories/gene sets to be erroneously overrepresented and thus highly 
#' significant. If \code{fract.counts=FALSE} then CpGs are allowed to map to 
#' multiple genes (this is NOT recommended).
#' 
#' Genes associated with each CpG site are obtained from the annotation
#' package \code{IlluminaHumanMethylation450kanno.ilmn12.hg19} if the array
#' type is "450K". For the EPIC array, the annotation package
#' \code{IlluminaHumanMethylationEPICanno.ilm10b4.hg19} is used. To use a
#' different annotation package, please supply it using the \code{anno}
#' argument. 
#' 
#' In order to get a list which contains the mapped Entrez gene IDS,
#' please use the \code{getMappedEntrezIDs} function. \code{goregion} tests all
#' GO or KEGG terms, and false discovery rates are calculated using the method
#' of Benjamini and Hochberg (1995). The \code{topGSA} function can be used to 
#' display the top 20 most enriched pathways.
#' 
#' If you are interested in which genes overlap with the genes in the gene set, 
#' setting \code{sig.genes} to TRUE will output an additional column in the 
#' results data frame that contains all the significant differentially 
#' methylated gene symbols, comma separated. The default is FALSE.
#' 
#' For more generalised gene set testing where the user can specify the gene
#' set/s of interest to be tested, please use the \code{gsaregion} function.
#' 
#' 
#' 
#' @param regions \code{GRanges} object of DMR coordinates to test for GO term
#' enrichment.
#' @param all.cpg Character vector of all CpG sites tested. Defaults to all CpG
#' sites on the array.
#' @param collection The collection of pathways to test. Options are "GO" and
#' "KEGG". Defaults to "GO".
#' @param array.type The Illumina methylation array used. Options are "450K" or
#' "EPIC". Defaults to "450K".
#' @param plot.bias Logical, if true a plot showing the bias due to the
#' differing numbers of probes per gene will be displayed.
#' @param prior.prob Logical, if true will take into account the probability of
#' significant differentially methylation due to numbers of probes per gene. If
#' false, a hypergeometric test is performed ignoring any bias in the data.
#' @param anno Optional. A \code{DataFrame} object containing the complete
#' array annotation as generated by the \code{\link{minfi}}.
#' \code{\link{getAnnotation}} function. Speeds up execution, if provided.
#' @param equiv.cpg Logical, if true then equivalent numbers of cpgs are used
#' for odds calculation rather than total number cpgs. Only used if 
#' \code{prior.prob=TRUE}.
#' @param fract.counts Logical, if true then fractional counting of cpgs is used
#' to account for cgps that map to multiple genes. Only used if 
#' \code{prior.prob=TRUE}.
#' @param genomic.features Character vector or scalar indicating whether the 
#' gene set enrichment analysis should be restricted to CpGs from specific 
#' genomic locations. Options are "ALL", "TSS200","TSS1500","Body","1stExon",
#' "3'UTR","5'UTR","ExonBnd"; and the user can select any combination. Defaults
#' to "ALL".
#' @param sig.genes Logical, if true then the significant differentially 
#' methylated genes that overlap with the gene set of interest is outputted 
#' as the final column in the results table. Default is FALSE.
#' 
#' @return A data frame with a row for each GO or KEGG term and the following
#' columns: \item{Term}{ GO term if testing GO pathways } \item{Ont}{ ontology
#' that the GO term belongs to if testing GO pathways. "BP" - biological
#' process, "CC" - cellular component, "MF" - molecular function.  }
#' \item{Pathway}{ the KEGG pathway being tested if testing KEGG terms.  }
#' \item{N}{ number of genes in the GO or KEGG term } \item{DE}{ number of
#' genes that are differentially methylated } \item{P.DE}{ p-value for
#' over-representation of the GO or KEGG term term } \item{FDR}{ False
#' discovery rate } \item{SigGenesInSet}{ Significant differentially methylated 
#' genes overlapping with the gene set of interest. }
#' 
#' @author Jovana Maksimovic
#' @seealso \code{\link{gometh},\link{gsameth},\link{gsaregion}}
#' 
#' @references Phipson, B., Maksimovic, J., and Oshlack, A. (2016). missMethyl:
#' an R package for analysing methylation data from Illuminas
#' HumanMethylation450 platform. \emph{Bioinformatics}, \bold{15};32(2),
#' 286--8. 
#' 
#' Geeleher, P., Hartnett, L., Egan, L. J., Golden, A., Ali, R. A. R.,
#' and Seoighe, C. (2013). Gene-set analysis is severely biased when applied to
#' genome-wide methylation data. \emph{Bioinformatics}, \bold{29}(15),
#' 1851--1857. 
#' 
#' Young, M. D., Wakefield, M. J., Smyth, G. K., and Oshlack, A.
#' (2010). Gene ontology analysis for RNA-seq: accounting for selection bias.
#' \emph{Genome Biology}, 11, R14. 
#' 
#' Ritchie, M. E., Phipson, B., Wu, D., Hu, Y.,
#' Law, C. W., Shi, W., and Smyth, G. K. (2015). limma powers differential
#' expression analyses for RNA-sequencing and microarray studies. \emph{Nucleic
#' Acids Research}, gkv007. 
#' 
#' Benjamini, Y., and Hochberg, Y. (1995). Controlling
#' the false discovery rate: a practical and powerful approach to multiple
#' testing. \emph{Journal of the Royal Statistical Society Series}, B,
#' \bold{57}, 289-300.
#' 
#' Peters, T.J., Buckley, M.J., Statham, A.L., Pidsley, R., Samaras, K., 
#' Lord, R.V., Clark, S.J.,Molloy, P.L. (2015). De novo identification of 
#' differentially methylated regions in the human genome. \emph{Epigenetics & 
#' Chromatin}, \bold{8}, 6.
#' 
#' @examples
#' 
#' \dontrun{ # to avoid timeout on Bioconductor build
#' library(IlluminaHumanMethylationEPICanno.ilm10b4.hg19)
#' library(limma)
#' library(DMRcate)
#' library(ExperimentHub)
#' 
#' # Follow the example for the dmrcate function to get some EPIC data from
#' # ExperimentHub
#' eh <- ExperimentHub()
#' FlowSorted.Blood.EPIC <- eh[["EH1136"]]
#' tcell <- FlowSorted.Blood.EPIC[,colData(FlowSorted.Blood.EPIC)$CD4T==100 |
#'                                 colData(FlowSorted.Blood.EPIC)$CD8T==100]
#' detP <- detectionP(tcell)
#' remove <- apply(detP, 1, function (x) any(x > 0.01))
#' tcell <- tcell[!remove,]
#' tcell <- preprocessFunnorm(tcell)
#' #Subset to chr2 only
#' tcell <- tcell[seqnames(tcell) == "chr2",]
#' tcellms <- getM(tcell)
#' tcellms.noSNPs <- rmSNPandCH(tcellms, dist=2, mafcut=0.05)
#' tcell$Replicate[tcell$Replicate==""] <- tcell$Sample_Name[tcell$Replicate==""]
#' tcellms.noSNPs <- avearrays(tcellms.noSNPs, tcell$Replicate)
#' tcell <- tcell[,!duplicated(tcell$Replicate)]
#' tcell <- tcell[rownames(tcellms.noSNPs),]
#' colnames(tcellms.noSNPs) <- colnames(tcell)
#' assays(tcell)[["M"]] <- tcellms.noSNPs
#' assays(tcell)[["Beta"]] <- ilogit2(tcellms.noSNPs)
#' 
#' # Perform region analysis
#' type <- factor(tcell$CellType)
#' design <- model.matrix(~type) 
#' myannotation <- cpg.annotate("array", tcell, arraytype = "EPIC",
#'                              analysis.type="differential", design=design, 
#'                              coef=2)
#' # Run DMRCate with beta value cut-off filter of 0.1                              
#' dmrcoutput <- dmrcate(myannotation, lambda=1000, C=2, betacutoff = 0.1)
#' regions <- extractRanges(dmrcoutput)
#' length(regions)
#' 
#' ann <- getAnnotation(IlluminaHumanMethylationEPICanno.ilm10b4.hg19)
#' # All CpG sites tested (limited to chr 2)
#' allcpgs <- rownames(tcell)
#' # GO testing with prior probabilities taken into account
#' # Plot of bias due to differing numbers of CpG sites per gene
#' gst <- goregion(regions = regions, all.cpg = allcpgs, collection = "GO", 
#'                 array.type = "EPIC", plot.bias = TRUE, prior.prob = TRUE, 
#'                 anno = ann)
#' # Table of top GO results
#' topGSA(gst, n=10)
#' 
#' # KEGG testing
#' kegg <- goregion(regions = regions, all.cpg = allcpgs, collection = "KEGG", 
#'                  array.type = "EPIC", prior.prob=TRUE, anno = ann)
#' # Table of top KEGG results
#' topGSA(kegg, n=10)
#' 
#' # Restrict to promoter regions
#' gst.prom <- goregion(regions = regions, all.cpg = allcpgs, collection = "GO", 
#'                 array.type = "EPIC", plot.bias = TRUE, prior.prob = TRUE, 
#'                 anno = ann, genomic.features = c("TSS200","TSS1500"))
#' topGSA(gst.prom, n=10)
#' 
#' # Add significant genes in gene set to KEGG output
#' kegg <- goregion(regions = regions, all.cpg = allcpgs, collection = "KEGG", 
#'                  array.type = "EPIC", prior.prob=TRUE, anno = ann, 
#'                  sig.genes = TRUE)
#' # Table of top KEGG results
#' topGSA(kegg, n=5)
#' }
#' 
#' @export goregion
goregion <- function(regions, all.cpg=NULL, collection=c("GO","KEGG"), 
                     array.type = c("450K","EPIC"), plot.bias=FALSE, 
                     prior.prob=TRUE, anno=NULL, equiv.cpg = TRUE,
                     fract.counts = TRUE, 
                     genomic.features = c("ALL", "TSS200","TSS1500","Body",
                                          "1stExon","3'UTR","5'UTR","ExonBnd"),
                     sig.genes = FALSE)
  # Gene ontology testing or KEGG pathway analysis for differentially methylated regions
  # Takes into account probability of differential methylation based on
  # numbers of probes on array per gene
  # Jovana Maksimovic
  # 26 April 2019. Last updated 1 September 2020.
{
    array.type <- match.arg(toupper(array.type), c("450K","EPIC"))    
    collection <- match.arg(toupper(collection), c("GO","KEGG"))
    genomic.features <- match.arg(genomic.features, c("ALL", "TSS200","TSS1500",
                                                      "Body", "1stExon","3'UTR",
                                                      "5'UTR","ExonBnd"), 
                                  several.ok = TRUE)
    
    if(length(genomic.features) > 1 & any(grepl("ALL", genomic.features))){
      message("All input CpGs are used for testing.") 
      genomic.features <- "ALL"   
    } 
    
    if(array.type == "450K" & any(grepl("ExonBnd", genomic.features))){
        stop("'ExonBnd' is not an annotated feature on 450K arrays,\n
             please remove it from your genomic.feature parameter\n
             specification.") 
    }
  
  if(is.null(anno)){
    if(array.type=="450K"){
      anno <- minfi::getAnnotation(IlluminaHumanMethylation450kanno.ilmn12.hg19::IlluminaHumanMethylation450kanno.ilmn12.hg19)
    } else {
      anno <- minfi::getAnnotation(IlluminaHumanMethylationEPICanno.ilm10b4.hg19::IlluminaHumanMethylationEPICanno.ilm10b4.hg19)
    }
  }
  
  if(!is.null(all.cpg)){
    anno <- anno[all.cpg,]
  }
  
  cpgs <- GenomicRanges::GRanges(seqnames = anno$chr, 
                  ranges = IRanges::IRanges(start = anno$pos, 
                                   end = anno$pos),
                  strand = anno$strand,
                  name = anno$Name)
  
  overlaps <- GenomicRanges::findOverlaps(cpgs,regions)
  sig.cpg <- cpgs$name[from(overlaps)]
  
  result <- gometh(sig.cpg=sig.cpg, all.cpg=all.cpg, collection=collection, 
                   array.type=array.type, plot.bias=plot.bias, 
                   prior.prob=prior.prob, anno=anno, equiv.cpg=equiv.cpg,
                   fract.counts=fract.counts, 
                   genomic.features = genomic.features,
                   sig.genes = sig.genes)
  result
}