R/LMC_interpretation.R

Defines functions lmc.go.enrichment do.go.plot do.lola.plot lmc.go.plots.tables lmc.lola.plots.tables load.lola.for.medecom lmc.lola.enrichment

Documented in lmc.go.enrichment lmc.go.plots.tables lmc.lola.enrichment lmc.lola.plots.tables load.lola.for.medecom

###############################################################################################################
# LMC_interpreation.R
# functions to interpret LMCs and to assign potential functions to the LMCs
###############################################################################################################

#' lmc.lola.enrichment
#' 
#' This routine computes LOLA enrichment results for LMC-specifically hypo- or hypermethylated sites. 
#' 
#' @param medecom.result An object of type \code{\link{MeDeComSet-class}} or the location of an .RData file, where
#'                 such an object is stored.
#' @param annotation.filter A numeric vector specifying the sites that have been removed from \code{rnb.set} in a
#'                 preprocessing step (e.g. coverage filtering) or a path to an .RData file.
#' @param anno.data The original \code{\link[RnBeads]{RnBSet-class}} object containing methylation, sample meta and annotation
#'                 information, a path to a directory stored by \code{\link[RnBeads]{save.rnb.set}} or a data.frame containing
#'                 CpG annotations (ann_C)
#' @param K The number of LMCs specified for the MeDeCom run.
#' @param lambda The lambda parameter selected.
#' @param cg_subset The index of the selection strategy employed (e.g. most variable CpGs).
#' @param diff.threshold The difference cutoff between median methylation in the remaining LMCs and the LMC of interest
#'                  used to call a CpG differentially methylated. The higher this value, the more conservative the
#'                  selection.
#' @param reference.computation Metric used to set the reference on the remaining LMCs to determine hyper- and hypomethylated sites.
#'                  Can be either \code{"median"} (default), \code{"mean"}, or \code{"lmcs"} (\code{comp.lmcs} argument needs to be provided).
#' @param comp.lmcs Numeric vector containing two numbers representing the LMCs that should be compared to one another.                 
#' @param region.type Region type used to annotate CpGs to potentially regulatory regions (see \url{https://rnbeads.org/regions.html})
#'                  for a list of available region types.
#' @param temp.dir Path to a directory used to store temporary files.
#' @param type Which direction is to be tested for enrichment. Can be one of "hypo", "hyper", or "differential"
#' @param assembly The assembly used. Needs to be one of "hg19", "hg38" or "mm10". Does not need to be specified, if rnb.set is a
#'                 \code{\link{RnBSet-class}}
#' @param lola.db A loaded LOLA database as loaded with LOLA::loadRegionDB. If this value is NULL, the database is loaded
#'                 automatically and stored in the temporary directory.
#' @return A list with K elements. One element is the enrichment result of the corresponding LMC-specific hypomethylated CpG sites.
#' @export
#' @details This function employs LOLA on the CpG sites that are LMC-specifically hypomethylated, after annotating 
#'                  the sites to the closest region defined by \code{region.type}. The sites are selected by computing
#'                  the median methylation value of the other LMCs and then selecting those sites that are more than 
#'                  \code{diff.threshold} away from the median in the LMC of interest. This is done for all LMCs from
#'                  1 to K.
#' @seealso lmc.lola.plot.tables
#' @author Michael Scherer                                

lmc.lola.enrichment <- function(medecom.result,
                                annotation.filter=NULL,
                                anno.data,
                                K=NULL,
                                lambda=NULL,
                                cg_subset=NULL,
                                diff.threshold=0.5,
                                reference.computation="median",
                                comp.lmcs=NULL,
                                region.type="ensembleRegBuildBPall",
                                temp.dir=tempdir(),
                                type="hypo",
                                assembly="hg19",
                                lola.db=NULL){
  #require("RnBeads")
  if(!requireNamespace("LOLA")){
    if(!requireNamespace("BiocManager")) install.packages("BiocManager",repos="https://cloud.r-project.org/")
    BiocManager::install("LOLA",dependencies=T)
    if(!requireNamespace("simpleCache")) install.packages("simpleCache",repos="https://cloud.r-project.org/")
  }
  require("LOLA")
  require("qvalue")
  rnb.mode <- F
  if(is.null(cg_subset)){
    cg_subset <- medecom.result@parameters$cg_subsets[1]
  }else if(!(cg_subset %in% medecom.result@parameters$cg_subsets)){
    stop("Invalid value for cg_subset; not available in medecom.result")
  }
  if(is.null(K)){
    K <- medecom.result@parameters$Ks[1]
  }else if(!(K %in% medecom.result@parameters$Ks)){
    stop("Invalid value for K; not available in medecom.result")
  }
  if(is.null(lambda)){
    lambda <- medecom.result@parameters$lambdas[1]
  }else if(!(lambda %in% medecom.result@parameters$lambdas)){
    stop("Invalid value for lambdas; not available in medecom.result")
  }
  if(inherits(anno.data,"RnBSet")){
    rnb.mode <- T
    assembly <- assembly(anno.data)
  }
  if(is.character(medecom.result)){
    new.envi <- new.env()
    load(medecom.result,envir = new.envi)
    medecom.result <- get(ls(envir = new.envi),envir = new.envi)
  }
  if(is.character(annotation.filter)){
    new.envi <- new.env()
    load(annotation.filter,envir = new.envi)
    annotation.filter <- get(ls(envir = new.envi),envir = new.envi)
  }
  if(!reference.computation %in% c("median","mean","lmcs")){
    stop("Invalid value for reference.computation: Only mean and median are supported")
  }else{
    if(reference.computation %in% "median"){
      ref.func <- rowMedians
    }else if(reference.computation %in% "mean"){
      ref.func <- rowMeans
    }else{
      if(any(comp.lmcs > K) || length(comp.lmcs) != 2){
        stop("Invalid value for comp.lmcs: Should specify two LMC used for comparison")
      }
    }
  }
  if(is.character(anno.data)){
    options(fftempdir=temp.dir)
    anno.data <- load.rnb.set(anno.data)
  }
  if(!region.type %in% rnb.region.types()){
    rnb.load.annotation.from.db(region.type,assembly=assembly)
  }
  agg.type <- unlist(rnb.get.annotation(region.type,assembly=assembly))
  if(rnb.mode){
    anno <- annotation(anno.data)
  }else{
    anno <- anno.data
    rm(anno.data)
  }
  if(!is.null(annotation.filter)){
    anno <- anno[annotation.filter,]
  }
  if(!is.null(medecom.result@parameters$GROUP_LISTS)){
    sset <- medecom.result@parameters$GROUP_LIST[[cg_subset]]
    anno <- anno[sset,]
  }
  anno <- GRanges(Rle(anno$Chromosome),IRanges(start=anno$Start,end=anno$End))
  op <- findOverlaps(anno,agg.type)
  agg.type <- agg.type[subjectHits(op)]
  if(is.null(lola.db)){
    lola.db <- load.lola.for.medecom(temp.dir,assembly = assembly)
  }
  lmcs <- getLMCs(medecom.result,cg_subset=cg_subset,K=K,lambda=lambda)
  lola.results <- list()
  for(i in 1:K){
    if(reference.computation %in% "lmcs"){
      i <- comp.lmcs[1]
      first.lmc <-  lmcs[,i]
      ref.lmc <- lmcs[,comp.lmcs[2]]
    }else{
      first.lmc <- lmcs[,i]
      ref.lmc <- ref.func(lmcs[,-i,drop=F])
    }
    lmc.diff <- ref.lmc - first.lmc
    if(type=="hypo"){
      is.hypo <- lmc.diff > diff.threshold
    }else if (type=="hyper"){
      is.hypo <- lmc.diff < -diff.threshold
    }else if (type=="differential"){
      is.hypo <- (lmc.diff < -diff.threshold) | (lmc.diff > diff.threshold)
    }else{
      stop("Invalid value for type, needs to be one of 'hypo', 'hyper' or 'differential'")
    }
    first.sites <- anno[is.hypo]
    op <- findOverlaps(first.sites,agg.type)
    first.sites <- agg.type[subjectHits(op)]
    lola.res <- NA
    if(length(first.sites)>0){
      lola.res <- runLOLA(userSets=unique(first.sites),userUniverse=agg.type,regionDB=lola.db)
    }
    comp <- paste0("LMC",i)
    lola.results[[comp]] <- lola.res
    print(paste("LMC",i,"processed"))
    if(reference.computation %in% "lmcs"){
      break
    }
  }
  return(lola.results)
}

#' load.lola.for.medecom
#' 
#' This functions downloads and loads the LOLA database in the specified directory. Should only be called once per session to save time.
#' 
#' @param dir.path A path to a directory, where the LOLA database is to be downloaded. Defaults to the temporary directory.
#' @param assembly The assembly to be used.
#' @return The loaded LOLA database
#' @export
#' @author Michael Scherer
load.lola.for.medecom <- function(dir.path=tempdir(),assembly="hg19"){
  require("LOLA")
  require("simpleCache")
  dir <- downloadLolaDbs(dir.path,"LOLACore")
  lola.db <- loadRegionDB(dir[[assembly]])
  return(lola.db)
}

#' lmc.lola.plots.tables
#' 
#' This functions calls \link{lmc.lola.enrichment} and returns plots representing those results, as well as the tables with LOLA
#' enrichment results.
#' 
#' @param medecom.result An object of type \code{\link{MeDeComSet-class}} or the location of an .RData file, where
#'                 such an object is stored.
#' @param annotation.filter A numeric vector specifying the sites that have been removed from \code{rnb.set} in a
#'                 preprocessing step (e.g. coverage filtering) or a path to an .RData file.
#' @param anno.data The original \code{\link[RnBeads]{RnBSet-class}} object containing methylation, sample meta and annotation
#'                 information, a path to a directory stored by \code{\link[RnBeads]{save.rnb.set}} or a data.frame containing
#'                 CpG annotations (ann_C)
#' @param K The number of LMCs specified for the MeDeCom run.
#' @param lambda The lambda parameter selected.
#' @param cg_subset The index of the selection strategy employed (e.g. most variable CpGs).
#' @param diff.threshold The difference cutoff between median methylation in the remaining LMCs and the LMC of interest
#'                  used to call a CpG differentially methylated. The higher this value, the more conservative the
#'                  selection.
#' @param region.type Region type used to annotate CpGs to potentially regulatory regions (see \url{https://rnbeads.org/regions.html})
#'                  for a list of available region types.
#' @param temp.dir Path to a directory used to store temporary files.
#' @param type Which direction is to be tested for enrichment. Can be one of "hypo", "hyper", or "differential"
#' @param assembly The assembly used. Needs to be one of "hg19", "hg38" or "mm10". Does not need to be specified, if rnb.set is a
#'                 \code{\link{RnBSet-class}}
#' @param lola.db A loaded LOLA database as loaded with LOLA::loadRegionDB. If this value is NULL, the database is loaded
#'                 automatically and stored in the temporary directory.
#' @param ... Further arguments passed to \code{lmc.lola.enrichment}
#' @return A list with two elements, one of them containing the plots for each LMC and the other for the corresponding LOLA
#'         enrichment tables
#' @seealso lmc.lola.enrichment
#' @author Michael Scherer                 
#' 
#' @export

lmc.lola.plots.tables <- function(medecom.result,
                           annotation.filter=NULL,
                           anno.data,
                           K=NULL,
                           lambda=NULL,
                           cg_subset=NULL,
                           diff.threshold=0.5,
                           region.type="ensembleRegBuildBPall",
                           temp.dir=tempdir(),
                           type="hypo",
                           assembly="hg19",
                           lola.db=NULL,
                           ...){
  enrichment.results <- lmc.lola.enrichment(medecom.result,
                                          annotation.filter=NULL,
                                          anno.data,
                                          K=NULL,
                                          lambda=NULL,
                                          cg_subset=NULL,
                                          diff.threshold=0.5,
                                          region.type="ensembleRegBuildBPall",
                                          temp.dir=tempdir(),
                                          type="hypo",
                                          assembly="hg19",
                                          lola.db=NULL,
                                          ...)
  lola.plots <- lapply(enrichment.results,do.lola.plot,lola.db)
  return(list(Plots=lola.plots,Tables=enrichment.results))
}

#' lmc.go.plots.tables
#' 
#' This functions calls \link{lmc.go.enrichment} and returns plots representing those results, as well as the tables with GO
#' enrichment results.
#' 
#' @param medecom.result An object of type \code{\link{MeDeComSet-class}} or the location of an .RData file, where
#'                 such an object is stored.
#' @param anno.data The original \code{\link[RnBeads]{RnBSet-class}} object containing methylation, sample meta and annotation
#'                 information, a path to a directory stored by \code{\link[RnBeads]{save.rnb.set}} or a data.frame containing
#'                 CpG annotations (ann_C)
#' @param ... Further arguments passed to \code{lmc.lola.enrichment}
#' @return A list with two elements, one of them containing the plots for each LMC and the other for the corresponding GO
#'         enrichment tables
#' @seealso lmc.go.enrichment
#' @author Michael Scherer                 
#' 
#' @export

lmc.go.plots.tables <- function(medecom.result,
                                  anno.data,
                                  ...){
  enrichment.results <- lmc.go.enrichment(medecom.result=medecom.result,
                                            anno.data=anno.data,
                                            ...)
  go.plots <- lapply(enrichment.results,do.go.plot)
  return(list(Plots=go.plots,Tables=enrichment.results))
}

#' do.lola.plot
#' 
#' This functions creates a single LOLA enrichment plot
#' 
#' @param enrichment.result LOLA enrichment result for a single LMC
#' @param lola.db LOLA database that was used
#' @param pvalCut P-value cutoff
#' @return An object of type ggplot, containing the enrichment plot
#' @author Michael Scherer
#' @noRd

do.lola.plot <- function(enrichment.result,lola.db,pvalCut=0.01){
  if(!is.na(enrichment.result)){
 	  plot <- lolaBarPlot(lolaDb = lola.db,lolaRes=enrichment.result,pvalCut=pvalCut)+theme_bw()+theme(axis.text.x = element_text(angle=45,hjust = 1))
  }else{
    plot <- ggplot(data.frame(x=c(0,1),y=c(0.1)))+geom_text(x=0.5,y=0.5,label="No data to be plotted")+theme_bw()
  }
  return(plot)
}

#' do.go.plot
#' 
#' This function creates a single GO enrichmnet plot
#' 
#' @param enrichment.result GO enrichment result for a single LMC
#' @param pvalCut P-value cutoff
#' @return An object of type ggplot, containing the enrichment plot
#' @author Michael Scherer
#' @noRd
 
do.go.plot <- function(enrichment.result, pvalCut=0.01){
  if(!is.na(enrichment.result)){
  	to.plot <- enrichment.result[enrichment.result$p.val.adj.fdr<pvalCut,]
  	if(nrow(to.plot)>0){
    	plot <- ggplot(to.plot,aes(x=Term,y=OddsRatio))+geom_histogram(stat = "identity")+theme_bw()+theme(axis.text.x = element_text(angle=45,hjust = 1))
  	}else{
  	  plot <- ggplot(data.frame(x=c(0,1),y=c(0.1)))+geom_text(x=0.5,y=0.5,label="No significant assocation found")+theme_bw()
	  }
  }else{
    plot <- ggplot(data.frame(x=c(0,1),y=c(0.1)))+geom_text(x=0.5,y=0.5,label="No data to be plotted")+theme_bw()
  }
  return(plot)
}

#' lmc.go.enrichment
#' 
#' This routine computes GO enrichment results for LMC-specifically hypo- or hypermethylated sites. 
#' 
#' @param medecom.result An object of type \code{\link{MeDeComSet-class}} or the location of an .RData file, where
#'                 such an object is stored.
#' @param annotation.filter A numeric vector specifying the sites that have been removed from \code{rnb.set} in a
#'                 preprocessing step (e.g. coverage filtering) or a path to an .RData file.
#' @param anno.data The original \code{\link[RnBeads]{RnBSet-class}} object containing methylation, sample meta and annotation
#'                 information or a path to a directory stored by \code{\link[RnBeads]{save.rnb.set}} or a data.frame containing
#'                 CpG annotations (ann_C)
#' @param K The number of LMCs specified for the MeDeCom run.
#' @param lambda The lambda parameter selected.
#' @param cg_subset The index of the selection strategy employed (e.g. most variable CpGs).
#' @param diff.threshold The difference cutoff between median methylation in the remaining LMCs and the LMC of interest
#'                  used to call a CpG differentially methylated. The higher this value, the more conservative the
#'                  selection.
#' @param reference.computation Metric used to set the reference on the remaining LMCs to determine hyper- and hypomethylated sites.
#'                  Can be either \code{"median"} (default), \code{"mean"}, or \code{"lmcs"} (\code{comp.lmcs} argument needs to be provided).
#' @param comp.lmcs Numeric vector containing two numbers representing the LMCs that should be compared to one another.                 
#' @param region.type Region type used to annotate CpGs to potentially regulatory regions (see \url{https://rnbeads.org/regions.html})
#'                  for a list of available region types. Here, only "genes" "promoters" and their gencode versions
#'                  are available.
#' @param temp.dir Path to a directory used to store temporary files.
#' @param type Which direction is to be tested for enrichment. Can be one of "hypo", "hyper", or "differential"
#' @param assembly The assembly used. Needs to be one of "hg19", "hg38" or "mm10". Does not need to be specified, if rnb.set is a
#'                 \code{\link{RnBSet-class}}
#' @return A list with K elements. One element is the enrichment result of the corresponding LMC-specific hypomethylated CpG sites.
#' @export
#' @details This function employs GO enrichment analysis with the GOstats package on the CpG sites that are LMC-specifically
#'                  hypomethylated, after annotating the sites to the closest promotor/gene defined by \code{region.type}.
#'                  The sites are selected by computing the median methylation value of the other LMCs and then selecting
#'                  those sites that are more than \code{diff.threshold} away from the median in the LMC of interest.
#'                  This is done for all LMCs from 1 to K.
#' @author Michael Scherer                                

lmc.go.enrichment <- function(medecom.result,
                                  annotation.filter=NULL,
                                  anno.data,
                                  K=NULL,
                                  lambda=NULL,
                                  cg_subset=NULL,
                                  diff.threshold=0.5,
                                  reference.computation="median",
                                  comp.lmcs=NULL,
                                  region.type="genes",
                                  temp.dir=tempdir(),
                                  type="hypo",
                                  assembly="hg19"){
  #require("RnBeads")
  require("GOstats")
  rnb.mode <- F
  if(is.null(cg_subset)){
    cg_subset <- medecom.result@parameters$cg_subsets[1]
  }else if(!(cg_subset %in% medecom.result@parameters$cg_subsets)){
    stop("Invalid value for cg_subset; not available in medecom.result")
  }
  if(is.null(K)){
    K <- medecom.result@parameters$Ks[1]
  }else if(!(K %in% medecom.result@parameters$Ks)){
    stop("Invalid value for K; not available in medecom.result")
  }
  if(is.null(lambda)){
    lambda <- medecom.result@parameters$lambdas[1]
  }else if(!(lambda %in% medecom.result@parameters$lambdas)){
    stop("Invalid value for lambdas; not available in medecom.result")
  }
  if(inherits(anno.data,"RnBSet")){
    rnb.mode <- T
    assembly <- assembly(anno.data)
  }
  if(is.character(medecom.result)){
    new.envi <- new.env()
    load(medecom.result,envir = new.envi)
    medecom.result <- get(ls(envir = new.envi),envir = new.envi)
  }
  if(is.character(annotation.filter)){
    new.envi <- new.env()
    load(annotation.filter,envir = new.envi)
    annotation.filter <- get(ls(envir = new.envi),envir = new.envi)
  }
  if(!reference.computation %in% c("median","mean","lmcs")){
    stop("Invalid value for reference.computation: Only mean and median are supported")
  }else{
    if(reference.computation %in% "median"){
      ref.func <- rowMedians
    }else if(reference.computation %in% "mean"){
      ref.func <- rowMeans
    }else{
      if(any(comp.lmcs > K) || length(comp.lmcs) != 2){
        stop("Invalid value for comp.lmcs: Should specify two LMC used for comparison")
      }
    }
  }
  if(is.character(anno.data)){
    options(fftempdir=temp.dir)
    anno.data <- load.rnb.set(anno.data)
  }
  if(!region.type %in% rnb.region.types()){
    rnb.load.annotation.from.db(region.type,assembly=assembly)
  }
  agg.type <- unlist(rnb.get.annotation(region.type,assembly=assembly))
  entrez.id <- values(agg.type)$entrezID
  longer <- lengths(strsplit(entrez.id,";"))>0
  entrez.id[longer] <- unlist(lapply(strsplit(entrez.id,";"),function(x)x[1]))[longer]
  if(rnb.mode){
    anno <- annotation(anno.data)
  }else{
    anno <- anno.data
  }
  if(!is.null(annotation.filter)){
    anno <- anno[annotation.filter,]
  }
  if(!is.null(medecom.result@parameters$GROUP_LISTS)){
    sset <- medecom.result@parameters$GROUP_LIST[[cg_subset]]
    anno <- anno[sset,]
  }
  anno <- GRanges(Rle(anno$Chromosome),IRanges(start=anno$Start,end=anno$End))
  op <- findOverlaps(anno,agg.type)
  agg.type <- agg.type[subjectHits(op)]
  entrez.id <- entrez.id[subjectHits(op)]
  lmcs <- getLMCs(medecom.result,cg_subset=cg_subset,K=K,lambda=lambda)
  go.results <- list()
  for(i in 1:K){
    if(reference.computation %in% "lmcs"){
      first.lmc <-  lmcs[,comp.lmcs[1]]
      ref.lmc <- lmcs[,comp.lmcs[2]]
    }else{
      first.lmc <- lmcs[,i]
      ref.lmc <- ref.func(lmcs[,-i,drop=F])
    }
    lmc.diff <- ref.lmc - first.lmc
    if(type=="hypo"){
      is.hypo <- lmc.diff > diff.threshold
    }else if (type=="hyper"){
      is.hypo <- lmc.diff < -diff.threshold
    }else if (type=="differential"){
      is.hypo <- (lmc.diff < -diff.threshold) | (lmc.diff > diff.threshold)
    }else{
      stop("Invalid value for type, needs to be one of 'hypo', 'hyper' or 'differential'")
    }
    first.sites <- anno[is.hypo]
    op <- findOverlaps(first.sites,agg.type)
    first.ids <- entrez.id[subjectHits(op)]
    go.res <- NA
    if(length(first.ids)>0){
      if(assembly %in% c("hg19","hg38")){
        params <- new("GOHyperGParams",annotation="org.Hs.eg.db",geneIds = first.ids, universeGeneIds = entrez.id, ontology = "BP",conditional = TRUE, testDirection = "over")
      }else if (assembly %in% "mm10"){
        params <- new("GOHyperGParams",annotation="org.Mm.eg.db",geneIds = first.ids, universeGeneIds = entrez.id, ontology = "BP",conditional = TRUE, testDirection = "over")
      }
      test.result.hyper <- tryCatch(
        hyperGTest(params),
        error = function(e) {
          print(e)
        }
      )
      if(!inherits(test.result.hyper,"error")){
        go.res <- GOstats::summary(test.result.hyper)
        if(!is.null(dim(go.res)) && !is.null(go.res) && nrow(go.res)>0){
          go.res$p.val.adj.fdr <- p.adjust(go.res$Pvalue,method="fdr",n=length(test.result.hyper@pvalue.order))
        }else{
          go.res <- NA
        }
      }else{
        go.res <- NA
      }
    }
    comp <- paste0("LMC",i)
    go.results[[comp]] <- go.res
    print(paste("LMC",i,"processed"))
    if(reference.computation %in% "lmcs"){
      break
    }
  }
  return(go.results)
}
lutsik/MeDeCom documentation built on Feb. 15, 2023, 11:32 a.m.