R/EpiMutations.R

Defines functions empty_bumps format_bumps select_outlier_bumps compute_bump_outlier_scores filter_bumps run_bumphunter make_bumphunter_design filter_set set_concat check_params epimutations_per_sample epimutations

Documented in format_bumps run_bumphunter

#' Return epimutations as per Aref-Eshghi et al, 2020.
#'
#' @param cases (GenomicRatioSet, ExpressionSet) Case dataset.
#' @param controls (GenomicRatioSet, ExpressionSet) Control dataset. Optional.
#' @param sample_ids (character vector) The column names in cases to compute 
#' epimutations for.
#' If missing, computes for all samples in cases.
#' @param cases_as_controls (bool) If True, for each case, all other cases are 
#' added in controls. If False, they are ignored.
#' @param args.bumphunter (list) Additional arguments to pass to 
#' \link[bumphunter]{bumphunter}.
#' @param num.cpgs (integer) Epimutations containing fewer cpgs than num.cpgs 
#' are discarded.
#' @param pValue.cutoff (numeric) If method is "manova" or "mlm", only keep
#' epimutations with a p-value inferior to pValue.cutoff. 
#' @param fStat_min (numeric) If method is "manova", only keep
#' epimutations with a F-statistic superior to fStat_min.
#' @param betaDiff_min (numeric) If method is "manova", only keep epimutations
#' with an absolute average regional methylation difference superior to betaDiff_min.
#' @param outlier.score (numeric) If method is "iso.forest", only keep
#' epimutations with an outlier_score superior to outlier.score.
#' @param nsamp (string) If method is "Mahdist.MCD", the sample subsetting
#' strategy: "best", "exact" or "deterministic". See \code{\link{epiMahdist.MCD}}.
#' @param method (string) The outlier scoring method. Choose from 
#' "manova", "mlm", "iso.forest", "Mahdist.MCD".
#'
#' @return A tibble of epimutation regions for sample_id.
#' \describe{
#' \item{sample}{(string) Sample_id in colnames(cases)}
#' \item{chr}{(string) Chromosome}
#' \item{start}{(integer) Start position on chromosome}
#' \item{end}{(integer) End position on chromosome}
#' \item{length}{(integer) Region length on chromosome}
#' \item{n_cpgs}{(integer) Number of CpGs spanned by region}
#' \item{cpg_ids}{(string) Comma-separated CpG_ids in rownames(cases) spanned by region}
#' \item{outlier_method}{(string) The outlier scoring method. Either
#' "manova", "mlm", "iso.forest" or "Mahdist.MCD".}
#' \item{outlier_score}{(numeric) The outlier score. A F-statistic for methods 
#' "manova" or "mlm". A score produced by \code{\link[isotree]{isolation.forest}} 
#' for method "iso.forest". A boolean for method "Mahdist.MCD".}
#' \item{outlier_significance}{(numeric) The outlier significance statistic.
#' P(>F-statistic) for methods "manova" or "mlm". NA for all other methods.}
#' }
#' @export
#'
#' @examples
#' data("genomicratioset") # load toy dataset
#' epi <- epimutations(genomicratioset)
epimutations <- function(
  cases,
  controls,
  sample_ids,
  cases_as_controls = T,
  args.bumphunter = list(cutoff=0.1),
  num.cpgs = 3,
  pValue.cutoff = 0.01,
  fStat_min = 20,
  betaDiff_min = 0.2,
  outlier.score = 0.5,
  nsamp = "deterministic",
  method = "manova",
  reduced_output = T
) {
  
  check_params(cases, controls, method, cases_as_controls, sample_ids)

  set <- set_concat(cases, controls)
  
  if(missing(sample_ids)){
    sample_ids <- colnames(cases)
  }
  epis <- lapply(
    sample_ids,
    function(sample_id) {
      epimutations_per_sample(
        set, sample_id, cases_as_controls, args.bumphunter, num.cpgs,
        pValue.cutoff, fStat_min, betaDiff_min, outlier.score, nsamp, method, reduced_output
      )
    }
  )
  epis <- do.call(rbind, epis)
  
  return(epis)
}

epimutations_per_sample <- function(
  set,
  sample_id,
  cases_as_controls = T,
  args.bumphunter = list(cutoff=0.1),
  num.cpgs = 3,
  pValue.cutoff = 0.01,
  fStat_min = 20,
  betaDiff_min = 0.2,
  outlier.score = 0.5,
  nsamp = "deterministic",
  method = "manova",
  reduced_output = T
){
  set <- filter_set(set, sample_id, cases_as_controls)
  
  design <- make_bumphunter_design(set, sample_id)
  bumps <- do.call(run_bumphunter,
                   c(list(set=set, design=design), args.bumphunter))
  
  bumps <- filter_bumps(bumps, min_cpgs_per_bump=num.cpgs)
  bumps <- compute_bump_outlier_scores(set, bumps, method, sample_id, design, nsamp)
  bumps <- select_outlier_bumps(bumps, method, pValue.cutoff, fStat_min, betaDiff_min, outlier.score)
  
  epi <- format_bumps(bumps, set, sample_id, method, reduced_output)
  
  return(epi)
}


check_params <- function(cases, controls, method, cases_as_controls, sample_ids){

  if(missing(cases)) {
    stop("Cases argument is missing.")
  }
  if(missing(controls) && !cases_as_controls){
    stop("If controls is missing, cases_as_controls must be set to TRUE.")
  }
  if(!class(cases) %in% c("GenomicRatioSet", "ExpressionSet")) {
    stop("Cases must be of class 'GenomicRatioSet' or 'ExpressionSet'")  
  }
  if(missing(controls) && ncol(cases) < 3){
    stop("If controls is missing, cases must contain at least 3 samples.")
  }
  if(!missing(controls) && ncol(cases) == 0){
    stop("Cases must contain at least 1 sample.")
  }
  if(!missing(controls) && !class(controls) %in% c("GenomicRatioSet", "ExpressionSet")) {
    stop("Controls must be of class 'GenomicRatioSet' or 'ExpressionSet'")  
  }
  if(!missing(controls) && ncol(controls) < 2) {
    stop("Controls must contain at least 2 samples.")
  }
  if(length(method) != 1) {
    stop("Only one method can be chosen at a time.")
  }
  if(!method %in% c("manova", "mlm", "iso.forest", "Mahdist.MCD")) {
    stop("Method must be 'manova', 'mlm','iso.forest','Mahdist.MCD'")  
  }
  if(!missing(sample_ids) && any(!sample_ids %in% colnames(cases))){
    stop("Sample_ids must match column names in the cases object.")  
  }
    n_controls <- ifelse(missing(controls), 0, ncol(controls))
    if(cases_as_controls) n_controls <- n_controls + ncol(cases) - 1
    if(method == "manova" && n_controls < 10){
        warning("At least 10 control samples are recommended for the manova method.")
    }
}

set_concat <- function(cases, controls){
  Biobase::pData(cases)[["origin"]] <- "case"
  if(missing(controls)){
    return(cases)
  }
  Biobase::pData(controls)[["origin"]] <- "control"
  if(class(cases) == "GenomicRatioSet") {
    set <- minfi::combineArrays(
      controls, cases,
      outType = c("IlluminaHumanMethylation450k",
        "IlluminaHumanMethylationEPIC",
        "IlluminaHumanMethylation27k"),
        verbose = TRUE
    )
  } else if (class(cases) == "ExpressionSet") {
    set <- a4Base::combineTwoExpressionSet(controls, cases)
  }
  return(set)
}

filter_set <- function(set, sample_id, cases_as_controls){
  if(!cases_as_controls){
    mask <- (
      Biobase::pData(set)[["origin"]] == "control" 
        | colnames(set) == sample_id
    )
    set <- set[, mask]
  }
  return(set)
}

make_bumphunter_design <- function(set, sample_id){
  # model is single sample, no covariates: 0,0,0,0...0,0,1
  Biobase::pData(set)$samp <- colnames(set) == sample_id
  return(stats::model.matrix(~samp, Biobase::pData(set)))
}

#' Run the Bumphunter algorithm.
#'
#' See \link[bumphunter]{bumphunter} doc for details.
#'
#' @keywords internal
#' 
#' @param set A GenomicRatioSet or ExpressionSet
#' @param design Design matrix with rows representing samples and columns 
#' representing covariates. Regression is applied to each row of set.
#' @param ... Extra arguments to pass to \link[bumphunter]{bumphunter} 
#'
#' @return The object returned by \link[bumphunter]{bumphunter}, or if no bumps
#' are found, a mock bumps object with zero rows returned by \code{empty_bumps}.
run_bumphunter <- function(set, design, ...){
	if(class(set) == "GenomicRatioSet") {
		bumps <- bumphunter::bumphunter(set, design, ...)$table
	} else if (class(set) == "ExpressionSet") {
		bumps <- bumphunter::bumphunter(
		    object = Biobase::exprs(set),
		    design = design,
		    pos = Biobase::fData(set)$RANGE_START,
		    chr = Biobase::fData(set)$CHR,
		    ...
		)$table
	}
	if(length(bumps) == 1 && is.na(bumps)){ # bumphunter found no bumps
		bumps <- empty_bumps()
	}
	return(bumps)
}

filter_bumps <- function(bumps, min_cpgs_per_bump){
  bumps <- subset(bumps, L >= min_cpgs_per_bump)
  return(bumps)
}

compute_bump_outlier_scores <- function(set, bumps, method, sample, model, nsamp){
  bumps$outlier_score <- bumps$outlier_significance <- rep(NA_real_, nrow(bumps))
  if(method == "manova") bumps$beta_diff <- rep(NA_real_, nrow(bumps))
  for(i in seq_len(nrow(bumps))) {
    betas <- get_betas(bumps[i, ], set)
    if(method == "manova") {
        manova_has_enough_samples <- nrow(betas) >= ncol(betas) + 2
      if(manova_has_enough_samples) {
          stats_manova <- epi_manova(betas, model, sample)
          bumps$outlier_score[i] <- stats_manova["approx F"]
          bumps$outlier_significance[i] <- stats_manova["Pr(>F)"]
          bumps$beta_diff[i] <- stats_manova["beta_mean_abs_diff"]
      }
    } else if(method == "mlm") {
        stats_mlm <- epiMLM(betas, model)
      bumps$outlier_score[i] <- stats_mlm["F value"]
      bumps$outlier_significance[i] <- stats_mlm["Pr(>F)"]
    } else if(method == "iso.forest") {
      bumps$outlier_score[i] <- epiIsolationForest(betas, sample)
    } else if(method == "Mahdist.MCD") {
      bumps$outlier_score[i] <- epiMahdist.MCD(betas, nsamp, sample)
    }
  }
  
  return(bumps)
}

select_outlier_bumps <- function(bumps, method, pValue.cutoff, fStat_min, betaDiff_min, outlier.score){

    if(method == "manova"){
        outliers <- subset(
            bumps,
            outlier_score >= fStat_min &
                beta_diff >= betaDiff_min &
                outlier_significance < pValue.cutoff 
        )
    } else if(method == "mlm"){
        outliers <- subset(bumps, outlier_significance < pValue.cutoff)
    } else if(method == "iso.forest"){
        outliers <- subset(bumps, outlier_score > outlier.score)
    } else if(method == "Mahdist.MCD"){
        outliers <- subset(bumps, outlier_score == TRUE)
    }
  
  return(outliers)
}

#' Convert bumps object to a tibble with additional formatting.
#'
#' @keywords internal
#'
#' @param bumps (bumps) Bump object returned by \link[bumphunter]{bumphunter}.
#' @param set (GenomicRatioSet, ExpressionSet) The dataset on which the bumps
#' were computed.
#' @param sample (string) The sample name for all the bumps.
#' @param reduced (bool) If True, only return a limited number of columns.
#' If False, return a full set of columns.
#'
#' @return A tibble dataframe where each row is an epimutation.
format_bumps <- function(bumps, set, sample, method, reduced){
	df_out <- tibble::as_tibble(bumps)
	df_out$sample <- df_out$outlier_method <- character(nrow(df_out))
	if(nrow(bumps) > 0){
		df_out$sample <- sample
		df_out$outlier_method <- method
	}
	df_out$length <- df_out$end - df_out$start + 1
	df_out$n_cpgs <- df_out$indexEnd - df_out$indexStart + 1
	df_out$cpg_ids <- mapply(
		function(rown, i_st, i_end){paste(rown[i_st:i_end], collapse=",")},
		df_out$indexStart,
		df_out$indexEnd,
		MoreArgs = list(rown = rownames(set))
	)
	if(reduced){
		reduced_col <- c("sample", "chr", "start", "end", "length", "n_cpgs", "cpg_ids",
		                 "outlier_method", "outlier_score", "outlier_significance")
		df_out <- df_out[, reduced_col]
	}
	return(df_out)
}

empty_bumps <- function(){
	bumps <- tibble::tibble(
		"chr" = character(),
		"start" = integer(),
		"end" = integer(),
		"value" = numeric(),
		"area" = numeric(),
		"cluster" = numeric(),
		"indexStart" = integer(),
		"indexEnd" = integer(),
		"L" = numeric()
	)
	return(bumps)
}
yocra3/epimutacions documentation built on Jan. 1, 2021, 1:46 p.m.