#' prepare.CG.subsets
#'
#' This routine selects a subset of CpGs sites used for MeDeCom analysis. Different selection methods are supported.
#'
#' @name prepare.CG.subsets
#'
#' @param meth.data A \code{matrix} or \code{data.frame} containing methylation information. If NULL, methylation information needs to be provided
#' through \code{rnb.set}
#' @param rnb.set An object of type \code{\link{RnBSet-class}} containing methylation, sample and optional coverage information.
#' @param marker.selection A vector of strings representing marker selection methods. Available method are \itemize{
#' \item{"\code{all}"} Using all sites available in the input.
#' \item{"\code{pheno}"} Selected are the top \code{n.markers} site that differ between the phenotypic
#' groups defined in data preparation or by \code{\link{rnb.sample.groups}}. Those are
#' selected by employing limma on the methylation matrix.
#' \item{"\code{houseman2012}"} 50k sites determined to be cell-type specific for blood cell types using the Houseman's reference-
#' based deconvolution and the Reinius et al. reference data set. See Houseman et.al. 2012 and Reinis et.al. 2012.
#' NOTE: This option should only be used for whole blood data generated using the 450k array.
#' \item{"\code{houseman2014}"} Selects the sites said to be linked to cell type composition by \code{RefFreeEWAS},
#' which is similar to surrogate variable analysis. See Houseman et.al. 2014.
#' \item{"\code{jaffe2014}"} The 600 sites stated as related to cell-type composition Jaffe et.al. 2014.
#' NOTE: This option should only be used for whole blood data generated using the 450k array.
#' \item{"\code{rowFstat}"} Markers are selected as those found to be associated to the reference cell
#' types with F-statistics. If this option is selected, \code{ref.rnb.set} and \code{ref.pheno.column}
#' need to be specified.
#' \item{"\code{random}"} Sites are randomly selected.
#' \item{"\code{pca}"} Sites are selected as those with most influence on the principal components.
#' \item{"\code{var}"} Selects the most variable sites.
#' \item{"\code{hybrid}"} Selects (n.markers/2) most variable and (n.markers/2) random sites.
#' \item{"\code{range}"} Selects the sites with the largest difference between minimum and maximum
#' across samples.
#' \item{"\code{pcadapt}"} Uses principal component analysis as implemented in the \code{"bigstats"}
#' R package to determine sites that are significantly linked to the potential cell types. This
#' requires specifying K a priori (argument \code{K.prior}). We thank Florian Prive and Sophie
#' Achard for providing the idea and parts of the codes.
#' \item{"\code{edec_stage0}} Employs EDec's stage 0 to infer cell-type specific markers. By default
#' EDec's example reference data is provided. If a specific data set is to be provided, it needs
#' to be done through \code{ref.rnb.set}.
#' \item{"\code{custom}"} Specifying a custom file with indices.
#' }
#' @param n.markers The number of sites to be selected. Defaults to 5000.
#' @param remove.correlated Flag indicating if highly correlated sites are to be removed
#' @param cor.threshold Numeric indicating a threshold above which sites are not to be considered in the feature selection.
#' If \code{"quantile"}, sites correlated higher than the 95th quantile are removed.
#' @param write.files Flag indicating if the selected sites are to be stored on disk.
#' @param out.dir Path to the working directory used for analyis, or data preparation.
#' @param ref.rnb.set An object of type \code{\link{RnBSet-class}} or a path to such an object stored on disk,
#' if \code{rowFstat} is selected.
#' @param ref.pheno.column Optional argument stating the column name of the phenotypic table of \code{ref.rnb.set} with
#' the reference cell type.
#' @param n.prin.comp Optional argument deteriming the number of prinicipal components used for selecting the most important sites.
#' @param range.diff Optional argument specifying the difference between maximum and minimum required.
#' @param custom.marker.file Optional argument containing an absolute path to a file that specifies the indices used for employing MeDeCom. Can be provided
#' either as an \code{RDS} file containing a vector of indices to select or as a \code{txt, csv, tsv} file containing each index
#' to be selected as a single row.
#' @param store.heatmaps Flag indicating if a heatmap of the selected input sites is to be create from the input methylation matrix.
#' The files are then stored in the 'heatmaps' folder in out.dir.
#' @param heatmap.sample.col Column name in the phenotypic table of \code{rnb.set}, used for creating a color scheme in the heatmap.
#' @param K.prior K determined from visual inspection. Only has an influence, if \code{marker.selection="pcadapt"}.
#' @return List of indices, one entry for each marker selection method specified by \code{marker.selection}. The indices correspond
#' to the sites that should be used in \code{rnb.set}.
#' @details For methods "\code{houseman2012}" and "\code{jaffe2014}", a predefined set of markers is used. Since those correspond to
#' absolute indices on the chip, the provided \code{rnb.set} must not be preprocessed and therefore still contain all sites.
#' For the other metods, you may used \code{\link{prepare.data}} to filter sites for quality and context.
#' @references \itemize{
#' \item{1.} Houseman, E. A., Accomando, W. P., Koestler, D. C., Christensen, B. C., Marsit, C. J., Nelson, H. H., ..., Kelsey, K.
#' T. (2012). DNA methylation arrays as surrogate measures of cell mixture distribution. BMC Bioinformatics, 13.
#' \item{2.} Reinius, L. E., Acevedo, N., Joerink, M., Pershagen, G., Dahlen, S. E., Greco, D., ..., A., & Kere, J.
#' (2012). Differential DNA methylation in purified human blood cells: Implications for cell lineage and studies on disease susceptibility.
#' PLoS ONE, 7(7). https://doi.org/10.1371/journal.pone.0041361
#' \item{3.} Houseman, E. A., Molitor, J., & Marsit, C. J. (2014). Reference-free cell mixture adjustments in analysis of
#' DNA methylation data. Bioinformatics, 30(10), 1431-1439. https://doi.org/10.1093/bioinformatics/btu029
#' \item{4.} Jaffe, A. E., & Irizarry, R. A. (2014). Accounting for cellular heterogeneity is critical in epigenome-wide
#' association studies. Genome Biology, 15(2), R31. https://doi.org/10.1186/gb-2014-15-2-r31
#'
#' }
#' @export
#' @author Michael Scherer, Pavlo Lutsik
#' @import limma
#' @import bigstatsr
#' @import robust
#' @import genefilter
prepare.CG.subsets<-function(
meth.data=NULL,
rnb.set=NULL,
marker.selection,
n.markers=5000,
remove.correlated=FALSE,
cor.threshold="quantile",
write.files=FALSE,
out.dir=NA,
ref.rnb.set=NULL,
ref.pheno.column=NULL,
n.prin.comp=10,
range.diff=0.05,
custom.marker.file="",
store.heatmaps=F,
heatmap.sample.col=NULL,
K.prior=NULL
)
{
cg.groups<-list()
groups<-1:length(marker.selection)
if(is.null(meth.data)){
if(inherits(rnb.set,"RnBSet")){
meth.data<-meth(rnb.set)
}else{
stop("Invalid value for rnb.set, needs to be RnBSet")
}
}
if(!is.data.frame(meth.data)&&!is.matrix(meth.data)){
stop("Invalid value for meth.data")
}
if(store.heatmaps){
if(!is.null(heatmap.sample.col)){
if(!heatmap.sample.col %in% colnames(pheno(rnb.set))){
logger.error("heatmap.sample.col not a column name of the phenotypic table")
}
trait <- as.factor(pheno(rnb.set)[,heatmap.sample.col])
palette <- rainbow(length(levels(trait)))
sample.cols <- palette[as.integer(trait)]
}else{
sample.cols <- NULL
}
if(!dir.exists(file.path(out.dir,"heatmaps"))){
dir.create(file.path(out.dir,"heatmaps"))
}
}
if(n.markers>nrow(meth.data)){
logger.error("You cannot select more markers than you have sites")
}
ind.all <- 1:nrow(meth.data)
if(remove.correlated){
if(inherits(rnb.set,'RnBSet')){
logger.start("Removing highly correlated features")
if(!isImputed(rnb.set)){
meth.data <- rnb.execute.imputation(meth.data)
}
logger.start("Correlation matrix computation for chromosomes separately")
anno.set <- annotation(rnb.set)
all.cor <- rep(NA,nrow(anno.set))
for(chr in unique(anno.set$Chromosome)){
to.cor <- meth.data[anno.set$Chromosome %in% chr,]
if(any(is.na(to.cor))){
logger.error(paste("Missing values in correlation for chromosome",chr))
}
cor.mat <- abs(cor(t(to.cor)))
diag(cor.mat) <- NA
max.cor <- apply(cor.mat,1,mean,na.rm=T)
all.cor[anno.set$Chromosome %in% chr] <- max.cor
}
logger.completed()
logger.start("removing features")
if(!is.numeric(cor.threshold)){
if(cor.threshold != "quantile"){
logger.error("cor.threshold only allows numeric values and 'quantile'")
}
cor.threshold <- quantile(all.cor,0.95,na.rm = T)
}
logger.completed()
if(cor.threshold < 0 | cor.threshold > 1){
logger.error("cor.threshold needs to be between 0 and 1")
}
logger.info(paste("cor.threshold is",cor.threshold))
logger.info(paste("Removed ",sum(all.cor>cor.threshold),"sites in correlation filtering"))
ind.all <- intersect(ind.all,which(all.cor<=cor.threshold))
logger.completed()
}else{
logger.warning("Removing highly correlated features only applicable for 'RnBSet' input")
}
}
for(group in groups){
ind<-ind.all
if(marker.selection[group]=="all"){
#Do nothing
ind <- ind
}
if(marker.selection[group]=="pheno"){
if(!requireNamespace('limma')){
stop("Missing required package 'limma' for option 'pheno'. Please install it.")
}
if(file.exists(sprintf("%s/pheno.RData",out.dir))){
load(sprintf("%s/pheno.RData",out.dir))
sel.columns <- gsub(" ","_",colnames(pheno.data))
}else{
pheno.data <- pheno(rnb.set)
sel.columns <- gsub(" ","_",names(rnb.sample.groups(rnb.set)))
}
X <- meth.data
ncgs<-nrow(X)
X <- na.omit(X)
colnames(pheno.data) <- gsub(" ","_",colnames(pheno.data))
na.pheno <- lapply(sel.columns,function(x)is.na(pheno.data[,x]))
rem.samples <- rep(FALSE,ncol(X))
for(entry in na.pheno){
rem.samples <- rem.samples | entry
}
X <- X[,!rem.samples]
formula.text <- paste0("~0+", paste(sel.columns,collapse="+"))
design <- model.matrix(as.formula(formula.text), data=pheno.data)
fit <- lmFit(X,design)
fit <- eBayes(fit)
tstatDeltaAll<-abs(fit$t)
ranks<-apply(tstatDeltaAll, 2, rank)
maxRank<-apply(ranks, 1, max)
ind<-ind[order(maxRank)[1:min(n.markers, length(ind))]]
}
if(marker.selection[group]=="houseman2012" ){
logger.warning("The houseman 2012 method should only be used for blood data sets generated on the 450k array.")
loc <- system.file(file.path("extdata","houseman.50k.markers.RDS"),package="DecompPipeline")
if(file.exists(loc)){
houseman.50k.markers<-readRDS(loc)
ind<-intersect(ind, houseman.50k.markers)
}
}
if(marker.selection[group]=="houseman2014"){
stop('RefFreeEWAS is currently not available from CRAN!')
# if(!requireNamespace('RefFreeEWAS')){
# stop("Missing required package 'RefFreeEWAS'. Please install it from 'https://cran.r-project.org/web/packages/RefFreeEWAS/index.html'.")
# }
# if(file.exists(sprintf("%s/pheno.RData",out.dir))){
# load(sprintf("%s/pheno.RData",out.dir))
# sel.columns <- gsub(" ","_",colnames(pheno.data))
# }else{
# pheno.data <- pheno(rnb.set)
# sel.columns <- gsub(" ","_",names(rnb.sample.groups(rnb.set)))
# }
#
# X <- meth.data
# ncgs<-nrow(X)
# X <- na.omit(X)
# colnames(pheno.data) <- gsub(" ","_",colnames(pheno.data))
# na.pheno <- lapply(sel.columns,function(x)is.na(pheno.data[,x]))
# rem.samples <- rep(FALSE,ncol(X))
# for(entry in na.pheno){
# rem.samples <- rem.samples | entry
# }
# X <- X[,!rem.samples]
# pheno.data <- pheno.data[!rem.samples,]
# level.problem <- unlist(lapply(sel.columns,function(x,mat){
# length(unique(mat[,x])) == length(levels(mat[,x]))
# },pheno.data))
# sel.columns <- sel.columns[level.problem]
# if(length(sel.columns)>0){
# formula.text <- paste0("~0+", paste(sel.columns,collapse="+"))
# }else{
# stop("Insufficient phenotypic information present")
# }
# design <- model.matrix(as.formula(formula.text), data=pheno.data)
# tmpBstar <- (X %*% design %*% solve(t(design)%*%design))
#
# ## rescaling the residuals, and addition by Andres
# R <- X-tmpBstar %*% t(design)
# rescale.residual<-TRUE
# if(rescale.residual){
# R <- t(scale(t(R)))
# }
# d<-EstDimRMT(R)$dim
#
# logger.info(c("Estimated number of latent components is", d))
# rf <- RefFreeEwasModel(X, design, d)
#
# logger.status("Fitted the RefFreeEWAS model")
#
# Delta <- rf$Bstar-rf$Beta
#
# nboot<-100
# #### compute se delta from a bootstrap
# rfBoot <- BootRefFreeEwasModel(rf,nboot)
# logger.status("Pefrormed the bootstrap")
#
# seDelta <- apply(rfBoot[,,"B*",]-rfBoot[,,"B",], 1:2, sd)
# tstatDelta <- -abs(Delta)/seDelta
# ncgs <- nrow(meth.data)
# nnas <- apply(is.na(meth.data),1,sum)
# notna.rows <- which(nnas==0)
# if(is.null(dim(tstatDelta))){
# tstatDeltaAll<-rep(NA, ncgs)
# tstatDeltaAll[notna.rows]<-tstatDelta
# }else{
# tstatDeltaAll<-matrix(NA_real_, ncol=ncol(tstatDelta), nrow=ncgs)
# tstatDeltaAll[notna.rows,]<-tstatDelta
# }
# if(is.null(dim(tstatDelta))){
# ind<-ind[order(-tstatDeltaAll)[1:min(10000, length(ind))]]
# }else{
# ranks<-apply(-tstatDeltaAll, 2, rank)
# maxRank<-apply(ranks, 1, max)
# ind<-ind[order(maxRank)[1:min(n.markers, length(ind))]]
# }
}
if(marker.selection[group]=="jaffe2014"){
logger.warning("The jaffe 2014 method should only be used for blood data sets generated on the 450k array.")
jaffe.markers <- readRDS(system.file(file.path("extdata","jaffe.irrizzary.markers.600.RDS"),package="DecompPipeline"))
ind<-intersect(ind, jaffe.markers)
}
if(marker.selection[group]==("rowFstat")){
####### This needs to be checked: it doesn't seem to do what it should
if(!requireNamespace('genefilter')){
stop("Missing required package 'genefilter'. Please install.")
}
if(!(is.null(ref.rnb.set) || is.null(ref.pheno.column))){
load.env<-new.env(parent=emptyenv())
if(inherits(ref.rnb.set,"RnBSet")){
rnb.ref.set <- ref.rnb.set
}else{
rnb.ref.set <- load.rnb.set(ref.rnb.set)
}
meth.data.ref <- meth(rnb.ref.set)
pheno.data.ref <- pheno(rnb.ref.set)
marker.fstat<-rowFtests(meth.data.ref[ind,], as.factor(pheno.data.ref[[ref.pheno.column]]))
subset<-which(rank(-marker.fstat$statistic)<=n.markers)
ind<-ind[subset]
}else{
logger.error("ref.rnb.set and ref.pheno.column need to be specified, if rowFstat is selected.")
}
}
if(marker.selection[group]=="random"){
subset<-sample.int(length(ind), min(n.markers, length(ind)))
ind<-ind[subset]
}
if(marker.selection[group]=="pca"){
meth.data.subs<-meth.data[ind,]
pca<-prcomp(t(meth.data.subs))
rot<-pca$rotation[,1:min(n.prin.comp,ncol(pca$rotation))]
pca.ind<-integer()
add.sites <- ceiling(n.markers/n.prin.comp)
for(cix in 1:min(n.prin.comp,ncol(pca$rotation))){
pca.ind<-union(pca.ind,order(abs(rot[,cix]), decreasing = TRUE)[1:add.sites])
}
ind<-ind[sort(pca.ind)]
}
if(marker.selection[group] == "var"){
sds<-apply(meth.data[ind,], 1, sd)
ind<-ind[order(sds, decreasing=TRUE)[1:min(length(ind),n.markers)]]
}
if(marker.selection[group] == "hybrid"){
sds<-apply(meth.data[ind,], 1, sd)
var.set<-ind[order(sds, decreasing=TRUE)[1:min(length(ind),ceiling(n.markers/2))]]
random.set<-sample(setdiff(ind,var.set), floor(n.markers/2))
ind<-sort(c(var.set, random.set))
rm(var.set)
rm(random.set)
}
if(marker.selection[group] == "range"){
ranges<-apply(meth.data[ind,], 1, range)
ranges<-ranges[2,]-ranges[1,]
ind<-ind[ranges>range.diff]
rm(ranges)
}
if(marker.selection[group] == "pcadapt"){
if(!requireNamespace('bigstatsr')||!requireNamespace('robust')){
stop("Missing required package 'bigstatsr' or 'robus'. Please install.")
}
if(is.null(K.prior)){
stop("K.prior needs to be specific for pcadapat marker selection method")
}
meth.t <- t(meth.data[ind,])
meth.t <- as_FBM(meth.t)
svd <- big_SVD(meth.t,fun.scaling = big_scale(),k=K.prior)
singular.vectors <- svd$u
z.scores <- sapply(cols_along(singular.vectors), function(k) {
big_univLinReg(meth.t, singular.vectors[, k])$score
})
dist.math <- robust::covRob(z.scores, estim = "pairwiseGK")$dist
lpval <- pchisq(dist.math, df = K.prior, lower.tail = FALSE, log.p = TRUE) / log(10)
ind <- ind[order(lpval,decreasing = F)[1:n.markers]]
}
if(marker.selection[group]=="edec_stage0"){
if(!requireNamespace("EDecExampleData")||!requireNamespace("EDec")){
stop("Missing required package 'EDec' or 'EDecExampleData'. Please install for using 'edec_stage0'")
}
if(is.null(ref.rnb.set)){
markers <- EDec::run_edec_stage_0(reference_meth = EDecExampleData::reference_meth,
reference_classes = EDecExampleData::reference_meth_class,
max_p_value = 1e-5,
num_markers = n.markers)
}else{
if(inherits(ref.rnb.set,"RnBSet")){
rnb.ref.set <- ref.rnb.set
}else{
rnb.ref.set <- load.rnb.set(ref.rnb.set)
}
if(!ref.pheno.column %in% colnames(pheno(rnb.ref.set))){
stop("Supplied ref.pheno.column not in phenotypic information")
}
markers <- EDec::run_edec_stage_0(reference_meth = meth(rnb.ref.set),
reference_classes = pheno(rnb.ref.set)[,ref.pheno.column],
max_p_value = 1e-5,
num_markers = n.markers)
}
if(is.null(rnb.set)){
if(!grep("cg",row.names(meth.data))){
stop("Row names of meth.data (cg-identifiers) need to be provided for EDec")
}
ind <- which(row.names(meth.data) %in% markers)
}else{
ind <- which(row.names(annotation(rnb.set)) %in% markers)
}
}
if(marker.selection[group]=="custom"){
if(file.exists(custom.marker.file)){
if(grepl("RDS",custom.marker.file)){
custom_filter<-readRDS(custom.marker.file)
}else if(grepl("txt|csv|tsv",custom.marker.file)){
custom_filter <- as.numeric(readLines(custom.marker.file))
}else{
stop("Invalid value for custom.marker.file")
}
ind<-intersect(ind, custom_filter)
}
}
if(store.heatmaps){
create.heatmap(meth.data[ind,],trait,sample.cols,palette,out.dir,marker.selection[group])
}
if(write.files){
saveRDS(ind, file=sprintf("%s/cg_group_%d.RDS", out.dir, group))
}
cg.groups[[group]]<-ind
}
names(cg.groups) <- marker.selection
return(cg.groups=cg.groups)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.