#' prepare_CG_subsets
#'
#' This routine selects a subset of CpGs sites used for MeDeCom analysis. Different selection methods are supported.
#'
#' @param meth.data A \code{matrix} or \code{data.frame} containing methylation information. If NULL, methylation information needs to be provided
#' through \code{rnb.set}
#' @param rnb.set An object of type \code{\link{RnBSet-class}} containing methylation, sample and optional coverage information.
#' @param MARKER_SELECTION A vector of strings representing marker selection methods. Available method are \itemize{
#' \item{"\code{all}"} Using all sites available in the input.
#' \item{"\code{pheno}"} Selected are the top \code{N_MARKERS} site that differ between the phenotypic
#' groups defined in data preparation or by \code{\link{rnb.sample.groups}}. Those are
#' selected by employing limma on the methylation matrix.
#' \item{"\code{houseman2012}"} The 50k sites reported as cell-type specific in the Houseman's reference-
#' based deconvolution. See Houseman et.al. 2012.
#' \item{"\code{houseman2014}"} Selects the sites said to be linked to cell type composition by \code{RefFreeEWAS},
#' which is similar to surrogate variable analysis. See Houseman et.al. 2014.
#' \item{"\code{jaffe2014}"} The sites stated as related to cell-type composition Jaffe et.al. 2014.
#' \item{"\code{rowFstat}"} Markers are selected as those found to be associated to the reference cell
#' types with F-statistics. If this option is selected, \code{REF_DATA_SET} and \code{REF_PHENO_COLUMN}
#' need to be specified.
#' \item{"\code{random}"} Sites are randomly selected.
#' \item{"\code{pca}"} Sites are selected as those with most influence on the principal components.
#' \item{"\code{var}"} Selects the most variable sites.
#' \item{"\code{hybrid}"} Selects (N_MARKERS/2) most variable and (N_MARKERS/2) random sites.
#' \item{"\code{range}"} Selects the sites with the largest difference between minimum and maximum
#' across samples.
#' \item{"\code{pcadapt}"} Uses principal component analysis as implemented in the \code{"bigstats"}
#' R package to determine sites that are significantly linked to the potential cell types. This
#' requires specifying K a priori (argument \code{K.prior}). We thank Florian Prive and Sophie
#' Achard for providing the idea and parts of the codes.
#' \item{"\code{edec_stage0}} Employs EDec's stage 0 to infer cell-type specific markers. By default
#' EDec's example reference data is provided. If a specific data set is to be provided, it needs
#' to be done through \code{REF_DATA_SET}.
#' \item{"\code{custom}"} Specifying a custom file with indices.
#' }
#' @param N_MARKERS The number of sites to be selected. Defaults to 5000.
#' @param REMOVE_CORRELATED Flag indicating if highly correlated sites are to be removed
#' @param COR_THRESHOLD Numeric indicating a threshold above which sites are not to be considered in the feature selection.
#' If \code{"quantile"}, sites correlated higher than the 95th quantile are removed.
#' @param WRITE_FILES Flag indicating if the selected sites are to be stored on disk.
#' @param WD Path to the working directory used for analyis, or data preparation.
#' @param REF_DATA_SET An object of type \code{\link{RnBSet-class}} or a path to such an object stored on disk,
#' if \code{rowFstat} is selected.
#' @param REF_PHENO_COLUMN Optional argument stating the column name of the phenotypic table of \code{REF_DATA_SET} with
#' the reference cell type.
#' @param N_PRIN_COMP Optional argument deteriming the number of prinicipal components used for selecting the most important sites.
#' @param RANGE_DIFF Optional argument specifying the difference between maximum and minimum required.
#' @param CUSTOM_MARKER_FILE Optional argument containing an absolute path to a file that specifies the indices used for employing MeDeCom. Can be provided
#' either as an \code{RDS} file containing a vector of indices to select or as a \code{txt, csv, tsv} file containing each index
#' to be selected as a single row.
#' @param store.heatmaps Flag indicating if a heatmap of the selected input sites is to be create from the input methylation matrix.
#' The files are then stored in the 'heatmaps' folder in WD.
#' @param heatmap.sample.col Column name in the phenotypic table of \code{rnb.set}, used for creating a color scheme in the heatmap.
#' @param K.prior K determined from visual inspection. Only has an influence, if \code{MARKER_SELECTION="pcadapt"}.
#' @return List of indices, one entry for each marker selection method specified by \code{MARKER_SELECTION}. The indices correspond
#' to the sites that should be used in \code{rnb.set}.
#' @details For methods "\code{houseman2012}" and "\code{jaffe2014}", a predefined set of markers is used. Since those correspond to
#' absolute indices on the chip, the provided \code{rnb.set} must not be preprocessed and therefore still contain all sites.
#' For the other metods, you may used \code{\link{prepare_data}} to filter sites for quality and context.
#' @references \itemize{
#' \item{1.} Houseman, E. A., Accomando, W. P., Koestler, D. C., Christensen, B. C., Marsit, C. J., Nelson, H. H., ..., Kelsey, K.
#' T. (2012). DNA methylation arrays as surrogate measures of cell mixture distribution. BMC Bioinformatics, 13.
#' \item{2.} Houseman, E. A., Molitor, J., & Marsit, C. J. (2014). Reference-free cell mixture adjustments in analysis of
#' DNA methylation data. Bioinformatics, 30(10), 1431-1439. https://doi.org/10.1093/bioinformatics/btu029
#' \item{3.} Jaffe, A. E., & Irizarry, R. A. (2014). Accounting for cellular heterogeneity is critical in epigenome-wide
#' association studies. Genome Biology, 15(2), R31. https://doi.org/10.1186/gb-2014-15-2-r31
#' }
#' @export
prepare_CG_subsets<-function(
meth.data=NULL,
rnb.set=NULL,
MARKER_SELECTION,
N_MARKERS=5000,
REMOVE_CORRELATED=FALSE,
COR_THRESHOLD="quantile",
WRITE_FILES=FALSE,
WD=NA,
REF_DATA_SET=NULL,
REF_PHENO_COLUMN=NULL,
N_PRIN_COMP=10,
RANGE_DIFF=0.05,
CUSTOM_MARKER_FILE="",
store.heatmaps=F,
heatmap.sample.col=NULL,
K.prior=NULL
)
{
require("RnBeads")
cg_groups<-list()
groups<-1:length(MARKER_SELECTION)
if(is.null(meth.data)){
if(inherits(rnb.set,"RnBSet")){
meth.data<-meth(rnb.set)
}else{
stop("Invalid value for rnb.set, needs to be RnBSet")
}
}
if(!is.data.frame(meth.data)&&!is.matrix(meth.data)){
stop("Invalid value for meth.data")
}
if(store.heatmaps){
if(!is.null(heatmap.sample.col)){
if(!heatmap.sample.col %in% colnames(pheno(rnb.set))){
logger.error("heatmap.sample.col not a column name of the phenotypic table")
}
trait <- as.factor(pheno(rnb.set)[,heatmap.sample.col])
palette <- rainbow(length(levels(trait)))
sample.cols <- palette[as.integer(trait)]
}else{
sample.cols <- NULL
}
if(!dir.exists(file.path(WD,"heatmaps"))){
dir.create(file.path(WD,"heatmaps"))
}
}
if(N_MARKERS>nrow(meth.data)){
logger.error("You cannot select more markers than you have sites")
}
ind.all <- 1:nrow(meth.data)
if(REMOVE_CORRELATED){
if(inherits(rnb.set,'RnBSet')){
logger.start("Removing highly correlated features")
if(!isImputed(rnb.set)){
meth.data <- rnb.execute.imputation(meth.data)
}
logger.start("Correlation matrix computation for chromosomes separately")
anno.set <- annotation(rnb.set)
all.cor <- rep(NA,nrow(anno.set))
for(chr in unique(anno.set$Chromosome)){
to.cor <- meth.data[anno.set$Chromosome %in% chr,]
if(any(is.na(to.cor))){
logger.error(paste("Missing values in correlation for chromosome",chr))
}
cor.mat <- abs(cor(t(to.cor)))
diag(cor.mat) <- NA
max.cor <- apply(cor.mat,1,mean,na.rm=T)
all.cor[anno.set$Chromosome %in% chr] <- max.cor
}
logger.completed()
logger.start("removing features")
if(!is.numeric(COR_THRESHOLD)){
if(COR_THRESHOLD != "quantile"){
logger.error("COR_THRESHOLD only allows numeric values and 'quantile'")
}
COR_THRESHOLD <- quantile(all.cor,0.95,na.rm = T)
}
logger.completed()
if(COR_THRESHOLD < 0 | COR_THRESHOLD > 1){
logger.error("COR_THRESHOLD needs to be between 0 and 1")
}
logger.info(paste("COR_THRESHOLD is",COR_THRESHOLD))
logger.info(paste("Removed ",sum(all.cor>COR_THRESHOLD),"sites in correlation filtering"))
ind.all <- intersect(ind.all,which(all.cor<=COR_THRESHOLD))
logger.completed()
}else{
logger.warning("Removing highly correlated features only applicable for 'RnBSet' input")
}
}
for(group in groups){
ind<-ind.all
if(MARKER_SELECTION[group]=="all"){
#Do nothing
ind <- ind
}
if(MARKER_SELECTION[group]=="pheno"){
if(file.exists(sprintf("%s/pheno.RData",WD))){
load(sprintf("%s/pheno.RData",WD))
sel.columns <- gsub(" ","_",colnames(pheno.data))
}else{
pheno.data <- pheno(rnb.set)
sel.columns <- gsub(" ","_",names(rnb.sample.groups(rnb.set)))
}
X <- meth.data
ncgs<-nrow(X)
X <- na.omit(X)
colnames(pheno.data) <- gsub(" ","_",colnames(pheno.data))
na.pheno <- lapply(sel.columns,function(x)is.na(pheno.data[,x]))
rem.samples <- rep(FALSE,ncol(X))
for(entry in na.pheno){
rem.samples <- rem.samples | entry
}
X <- X[,!rem.samples]
formula.text <- paste0("~0+", paste(sel.columns,collapse="+"))
design <- model.matrix(as.formula(formula.text), data=pheno.data)
fit <- limma::lmFit(X,design)
fit <- limma::eBayes(fit)
tstatDeltaAll<-abs(fit$t)
ranks<-apply(tstatDeltaAll, 2, rank)
maxRank<-apply(ranks, 1, max)
ind<-ind[order(maxRank)[1:min(N_MARKERS, length(ind))]]
}
if(MARKER_SELECTION[group]=="houseman2012" ){
loc <- system.file(file.path("extdata","houseman.50k.markers.RDS"),package="DecompPipeline")
if(file.exists(loc)){
houseman.50k.markers<-readRDS(loc)
ind<-intersect(ind, houseman.50k.markers)
}
}
if(MARKER_SELECTION[group]=="houseman2014"){
require(RefFreeEWAS)
if(file.exists(sprintf("%s/pheno.RData",WD))){
load(sprintf("%s/pheno.RData",WD))
sel.columns <- gsub(" ","_",colnames(pheno.data))
}else{
pheno.data <- pheno(rnb.set)
sel.columns <- gsub(" ","_",names(rnb.sample.groups(rnb.set)))
}
X <- meth.data
ncgs<-nrow(X)
X <- na.omit(X)
colnames(pheno.data) <- gsub(" ","_",colnames(pheno.data))
na.pheno <- lapply(sel.columns,function(x)is.na(pheno.data[,x]))
rem.samples <- rep(FALSE,ncol(X))
for(entry in na.pheno){
rem.samples <- rem.samples | entry
}
X <- X[,!rem.samples]
pheno.data <- pheno.data[!rem.samples,]
level.problem <- unlist(lapply(sel.columns,function(x,mat){
length(unique(mat[,x])) == length(levels(mat[,x]))
},pheno.data))
sel.columns <- sel.columns[level.problem]
formula.text <- paste0("~0+", paste(sel.columns,collapse="+"))
design <- model.matrix(as.formula(formula.text), data=pheno.data)
tmpBstar <- (X %*% design %*% solve(t(design)%*%design))
## rescaling the residuals, and addition by Andres
R <- X-tmpBstar %*% t(design)
rescale.residual<-TRUE
if(rescale.residual){
R <- t(scale(t(R)))
}
d<-EstDimRMT(R)$dim
logger.info(c("Estimated number of latent components is", d))
rf <- RefFreeEwasModel(X, design, d)
logger.status("Fitted the RefFreeEWAS model")
Delta <- rf$Bstar-rf$Beta
nboot<-100
#### compute se delta from a bootstrap
rfBoot <- BootRefFreeEwasModel(rf,nboot)
logger.status("Pefrormed the bootstrap")
seDelta <- apply(rfBoot[,,"B*",]-rfBoot[,,"B",], 1:2, sd)
tstatDelta <- -abs(Delta)/seDelta
if(FALSE){
for(id in 1:10){
system(sprintf("qsub -cwd -j y -o %s/rf_boot_%d.log -b y -V -N rfb_%d_%s -l h='%s' -l mem_free=%s %s/Rscript %s/rfEwasBoot.R %s %d", WD, id, id, ANALYSIS, HOSTLIST, MEMLIMIT, ANALYSIS, RDIR, SRCDIR, WD, id))
}
# load all bootstraps
load("rfBoot-1.RData")
rfBootGrandSum <- rfBoot.sum
rfBootGrandSum2 <- rfBoot.sum2
for(i in 2:10){ # Now combine
load(file.path(WD, "rfBoot-",i,".RData",sep=""))
rfBootGrandSum <- rfBootGrandSum + rfBoot.sum
rfBootGrandSum2 <- rfBootGrandSum2 + rfBoot.sum2
}
# Calculate SE's from summary data
rfBootGrandMean <- rfBootGrandSum/100
rfBootGrandMean2 <- rfBootGrandSum2/100
rfBootSE <- sqrt((100/99)*(rfBootGrandMean2-rfBootGrandMean*rfBootGrandMean))
}
ncgs <- nrow(meth.data)
nnas <- apply(is.na(meth.data),1,sum)
notna.rows <- which(nnas==0)
if(is.null(dim(tstatDelta))){
tstatDeltaAll<-rep(NA, ncgs)
tstatDeltaAll[notna.rows]<-tstatDelta
}else{
tstatDeltaAll<-matrix(NA_real_, ncol=ncol(tstatDelta), nrow=ncgs)
tstatDeltaAll[notna.rows,]<-tstatDelta
}
if(is.null(dim(tstatDelta))){
ind<-ind[order(-tstatDeltaAll)[1:min(10000, length(ind))]]
}else{
ranks<-apply(-tstatDeltaAll, 2, rank)
maxRank<-apply(ranks, 1, max)
ind<-ind[order(maxRank)[1:min(N_MARKERS, length(ind))]]
}
}
if(MARKER_SELECTION[group]=="jaffe2014"){
jaffe.markers <- readRDS(system.file(file.path("extdata","jaffe.irrizzary.markers.600.RDS"),package="DecompPipeline"))
ind<-intersect(ind, jaffe.markers)
}
if(MARKER_SELECTION[group]==("rowFstat")){
####### This needs to be checked: it doesn't seem to do what it should
if(!(is.null(REF_DATA_SET) || is.null(REF_PHENO_COLUMN))){
require(genefilter)
load.env<-new.env(parent=emptyenv())
if(inherits(REF_DATA_SET,"RnBSet")){
rnb.ref.set <- REF_DATA_SET
}else{
rnb.ref.set <- load.rnb.set(REF_DATA_SET)
}
meth.data.ref <- meth(rnb.ref.set)
pheno.data.ref <- pheno(rnb.ref.set)
marker.fstat<-rowFtests(meth.data.ref[ind,], as.factor(pheno.data.ref[[REF_PHENO_COLUMN]]))
subset<-which(rank(-marker.fstat$statistic)<=N_MARKERS)
ind<-ind[subset]
}else{
logger.error("REF_DATA_SET and REF_PHENO_COLUMN need to be specified, if rowFstat is selected.")
}
}
if(MARKER_SELECTION[group]=="random"){
subset<-sample.int(length(ind), min(N_MARKERS, length(ind)))
ind<-ind[subset]
}
if(MARKER_SELECTION[group]=="pca"){
meth.data.subs<-meth.data[ind,]
pca<-prcomp(t(meth.data.subs))
rot<-pca$rotation[,1:min(N_PRIN_COMP,ncol(pca$rotation))]
pca.ind<-integer()
add.sites <- ceiling(N_MARKERS/N_PRIN_COMP)
for(cix in 1:min(N_PRIN_COMP,ncol(pca$rotation))){
pca.ind<-union(pca.ind,order(abs(rot[,cix]), decreasing = TRUE)[1:add.sites])
}
ind<-ind[sort(pca.ind)]
}
if(MARKER_SELECTION[group] == "var"){
sds<-apply(meth.data[ind,], 1, sd)
ind<-ind[order(sds, decreasing=TRUE)[1:min(length(ind),N_MARKERS)]]
}
if(MARKER_SELECTION[group] == "hybrid"){
sds<-apply(meth.data[ind,], 1, sd)
var.set<-ind[order(sds, decreasing=TRUE)[1:min(length(ind),ceiling(N_MARKERS/2))]]
random.set<-sample(setdiff(ind,var.set), floor(N_MARKERS/2))
ind<-sort(c(var.set, random.set))
rm(var.set)
rm(random.set)
}
if(MARKER_SELECTION[group] == "range"){
ranges<-apply(meth.data[ind,], 1, range)
ranges<-ranges[2,]-ranges[1,]
ind<-ind[ranges>RANGE_DIFF]
rm(ranges)
}
if(MARKER_SELECTION[group] == "pcadapt"){
require("bigstatsr")
require("robust")
if(is.null(K.prior)){
stop("K.prior needs to be specific for pcadapat marker selection method")
}
meth.t <- t(meth.data[ind,])
meth.t <- as_FBM(meth.t)
svd <- big_SVD(meth.t,fun.scaling = big_scale(),k=K.prior)
singular.vectors <- svd$u
z.scores <- sapply(cols_along(singular.vectors), function(k) {
big_univLinReg(meth.t, singular.vectors[, k])$score
})
dist.math <- robust::covRob(z.scores, estim = "pairwiseGK")$dist
lpval <- pchisq(dist.math, df = K.prior, lower.tail = FALSE, log.p = TRUE) / log(10)
ind <- ind[order(lpval,decreasing = F)[1:N_MARKERS]]
}
if(MARKER_SELECTION[group]=="edec_stage0"){
require("EDec")
require("EDecExampleData")
if(is.null(REF_DATA_SET)){
markers <- run_edec_stage_0(reference_meth = EDecExampleData::reference_meth,
reference_classes = EDecExampleData::reference_meth_class,
max_p_value = 1e-5,
num_markers = N_MARKERS)
}else{
if(inherits(REF_DATA_SET,"RnBSet")){
rnb.ref.set <- REF_DATA_SET
}else{
rnb.ref.set <- load.rnb.set(REF_DATA_SET)
}
if(!REF_PHENO_COLUMN %in% colnames(pheno(rnb.ref.set))){
stop("Supplied REF_PHENO_COLUMN not in phenotypic information")
}
markers <- run_edec_stage_0(reference_meth = meth(rnb.ref.set),
reference_classes = pheno(rnb.ref.set)[,REF_PHENO_COLUMN],
max_p_value = 1e-5,
num_markers = N_MARKERS)
}
if(is.null(rnb.set)){
if(!grep("cg",row.names(meth.data))){
stop("Row names of meth.data (cg-identifiers) need to be provided for EDec")
}
ind <- which(row.names(meth.data) %in% markers)
}else{
ind <- which(row.names(annotation(rnb.set)) %in% markers)
}
}
if(MARKER_SELECTION[group]=="custom"){
if(file.exists(CUSTOM_MARKER_FILE)){
if(grepl("RDS",CUSTOM_MARKER_FILE)){
custom_filter<-readRDS(CUSTOM_MARKER_FILE)
}else if(grepl("txt|csv|tsv",CUSTOM_MARKER_FILE)){
custom_filter <- as.numeric(readLines(CUSTOM_MARKER_FILE))
}else{
stop("Invalid value for CUSTOM_MARKER_FILE")
}
ind<-intersect(ind, custom_filter)
}
}
if(store.heatmaps){
create.heatmap(meth.data[ind,],trait,sample.cols,palette,WD,MARKER_SELECTION[group])
}
if(WRITE_FILES){
saveRDS(ind, file=sprintf("%s/cg_group_%d.RDS", WD, group))
}
cg_groups[[group]]<-ind
}
names(cg_groups) <- MARKER_SELECTION
return(cg_groups=cg_groups)
}
#' create.heatmap
#'
#' This function create a heatmap of methylation values with the colors specified according to the trait.
#'
#' @param meth.data The methylation data to be plotted
#' @param trait A column in the phenotypic table specifying a grouping.
#' @param sample.cols A vector of colors specifying the grouping in \code{trait}.
#' @param WD The working directory, in which the plots are to be added in a subfolder "heatmaps".
#' @param palette The color palette from which \code{sample.cols} was obtained.
#' @param sel.type The method used for selecting subsets of sites.
#' @author Michael Scherer
#' @noRd
create.heatmap <- function(meth.data,
trait,
sample.cols,
palette,
WD,
sel.type){
if(!is.null(sample.cols)){
png(file.path(WD,"heatmaps",paste0(sel.type,".png")))
heatmap.2(meth.data,
trace="none",
ColSideColors=sample.cols
)
legend(x=0,y=1,levels(trait),col=palette,pch=15)
dev.off()
}else{
png(file.path(WD,"heatmaps",paste0(sel.type,".png")))
heatmap.2(meth.data,
trace="none"
)
dev.off()
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.