R/start_analysis.R

Defines functions start.analysis start.edec.analysis start.refreeewas.analysis start_medecom_analysis start_decomp_pipeline

Documented in start.analysis start_decomp_pipeline start.edec.analysis start_medecom_analysis start.refreeewas.analysis

#' start.analysis
#' 
#' Wrapper function to start one of the deconvolution algorithms \code{MeDeCom}, \code{RefFreeEWAS} or \code{EDec}
#' 
#' @param meth.data A \code{matrix} or \code{data.frame} containing methylation information. If NULL, methylation information needs to be provided
#'                   through \code{rnb.set}
#' @param rnb.set An object of type \code{\link{RnBSet-class}} containing methylation and sample meta information.
#' @param work.dir Working directory for the analysis.
#' @param cg_groups List of CpG indices used for the analysis. Can be computed by \code{\link{prepare_CG_subsets}}.
#' @param Ks Vector of integers used as components in MeDeCom.
#' @param factorviz.outputs Flag indicating, if outputs should be stored to be compatible with FactorViz for data exploration
#' @param method The method to be used for deconvolution. Can be one of \code{MeDeCom}, \code{RefFreeCellMix} or \code{EDec}.
#' @author Michael Scherer
#' @export
start.analysis <- function(meth.data=NULL,
                           rnb.set=NULL,
                           cg_groups,
                           Ks,
                           work.dir,
                           factorviz.outputs=FALSE,
                           method="MeDeCom",
                           ...){
  all.methods <- c("MeDeCom","RefFreeCellMix","EDec")
  if(!method %in% all.methods){
    stop(paste0("Invalid value for method. Needs to be one of ",all.methods))
  }
  if(method == "MeDeCom"){
    md.res <- start_medecom_analysis(meth.data = meth.data,
                                     rnb.set = rnb.set,
                                     cg_groups = cg_groups,
                                     Ks = Ks,
                                     WORK_DIR = work.dir,
                                     factorviz.outputs = factorviz.outputs,
                                     ...)
  }else if(method == "RefFreeCellMix"){
    md.res <- start.refreeewas.analysis(meth.data=meth.data,
                                                    rnb.set=rnb.set,
                                                    cg_groups=cg_groups,
                                                    Ks=Ks,
                                                    work.dir=work.dir,
                                                    factorviz.outputs=factorviz.outputs)
  }else if(method == "EDec"){
    md.res <- start.edec.analysis(meth.data=meth.data,
                                  rnb.set=rnb.set,
                                  cg_groups=cg_groups,
                                  Ks=Ks,
                                  work.dir = work.dir,
                                  factorviz.outputs = factorviz.outputs)
  }
  return(md.res)
}

#' start.edec.analysis
#' 
#' This function executes EDec for the specified CpGs and the number of cell types K.
#' 
#' @param meth.data A \code{matrix} or \code{data.frame} containing methylation information. If NULL, methylation information needs to be provided
#'                   through \code{rnb.set}
#' @param rnb.set An object of type \code{\link{RnBSet-class}} containing methylation and sample meta information.
#' @param cg_groups List of CpG indices used for the analysis. Can be computed by \code{\link{prepare_CG_subsets}}.
#' @param Ks The number of cell types to be tested. Can be a single numeric value or an array of numbers.
#' @param work.dir The working directory to be used.
#' @param factorviz.outputs Flag indicating, if outputs should be stored to be compatible with FactorViz for data exploration
#' @return An object of type \code{\link{MeDeComSet}} containing the results of the EDec experiment.
#' @author Michael Scherer
#' @export
start.edec.analysis <- function(meth.data=NULL,
                                rnb.set=NULL,
                                cg_groups,
                                Ks,
                                work.dir=getwd(),
                                factorviz.outputs=FALSE){
  if(is.null(meth.data) && is.null(rnb.set)){
    logger.error("No input methylation data provided")
  }
  if(is.null(meth.data)){
    if(inherits(rnb.set,"RnBSet")){
      meth.data <- meth(rnb.set,row.names=T)
    }else{
      logger.error("Invalid value for rnb.set")
    }
  }else if(!(grepl("cg",row.names(meth.data)))){
    stop("Rownames of methylation data need to be provided for EDec")
  }
  require("EDec")
  T.all <- list()
  A.all <- list()
  rss.all <- list()
  res.all <- list()
  for(i.group in 1:length(cg_groups)){
    logger.start(paste("Processing group:",i.group))
    group <- cg_groups[[i.group]]
    group <- row.names(meth.data)[group]
    rss.vec <- c()
    T.all <- list()
    A.all <- list()
    for(j.K in 1:length(Ks)){
      K <- Ks[j.K]
      logger.start(paste("Processing K:",K))
      edec.res <- run_edec_stage_1(meth.data,
                                   informative_loci = group,
                                   num_cell_types = K)
      rss.vec <- c(rss.vec,edec.res$res.sum.squares)
      T.all[[j.K]] <- edec.res$methylation
      A.all[[j.K]] <- t(edec.res$proportions)
      logger.completed()
    }
    rss.all[[i.group]] <- rss.vec
    res.all[[i.group]] <- list(T=T.all,A=A.all)
    logger.completed()
  }
  result <- as.MeDeComSet(res.all,cg_subsets=1:length(cg_groups),Ks=Ks,rss=rss.all,m.orig=nrow(meth.data),n.orig=ncol(meth.data))
  result@parameters$GROUP_LISTS <- cg_groups
  if(factorviz.outputs){
    store.path <- file.path(work.dir,"FactorViz_outputs")
    if(!file.exists(store.path)){
      dir.create(store.path)
    }
    if(!is.null(rnb.set)){
      result@parameters$ASSEMBLY <- assembly(rnb.set)
      ann.C <- annotation(rnb.set)
      ann.S <- pheno(rnb.set)
      save(ann.C,file=file.path(store.path,"ann_C.RData"))
      save(ann.S,file=file.path(store.path,"ann_S.RData"))
    }
    medecom.set <- result
    save(medecom.set,file=file.path(store.path,"medecom_set.RData"))
    save(meth.data,file=file.path(store.path,"meth_data.RData"))
  }
  return(result)
}

#' start.refreeewas.analysis
#' 
#' This function executes RefFreeCellMix for the specified CpGs and the number of cell types K.
#' 
#' @param meth.data A \code{matrix} or \code{data.frame} containing methylation information. If NULL, methylation information needs to be provided
#'                   through \code{rnb.set}
#' @param rnb.set An object of type \code{\link{RnBSet-class}} containing methylation and sample meta information.
#' @param cg_groups List of CpG indices used for the analysis. Can be computed by \code{\link{prepare_CG_subsets}}.
#' @param Ks The number of cell types to be tested. Can be a single numeric value or an array of numbers.
#' @param work.dir The working directory to be used.
#' @param factorviz.outputs Flag indicating, if outputs should be stored to be compatible with FactorViz for data exploration
#' @return An object of type \code{\link{MeDeComSet}} containing the results of the RefFreeCellMix experiment.
#' @author Michael Scherer
#' @export
start.refreeewas.analysis <- function(meth.data=NULL,
                                      rnb.set=NULL,
                                      cg_groups,
                                      Ks,
                                      work.dir=getwd(),
                                      factorviz.outputs=FALSE){
  if(is.null(meth.data) && is.null(rnb.set)){
    logger.error("No input methylation data provided")
  }
  if(is.null(meth.data)){
    if(inherits(rnb.set,"RnBSet")){
      meth.data <- meth(rnb.set)
    }else{
      logger.error("Invalid value for rnb.set")
    }
  }
  res.all <- list()
  devis.all <- list()
  for(i.group in 1:length(cg_groups)){
    logger.start(paste("Processing group:",i.group))
    group <- cg_groups[[i.group]]
    meth.sset <- meth.data[group,]
    res.sset <- RefFreeCellMixArray(meth.sset,Klist=Ks)
    devis <- tryCatch(RefFreeCellMixArrayDevianceBoots(res.sset,Y=meth.sset),error=function(e)e)
    if(inherits(devis,"error")){
      devis <- rep(NA,length(Ks))
    }else{
      devis <- colMeans(devis)
    }
    devis.all[[i.group]] <- devis
    res.all[[i.group]] <- res.sset
    logger.completed()
  }
  result <- as.MeDeComSet(res.all,cg_subsets=1:length(cg_groups),Ks=Ks,deviances=devis.all,m.orig=nrow(meth.data),n.orig=ncol(meth.data))
  result@parameters$GROUP_LISTS <- cg_groups
  if(factorviz.outputs){
    store.path <- file.path(work.dir,"FactorViz_outputs")
    if(!file.exists(store.path)){
      dir.create(store.path)
    }
    if(!is.null(rnb.set)){
      result@parameters$ASSEMBLY <- assembly(rnb.set)
      ann.C <- annotation(rnb.set)
      ann.S <- pheno(rnb.set)
      save(ann.C,file=file.path(store.path,"ann_C.RData"))
      save(ann.S,file=file.path(store.path,"ann_S.RData"))
    }
    medecom.set <- result
    save(medecom.set,file=file.path(store.path,"medecom_set.RData"))
    save(meth.data,file=file.path(store.path,"meth_data.RData"))
  }
  return(result)
}

#' start_medecom_analysis
#' 
#' Wrapper for runMeDeCom, for data preprocessed through the DecompPipeline
#' 
#' @param meth.data A \code{matrix} or \code{data.frame} containing methylation information. If NULL, methylation information needs to be provided
#'                   through \code{rnb.set}
#' @param rnb.set An object of type \code{\link{RnBSet-class}} containing methylation and sample meta information.
#' @param WORK_DIR Working directory for the analysis.
#' @param cg_groups List of CpG indices used for the analysis. Can be computed by \code{\link{prepare_CG_subsets}}.
#' @param Ks Vector of integers used as components in MeDeCom.
#' @param LAMBDA_GRID Vector of doubles representing the regularization parameter in MeDeCom.
#' @param SAMPLE_SUBSET Vector of indices of samples to be included in the analysis. If \code{NULL}, all samples are included.
#' @param K_FIXED Columns in the T matrix that should be fixed. If \code{NULL}, no columns are fixed.
#' @param WRITE_FILES Flag indicating if intermediate results are to be stored.
#' @param factorviz.outputs Flag indicating, if outputs should be stored to be compatible with FactorViz for data exploration
#' @param opt.method Optimization method to be used. Either MeDeCom.quadPen or MeDeCom.cppTAfact (default).
#' @param startT Inital matrix for T.
#' @param startA Initial matrix for A.
#' @param trueT True value for the T matrix.
#' @param trueA True value for the A matrix.
#' @param analysis.name Name of the analysis.
#' @param folds Integer representing the number of folds used in the analysis.
#' @param cores Integer representing the number of cores to be used in the analysis.
#' @param itermax Maximum number of iterations
#' @param ninit Number if initialtions.
#' @param CLUSTER_SUBMIT Flag indicating, if the jobs are to be submitted to a scientific compute cluster (only SGE supported).
#' @param CLUSTER_RDIR Path to an executable version of R.
#' @param CLUSTER_HOSTLIST Regular expression, on which basis hosts are selected in the cluster environment.
#' @param CLUSTER_MEMLIMIT the \code{memlimit} resource value of the cluster submission.
#' @param CLEANUP Flag indicating if temprary files are to be deleted.
#' @param analysis_info Information to be saved about the analysis. Just stored as info.
#' @param LAMBDA_GRID_TYPE String represent the lambda grid that was chosen. Just stored as info.
#' @param ANALYSIS_TOKEN String specifying the type of analysis that was conducted. Just stored as info.
#' @return An object of type \code{\link{MeDeComSet}} containing the results of the MeDeCom experiment.
#' @export
start_medecom_analysis<-function(
    meth.data=NULL,
		rnb.set=NULL,
		WORK_DIR=getwd(),
		cg_groups,
		Ks,
		LAMBDA_GRID,
		SAMPLE_SUBSET=NULL,
		K_FIXED=NULL,
		WRITE_FILES=TRUE,
		factorviz.outputs=F,
		opt.method = "MeDeCom.cppTAfact",
		startT=NULL,
		startA=NULL,
		trueT = NULL,
		trueA = NULL,
		analysis.name="MeDeComRun",
		folds=10,
		cores=1,
		itermax=1000,
		ninit=100,
		CLUSTER_SUBMIT=FALSE,
		CLUSTER_RDIR=NA,
		CLUSTER_HOSTLIST="*",
		CLUSTER_MEMLIMIT="5G",
#		MAX_JOBS=1000,
#		WAIT_TIME="30m",
#		PORTIONS=FALSE,
#		JOB_FILE=NA,
		CLEANUP=FALSE,
		analysis_info=NULL,
		LAMBDA_GRID_TYPE="standard",
		ANALYSIS_TOKEN="customAnalysis"
){
	library(MeDeCom)
	library(R.utils)
	
	#RDIR="/TL/deep-share/archive00/software/bin"
	#.libPaths(sprintf("%s/Rlib_test", WORK_DIR))
  
  if(!is.null(analysis_info)){
    ANALYSIS_ID<-paste(
      analysis_info$DATASET, 
      analysis_info$DATA_SUBSET,
      analysis_info$NORMALIZATION,
      analysis_info$QUALITY_FILTERING, 
      analysis_info$MARKER_SELECTION,
      LAMBDA_GRID_TYPE,
      ANALYSIS_TOKEN,
      sep="_")
  }else{
    ANALYSIS_ID<-"customAnalysis"
    analysis_info<-list()
  }
  analysis_info$ANALYSIS<-ANALYSIS_ID
  
  WORK_DIR <- file.path(WORK_DIR,analysis.name)
  if(!file.exists(WORK_DIR)){
    dir.create(WORK_DIR)
  }
  log.file <- file.path(WORK_DIR,"analysis.log")
  if(!file.exists(log.file)){
    if(logger.isinitialized()){
      logger.close()
      logger.start(fname=log.file)
    }else{
      logger.start(fname=log.file)
    }
  }
	
	if(is.na(CLUSTER_RDIR)){
		if(any(grepl("deep", R.utils:::System$getHostname()))){
			#RDIR="/TL/deep-share/archive00/software/packages/R/R-3.1.1/bin"
			CLUSTER_RDIR="/TL/deep-share/archive00/software/packages/R/R-devel_20160307/bin"
		}else if(any(grepl("t7600", R.utils:::System$getHostname()))){
			CLUSTER_RDIR="/usr/bin/"
		}else{
			CLUSTER_RDIR="/usr/bin/"
		}
	}
	
#	SRCDIR=sprintf("%s/projects/parameter_tuning/src", WORK_DIR)
#	DATA_DIR=sprintf("%s/projects/parameter_tuning/data", WORK_DIR)
#	GLOBAL_DD<-sprintf("%s/projects/parameter_tuning/data/common", WORK_DIR)
	
	if(TRUE){
		print("Did not write the variable dump: should only be executed from an environment with all the variables set")	
	}else{
		var_list<-c(ANALYSIS)
		system(sprintf("mkdir %s", WORK_DIR))
		dump(ls()[var_list], file=file.path(WORK_DIR, "analysis_settings.RDump"))
	}
	
  if(is.null(meth.data)){
  	if(is.null(rnb.set)){
  		load(sprintf("%s/data.set.RData", WORK_DIR))
  	}else{
  		meth.data<-meth(rnb.set)
  	}
  }
	
#	if(!DATASET %in% c("sim", "simReal")){
#		load(sprintf("%s/indices.RData", GLOBAL_DD))
#		load(sprintf("%s/snp.filter.RData", GLOBAL_DD))
#		load(sprintf("%s/somatic.filter.RData", GLOBAL_DD))
#	}
	
#	if(file.exists(sprintf("%s/quality.filter.RData", DD))){
#		load(sprintf("%s/quality.filter.RData", DD))
#	}
	
#	if(file.exists(sprintf("%s/Mint.RDS", DD))){
#		M.raw<-readRDS(sprintf("%s/Mint.RDS", DD))
#	}else{
#		M.raw<-NULL
#	}
	
#	if(file.exists(sprintf("%s/Uint.RDS", DD))){
#		U.raw<-readRDS(sprintf("%s/Uint.RDS", DD))
#	}else{
#		U.raw<-NULL
#	}
	
#	if(file.exists(sprintf("%s/Nbeads.RDS", DD))){
#		b.raw<-readRDS(sprintf("%s/Nbeads.RDS", DD))
#	}else{
#		b.raw<-NULL
#	}
	
	if(WRITE_FILES){
		saveRDS(LAMBDA_GRID, file=sprintf("%s/lambda_grid.RDS", WORK_DIR))
		if(!is.null(SAMPLE_SUBSET)){
			saveRDS(SAMPLE_SUBSET, file=sprintf("%s/sample_subset.RDS", WORK_DIR))
		}else{
			SAMPLE_SUBSET<-1:ncol(meth.data)
		}
	}
	
	groups<-1:length(cg_groups)
	
#	if(QUALITY_FILTERING %in% c("standard","MplusU") && !is.null(M.raw) && !is.null(U.raw)){
#		
#		if(!"MIN_INT_QUANT" %in% ls()){
#			MIN_INT_QUANT<-0.1
#		}
#		if(!"MAX_INT_QUANT" %in% ls()){
#			MAX_INT_QUANT<-0.95
#		}
#		if(!"MIN_N_BEADS" %in% ls()){
#			MIN_N_BEADS<-3
#		}
#		
#		MplusU<-M.raw+U.raw
#		
#		hm450_ann<-readRDS(file.path(GLOBAL_DD, "hm450_cg_annot.RDS"))
#		
#		MplusU.I<-MplusU[hm450_ann$Design=="I",]
#		MplusU.II<-MplusU[hm450_ann$Design=="II",]
#		
#		MU.q001.I<-sort(as.numeric(MplusU.I))[ceiling(MIN_INT_QUANT*nrow(MplusU.I)*ncol(MplusU.I))]
#		MU.q099.I<-sort(as.numeric(MplusU.I))[ceiling(MAX_INT_QUANT*nrow(MplusU.I)*ncol(MplusU.I))]
#		
#		MU.q001.II<-sort(as.numeric(MplusU.II))[ceiling(MIN_INT_QUANT*nrow(MplusU.II)*ncol(MplusU.II))]
#		MU.q099.II<-sort(as.numeric(MplusU.II))[ceiling(MAX_INT_QUANT*nrow(MplusU.II)*ncol(MplusU.II))]
#		
#		MplusU.f<-matrix(FALSE, nrow=nrow(MplusU), ncol=ncol(MplusU))
#		
#		MplusU.f[hm450_ann$Design=="I",]<-MplusU.I>MU.q001.I & MplusU.I<MU.q099.I
#		MplusU.f[hm450_ann$Design=="II",]<-MplusU.II>MU.q001.II & MplusU.II<MU.q099.II
#		
#		qf.MU<-which(rowSums(MplusU.f)==ncol(MplusU.f))
#		
#		if(!is.null(b.raw)){
#			qf.b<-which(rowSums(b.raw>=MIN_N_BEADS)==ncol(b.raw))
#			qf.MU.beads<-intersect(qf.MU, qf.b)
#		}else{
#			qf.MU.beads<-qf.MU
#		}
#		
#	}
	
#	for(group in groups){
#		
#		# LOAD THE RESULTS
#		#if(DATASET %in% c("sim", "simReal")){
#		ind<-1:nrow(meth.data)	
#		#}else{
#		#	ind<-sort(unlist(indices[GROUP_LISTS[[group]]]))
#		#}
#		
#		if(QUALITY_FILTERING=="standard"){
#			ind<-intersect(ind, qf.MU.beads)
#			ind<-intersect(ind, snp.filter)
#			ind<-intersect(ind, somatic.filter)
#		}else if(QUALITY_FILTERING=="snpANDsomatic"){
#			ind<-intersect(ind, snp.filter)
#			ind<-intersect(ind, somatic.filter)
#		}else if(QUALITY_FILTERING=="somatic"){
#			ind<-intersect(ind, somatic.filter)
#		}else if(QUALITY_FILTERING=="snp"){
#			ind<-intersect(ind, snp.filter)
#		}else if(QUALITY_FILTERING=="MplusU"){
#			ind<-intersect(ind, qf.MU.beads)
#		}
#		
#	}
	if(!is.null(K_FIXED)){
		saveRDS(K_FIXED, file=sprintf("%s/fixed_T_cols.RDS", WORK_DIR))
	}else{
		K_FIXED<-NULL
	}
	
#	if("START" %in% ls() && file.exists(START)){
#		system(sprintf("cp %s %s/start.RData", START, WORK_DIR))
#		load(file.path(WORK_DIR, START))
#		startT=result$T
#		startA=result$A
#	}else{
#		startT=NULL
#		startA=NULL
#	}
	
	if(!is.null(trueT)){
	  if(is.character(trueT)){
	    if(!file.exists(trueT)){
	      logger.error(paste("File for trueT",trueT,"does not exist."))
	    }else{
		    load(trueT)
	    }
	  }else if(!is.matrix(trueT)){
	    logger.error("Invalid value for trueT")
	  }
	}
	
	if(!is.null(trueA)){
	  if(is.character(trueA)){
	    if(!file.exists(trueA)){
	      logger.error(paste("File for trueA",trueA,"does not exist."))
	    }else{
	      load(trueA)
	    }
	  }else if(!is.matrix(trueA)){
	    logger.error("Invalid value for trueA")
	  }
	}
#	if(PORTIONS && is.na(JOB_FILE)){
#		JOB_FILE<-"/tmp/job_file"
#	}
#	cluster_submit<-function(qsub_string, portions=PORTIONS, job_script_file=JOB_FILE){
#		
#		if(portions){
#			cat(qsub_string, file=JOB_FILE, append=TRUE, sep="\n")
#		}else{
#			system(qsub_string)
#			#print(qsub_string)
#		}
#		
#	}
	
	if(CLUSTER_SUBMIT){
		cluster.settings=list(R_bin_dir=CLUSTER_RDIR, host_pattern=CLUSTER_HOSTLIST, mem_limit=CLUSTER_MEMLIMIT)
	}else{
		cluster.settings=NULL
	}
	
	result<-runMeDeCom(data=meth.data, 
			Ks=Ks,
			lambdas=LAMBDA_GRID,
			opt.method=opt.method,
			cg_subsets=cg_groups,
			sample_subset=SAMPLE_SUBSET,
			startT=startT,
			startA=startA,
			trueT=trueT,
			trueA=trueA,
			fixed_T_cols=K_FIXED,
			NINIT=ninit, 
			ITERMAX=itermax, 
			NFOLDS=folds,
			N_COMP_LAMBDA=4,
			NCORES=cores,
			analysis.name=analysis.name,
			use.ff=FALSE,
			cluster.settings=cluster.settings,
			temp.dir=WORK_DIR,
			cleanup=CLEANUP,
			verbosity=1L,
			time.stamps=TRUE
			#,random.seed=RANDOM_SEED
	)
	
	### TODO: Fix this once
	
	result@parameters$ANALYSIS <- analysis.name
	result@parameters$GROUP_LISTS <- cg_groups
	# result@parameters$cg_subsets <- c(1:length(cg_groups))
	# result@parameters$ANALYSIS <- ANALYSIS_ID
	# result@parameters$NORMALIZATION <- NORMALIZATION
	result@parameters$ITERMAX<-itermax
	#    result@parameters$MARKER_SELECTION<- MARKER_SELECTION
	result@parameters$NFOLDS<-folds
	#    result@parameters$ANALYSIS_TOKEN<-""
	result@parameters$NINIT<-ninit
	#    result@parameters$DATASET<-DATASET 
	#    result@parameters$DATA_SUBSET<-DATA_SUBSET 
	
	analysis_info$GROUP_LISTS<-cg_groups
	analysis_info$ANALYSIS_DATE<-date()
	result@dataset_info<-c(result@dataset_info, analysis_info)
	
	if(WRITE_FILES){
		saveRDS(result, file=file.path(WORK_DIR, "collected.result.RDS"))
	}
	
	if(factorviz.outputs){
	  store.path <- file.path(WORK_DIR,"FactorViz_outputs")
	  if(!file.exists(store.path)){
	    dir.create(store.path)
	  }
	  if(!is.null(rnb.set)){
	    result@parameters$ASSEMBLY <- assembly(rnb.set)
	    ann.C <- annotation(rnb.set)
	    ann.S <- pheno(rnb.set)
	    save(ann.C,file=file.path(store.path,"ann_C.RData"))
	    save(ann.S,file=file.path(store.path,"ann_S.RData"))
	  }
	  medecom.set <- result
	  save(medecom.set,file=file.path(store.path,"medecom_set.RData"))
	  save(meth.data,file=file.path(store.path,"meth_data.RData"))
	  if(!is.null(trueT)){
	    ref.meth <- trueT
	    save(ref.meth,file=file.path(store.path,"ref_meth.RData"))
	  }
	  if(!is.null(trueA)){
	    ref.props <- trueA
	    save(ref.props,file=file.path(store.path,"ref_props.RData"))
	  }
	}
	return(result)
}

#' start_decomp_pipeline
#' 
#' Main workhorse of the DecompPipeline R-package. Performs preprocessing (\code{\link{prepare_data}} or \code{\link{prepare_data_BS}}),
#' CpG subset selection (\code{\link{prepare_CG_subsets}}) and deconvolution (\code{\link{start_medecom_analysis}}, \code{\link{start.refreeewas.analysis}}, \code{\link{start.edec.analysis}})
#' 
#' @param rnb.set An object of type \code{\link[RnBeads]{RnBSet-class}} for which analysis is to be performed.
#' @param Ks Vector of integers used as components in MeDeCom.
#' @param lambda.grid Vector of doubles representing the regularization parameter in MeDeCom.
#' @param work.dir A path to a existing directory, in which the results are to be stored
#' @param analysis.name A string representing the dataset for which analysis is to be performed. Only used to create a folder with a 
#'                 descriptive name of the analysis.
#' @param sample.selection.col A column name in the phenotypic table of \code{RNB_SET} used to selected a subset of samples for
#'                 analysis that contain the string given in \code{SAMPLE_SELECTION_GREP}.
#' @param sample.selection.grep A string used for selecting samples in the column \code{SAMPLE_SELECTION_COL}.
#' @param pheno.cols Vector of column names in the phenotypic table of \code{RNB_SET} that is kept and exported for further 
#'                 exploration.
#' @param id.column Sample-specific ID column name in \code{RNB_SET}
#' @param normalization Normalization method to be performed before employing MeDeCom. Can be one of \code{"none","dasen","illumina","noob"} (BeadChip only).
#' @param ref.ct.column Column name in \code{RNB_SET} used to extract methylation information on the reference cell types.
#' @param ref.rnb.set An object of type \code{\link[RnBeads]{RnBSet-class}} containing methylation information on reference cell types (BeadChip only).
#' @param ref.rnb.ct.column Column name in \code{REF_RNB_SET} used to extract methylation information on the reference cell types (BeadChip only).
#' @param prepare.true.proportions Flag indicating if true proportions are either available in \code{RNB_SET} or to be estimated 
#'                          with Houseman's reference-based deconvolution approach (BeadChip only).
#' @param true.A.token String present in the column names of \code{RNB_SET} used for selecting the true proportions of the corresponding
#'                      cell types.
#' @param houseman.A.token Similar to \code{TRUE_A_TOKEN}, but not containing the true proportions, rather the estimated proportions
#'                      by Houseman's method (BeadChip only).
#' @param estimate.houseman.prop If neither \code{TRUE_A_TOKEN} nor \code{HOUSEMAN_A_TOKEN} are given, the proportions of the reference
#'                      cell type are estimated with Houseman's approach (BeadChip only).
#' @param filter.beads Flag indicating, if site-filtering based on the number of beads available is to be conducted (BeadChip only).
#' @param min.n.beads Minimum number of beads required in each sample for the site to be considered for adding to MeDeCom (BeadChip only).
#' @param filter.intensity  Flag indicating if sites should be removed according to the signal intensities (the lowest and highest quantiles
#'                      given by \code{MIN_INT_QUANT} and \code{MAX_INT_QUANT}) (BeadChip only).
#' @param min.int.quant Lower quantile of intensities which is to be removed (BeadChip only).
#' @param max.int.quant Upper quantile of intensities which is to be removed (BeadChip only).
#' @param filter.na Flag indicating if sites with any missing values are to be removed or not.
#' @param filter.context Flag indicating if only CG probes are to be kept (BeadChip only).
#' @param filter.cross.reactive Flag indicating if sites showing cross reactivity on the array are to be removed.
#' @param execute.lump Flag indicating if the LUMP algorithm is to be used for estimating the amount of immune cells in a particular sample.
#' @param remove.ICA Flag indicating if independent component analysis is to be executed to remove potential confounding factor.
#'             If \code{TRUE},conf.fact.ICA needs to be specified.
#' @param conf.fact.ICA Column name in the sample annotation sheet representing a potential confounding factor.
#' @param ica.setting Optional argument setting up ICA.
#' @param filter.snp Flag indicating if annotated SNPs are to be removed from the list of sites according to RnBeads' SNP list. (@TODO: we
#'                     could provide an addititional list of SNPs, similar to RnBeads blacklist for filtering)
#' @param snp.list Path to a file containing CpG IDs of known SNPs to be removed from the analysis, if \code{FILTER_SNP} is \code{TRUE}.
#' @param filter.somatic Flag indicating if only somatic probes are to be kept.
#' CPG FILTERING (BS)
#' @param filter.coverage Flag indicating, if site-filtering based on coverage is to be conducted (BS only).
#' @param min.coverage Minimum number of reads required in each sample for the site to be considered for adding to MeDeCom (BS only).
#' @param min.covg.quant Lower quantile of coverages. Values lower than this value will be ignored for analysis (BS only).
#' @param max.covg.quant Upper quantile of coverages. Values higher than this value will be ignored for analysis (BS only).
#' CG_SUBSET SELECTION
#' @param marker.selection A vector of strings representing marker selection methods. Available method are \itemize{
#'                                  \item{"\code{all}"} Using all sites available in the input.
#'                                  \item{"\code{pheno}"} Selected are the top \code{N_MARKERS} site that differ between the phenotypic
#'                                         groups defined in data preparation or by \code{\link{rnb.sample.groups}}. Those are
#'                                         selected by employing limma on the methylation matrix.
#'                                  \item{"\code{houseman2012}"} The 50k sites reported as cell-type specific in the Houseman's reference-
#'                                         based deconvolution. See Houseman et.al. 2012.
#'                                  \item{"\code{houseman2014}"} Selects the sites said to be linked to cell type composition by \code{RefFreeEWAS},
#'                                         which is similar to surrogate variable analysis. See Houseman et.al. 2014.
#'                                  \item{"\code{jaffe2014}"} The sites stated as related to cell-type composition Jaffe et.al. 2014.
#'                                  \item{"\code{rowFstat}"} Markers are selected as those found to be associated to the reference cell
#'                                         types with F-statistics. If this option is selected, \code{REF_DATA_SET} and \code{REF_PHENO_COLUMN}
#'                                         need to be specified.
#'                                  \item{"\code{random}"} Sites are randomly selected.
#'                                  \item{"\code{pca}"} Sites are selected as those with most influence on the principal components.
#'                                  \item{"\code{var}"} Selects the most variable sites.
#'                                  \item{"\code{hybrid}"} Selects (N_MARKERS/2) most variable and (N_MARKERS/2) random sites.
#'                                  \item{"\code{range}"} Selects the sites with the largest difference between minimum and maximum
#'                                       across samples.
#'                                  \item{"\code{pcadapt}"} Uses principal component analysis as implemented in the \code{"bigstats"}
#'                                       R package to determine sites that are significantly linked to the potential cell types. This
#'                                       requires specifying K a priori (argument \code{K.prior}). We thank Florian Prive and Sophie
#'                                       Achard for providing the idea and parts of the codes.
#'                                  \item{"\code{edec_stage0}} Employs EDec's stage 0 to infer cell-type specific markers. By default
#'                                       EDec's example reference data is provided. If a specific data set is to be provided, it needs
#'                                       to be done through \code{REF_DATA_SET}.
#'                                  \item{"\code{custom}"} Specifying a custom file with indices.
#'                         }
#' @param n.markers The number of sites to be selected. Defaults to 5000.
#' @param remove.correlated Flag indicating if highly correlated features are to be removed.
#' @param cor.threshold Numeric indicating a threshold above which sites are not to be considered in the feature selection.
#'          If \code{"quantile"}, sites correlated higher than the 95th quantile are removed.
#' @param write.files Flag indicating if the selected sites are to be stored on disk.
#' @param n.prin.comp Optional argument deteriming the number of prinicipal components used for selecting the most important sites.
#' @param range.diff Optional argument specifying the difference between maximum and minimum required.
#' @param custom.marker.file Optional argument containing a file that specifies the indices used for employing MeDeCom.
#' @param store.heatmaps Flag indicating if a heatmap of the selected input sites is to be create from the input methylation matrix.
#'                       The files are then stored in the 'heatmaps' folder in WD.
#' @param heatmap.sample.col Column name in the phenotypic table of \code{rnb.set}, used for creating a color scheme in the heatmap.
#' @param sample.subset Vector of indices of samples to be included in the analysis. If \code{NULL}, all samples are included.
#' @param k.fixed Columns in the T matrix that should be fixed. If \code{NULL}, no columns are fixed.
#' @param K.prior K determined from visual inspection. Only has an influence, if \code{MARKER_SELECTION="pcadapt"}.
#' @param factorviz.outputs Flag indicating, if outputs should be stored to be compatible with FactorViz for data exploration
#' @param opt.method Optimization method to be used. Either MeDeCom.quadPen or MeDeCom.cppTAfact (default).
#' @param startT Inital matrix for T.
#' @param startA Initial matrix for A.
#' @param folds Integer representing the number of folds used in the analysis.
#' @param cores Integer representing the number of cores to be used in the analysis.
#' @param itermax Maximum number of iterations
#' @param ninit Number if initialtions.
#' @param cluster.submit Flag indicating, if the jobs are to be submitted to a scientific compute cluster (only SGE supported).
#' @param cluster.Rdir Path to an executable version of R.
#' @param cluster.hostlist Regular expression, on which basis hosts are selected in the cluster environment.
#' @param cluster.memlimit the \code{memlimit} resource value of the cluster submission.
#' @param cleanup Flag indicating if temprary files are to be deleted.
#' @return An object of type \code{\link{MeDeComSet}} containing the results of the MeDeCom experiment.
#' @export
#' @author Michael Scherer

start_decomp_pipeline <- function(rnb.set,
                                  Ks,
                                  lambda.grid,
                                  work.dir=getwd(),
                                  factorviz.outputs=F,
                                  analysis.name="Analysis",
                                  sample.selection.col=NA,
                                  sample.selection.grep=NA,
                                  pheno.cols=NA,
                                  id.column=rnb.getOption("identifiers.column"),
                                  normalization="none",
                                  ref.ct.column=NA,
                                  ref.rnb.set=NULL,
                                  ref.rnb.ct.column=NA,
                                  prepare.true.proportions=F,
                                  true.A.token=NA,
                                  houseman.A.token=NA,
                                  estimate.houseman.prop=F,
                                  filter.beads=!is.null(rnb.set@covg.sites),
                                  min.n.beads=3,
                                  filter.intensity=inherits(rnb.set, "RnBeadRawSet"),
                                  min.int.quant = 0.01,
                                  max.int.quant = 0.99, 
                                  filter.na=TRUE,
                                  filter.context=TRUE,
                                  filter.cross.reactive=TRUE,
                                  execute.lump=FALSE,
                                  remove.ICA=FALSE,
                                  conf.fact.ICA=FALSE,
                                  ica.setting=NULL,
                                  filter.snp=TRUE,
                                  filter.somatic=TRUE,
                                  snp.list=NULL,
                                  filter.coverage = hasCovg(rnb.set),
                                  min.coverage=5,
                                  min.covg.quant=0.05,
                                  max.covg.quant=0.95,
                                  marker.selection="var",
                                  n.markers=5000,
                                  remove.correlated=FALSE,
                                  cor.threshold="quantile",
                                  write.files=FALSE,
                                  n.prin.comp=10,
                                  range.diff=0.05,
                                  custom.marker.file="",
                                  store.heatmaps=F,
                                  heatmap.sample.col=NULL,
                                  sample.subset=NULL,
                                  k.fixed=NULL,
                                  K.prior=NULL,
                                  opt.method = "MeDeCom.cppTAfact",
                                  startT=NULL,
                                  startA=NULL,
                                  folds=10,
                                  cores=1,
                                  itermax=1000,
                                  ninit=100,
                                  cluster.submit=FALSE,
                                  cluster.Rdir=NA,
                                  cluster.hostlist="*",
                                  cluster.memlimit="5G",
                                  cleanup=FALSE
                                  ){
  if(inherits(rnb.set,"RnBeadSet")){
    data.prep <- prepare_data(RNB_SET=rnb.set,
                              WORK_DIR=work.dir,
                              analysis.name=analysis.name,
                              SAMPLE_SELECTION_COL=sample.selection.col,
                              SAMPLE_SELECTION_GREP=sample.selection.grep,
                              PHENO_COLUMNS=pheno.cols,
                              ID_COLUMN=id.column,
                              NORMALIZATION=normalization,
                              REF_CT_COLUMN=ref.ct.column,
                              REF_RNB_SET=ref.rnb.set,
                              REF_RNB_CT_COLUMN=ref.rnb.ct.column,
                              PREPARE_TRUE_PROPORTIONS=prepare.true.proportions,
                              TRUE_A_TOKEN=true.A.token,
                              HOUSEMAN_A_TOKEN=houseman.A.token,
                              ESTIMATE_HOUSEMAN_PROP=estimate.houseman.prop,
                              FILTER_BEADS=filter.beads,
                              MIN_N_BEADS=min.n.beads,
                              FILTER_INTENSITY=filter.intensity,
                              MIN_INT_QUANT = min.int.quant,
                              MAX_INT_QUANT = max.int.quant, 
                              FILTER_NA=filter.na,
                              FILTER_CONTEXT=filter.context,
                              FILTER_CROSS_REACTIVE=filter.cross.reactive,
                              execute.lump=execute.lump,
                              FILTER_SNP=filter.snp,
                              FILTER_SOMATIC=filter.somatic,
                              snp.list=snp.list,
                              remove.ICA=remove.ICA,
                              conf.fact.ICA=conf.fact.ICA,
                              ica.setting=ica.setting
    )
  }else if(inherits(rnb.set,"RnBiseqSet")){
    data.prep <- prepare_data_BS(RNB_SET = rnb.set,
                                WORK_DIR = work.dir,
                                analysis.name = analysis.name,
                                SAMPLE_SELECTION_COL = sample.selection.col,
                                SAMPLE_SELECTION_GREP = sample.selection.grep,
                                REF_CT_COLUMN=ref.ct.column,
                                PHENO_COLUMNS=pheno.cols,
                                PREPARE_TRUE_PROPORTIONS=prepare.true.proportions,
                                TRUE_A_TOKEN=true.A.token,
                                HOUSEMAN_A_TOKEN=houseman.A.token,
                                ID_COLUMN=id.column,
                                FILTER_COVERAGE = filter.coverage,
                                MIN_COVERAGE=min.coverage,
                                MIN_COVG_QUANT=min.covg.quant,
                                MAX_COVG_QUANT=max.covg.quant,
                                FILTER_NA=filter.na,
                                FILTER_SNP=filter.snp,
                                snp.list=snp.list,
                                FILTER_SOMATIC=filter.somatic
      
    )
  }
  cg_subsets <- prepare_CG_subsets(rnb.set=data.prep$rnb.set.filtered,
                                     MARKER_SELECTION=marker.selection,
                                     N_MARKERS=n.markers,
                                     REMOVE_CORRELATED = remove.correlated,
                                     COR_THRESHOLD = cor.threshold,
                                     WRITE_FILES=write.files,
                                     WD=work.dir,
                                     REF_DATA_SET=ref.rnb.set,
                                     REF_PHENO_COLUMN=ref.rnb.ct.column,
                                     N_PRIN_COMP=n.prin.comp,
                                     RANGE_DIFF=range.diff,
                                     CUSTOM_MARKER_FILE=custom.marker.file,
                                     store.heatmaps=store.heatmaps,
                                     heatmap.sample.col=heatmap.sample.col,
                                     K.prior = K.prior
  )
  if("RefMeth" %in% names(data.prep)){
    trueT <- data.prep$RefMeth
  }else{
    trueT <- NULL
  }
  if("RefProps" %in% names(data.prep)){
    trueA <- data.prep$RefProps
  }else{
    trueA <- NULL
  }
  medecom.result <- start_medecom_analysis(rnb.set=data.prep$rnb.set.filtered,
                                             WORK_DIR=work.dir,
                                             cg_groups=cg_subsets,
                                             Ks=Ks,
                                             LAMBDA_GRID=lambda.grid,
                                             SAMPLE_SUBSET=sample.subset,
                                             K_FIXED=k.fixed,
                                             WRITE_FILES=write.files,
                                             factorviz.outputs=factorviz.outputs,
                                             opt.method = opt.method,
                                             startT = startT,
                                             startA = startA,
                                             trueT = trueT,
                                             trueA = trueA,
                                             analysis.name=analysis.name,
                                             folds=folds,
                                             cores=cores,
                                             itermax=itermax,
                                             ninit=ninit,
                                             CLUSTER_SUBMIT=cluster.submit,
                                             CLUSTER_RDIR=cluster.Rdir,
                                             CLUSTER_HOSTLIST=cluster.hostlist,
                                             CLUSTER_MEMLIMIT=cluster.memlimit,
                                             CLEANUP=cleanup
  )
  return(medecom.result)
}

#' A small RnBeads object used to run the examples.
#' @name rnb.set.example
#' @docType data
#' @author Michael Scherer
NULL
lutsik/DecompPipeline documentation built on Oct. 13, 2019, 1:51 a.m.