R/start_analysis.R

Defines functions start.decomp.pipeline start.medecom.analysis start.refreeewas.analysis start.edec.analysis start.analysis

Documented in start.analysis start.decomp.pipeline start.edec.analysis start.medecom.analysis start.refreeewas.analysis

#' start.analysis
#' 
#' Wrapper function to start one of the deconvolution algorithms \code{MeDeCom}, \code{RefFreeEWAS} or \code{EDec}
#' 
#' @param meth.data A \code{matrix} or \code{data.frame} containing methylation information. If NULL, methylation information needs to be provided
#'                   through \code{rnb.set}
#' @param rnb.set An object of type \code{\link{RnBSet-class}} containing methylation and sample meta information.
#' @param work.dir Working directory for the analysis.
#' @param cg.groups List of CpG indices used for the analysis. Can be computed by \code{\link{prepare.CG.subsets}}.
#' @param Ks Vector of integers used as components in MeDeCom.
#' @param factorviz.outputs Flag indicating, if outputs should be stored to be compatible with FactorViz for data exploration
#' @param method The method to be used for deconvolution. Can be one of \code{MeDeCom}, \code{RefFreeCellMix} or \code{EDec}.
#' @param ... Further parameters passed to \code{\link{start.medecom.analysis}}
#' @author Michael Scherer
#' @export
start.analysis <- function(meth.data=NULL,
                           rnb.set=NULL,
                           cg.groups,
                           Ks,
                           work.dir,
                           factorviz.outputs=FALSE,
                           method="MeDeCom",
                           ...){
  all.methods <- c("MeDeCom","RefFreeCellMix","EDec")
  if(!method %in% all.methods){
    stop(paste0("Invalid value for method. Needs to be one of ",all.methods))
  }
  if(method == "MeDeCom"){
    md.res <- start.medecom.analysis(meth.data = meth.data,
                                     rnb.set = rnb.set,
                                     cg.groups = cg.groups,
                                     Ks = Ks,
                                     work.dir = work.dir,
                                     factorviz.outputs = factorviz.outputs,
                                     ...)
  }else if(method == "RefFreeCellMix"){
    md.res <- start.refreeewas.analysis(meth.data=meth.data,
                                                    rnb.set=rnb.set,
                                                    cg.groups=cg.groups,
                                                    Ks=Ks,
                                                    work.dir=work.dir,
                                                    factorviz.outputs=factorviz.outputs)
  }else if(method == "EDec"){
    md.res <- start.edec.analysis(meth.data=meth.data,
                                  rnb.set=rnb.set,
                                  cg.groups=cg.groups,
                                  Ks=Ks,
                                  work.dir = work.dir,
                                  factorviz.outputs = factorviz.outputs)
  }
  return(md.res)
}

#' start.edec.analysis
#' 
#' This function executes EDec for the specified CpGs and the number of cell types K.
#' 
#' @param meth.data A \code{matrix} or \code{data.frame} containing methylation information. If NULL, methylation information needs to be provided
#'                   through \code{rnb.set}
#' @param rnb.set An object of type \code{\link{RnBSet-class}} containing methylation and sample meta information.
#' @param cg.groups List of CpG indices used for the analysis. Can be computed by \code{\link{prepare.CG.subsets}}.
#' @param Ks The number of cell types to be tested. Can be a single numeric value or an array of numbers.
#' @param work.dir The working directory to be used.
#' @param factorviz.outputs Flag indicating, if outputs should be stored to be compatible with FactorViz for data exploration
#' @return An object of type \code{\link{MeDeComSet}} containing the results of the EDec experiment.
#' @author Michael Scherer
#' @export
start.edec.analysis <- function(meth.data=NULL,
                                rnb.set=NULL,
                                cg.groups,
                                Ks,
                                work.dir=getwd(),
                                factorviz.outputs=FALSE){
  if(!requireNamespace('EDec')){
    stop("Missing required package 'EDec'. Please install it.")
  }
  if(is.null(meth.data) && is.null(rnb.set)){
    logger.error("No input methylation data provided")
  }
  if(is.null(meth.data)){
    if(inherits(rnb.set,"RnBSet")){
      meth.data <- meth(rnb.set,row.names=T)
    }else{
      logger.error("Invalid value for rnb.set")
    }
  }else if(!(grepl("cg",row.names(meth.data)))){
    stop("Rownames of methylation data need to be provided for EDec")
  }
  T.all <- list()
  A.all <- list()
  rss.all <- list()
  res.all <- list()
  for(i.group in 1:length(cg.groups)){
    logger.start(paste("Processing group:",i.group))
    group <- cg.groups[[i.group]]
    group <- row.names(meth.data)[group]
    rss.vec <- c()
    T.all <- list()
    A.all <- list()
    for(j.K in 1:length(Ks)){
      K <- Ks[j.K]
      logger.start(paste("Processing K:",K))
      edec.res <- EDec::run_edec_stage_1(meth.data,
                                   informative_loci = group,
                                   num_cell_types = K)
      rss.vec <- c(rss.vec,edec.res$res.sum.squares)
      T.all[[j.K]] <- edec.res$methylation
      A.all[[j.K]] <- t(edec.res$proportions)
      logger.completed()
    }
    rss.all[[i.group]] <- rss.vec
    res.all[[i.group]] <- list(T=T.all,A=A.all)
    logger.completed()
  }
  result <- as.MeDeComSet(res.all,cg_subsets=1:length(cg.groups),Ks=Ks,rss=rss.all,m.orig=nrow(meth.data),n.orig=ncol(meth.data))
  result@parameters$GROUP_LISTS <- cg.groups
  if(factorviz.outputs){
    store.path <- file.path(work.dir,"FactorViz_outputs")
    if(!file.exists(store.path)){
      dir.create(store.path)
    }
    if(!is.null(rnb.set)){
      result@parameters$ASSEMBLY <- assembly(rnb.set)
      ann.C <- annotation(rnb.set)
      ann.S <- pheno(rnb.set)
      save(ann.C,file=file.path(store.path,"ann_C.RData"))
      save(ann.S,file=file.path(store.path,"ann_S.RData"))
    }
    medecom.set <- result
    save(medecom.set,file=file.path(store.path,"medecom_set.RData"))
    save(meth.data,file=file.path(store.path,"meth_data.RData"))
  }
  return(result)
}

#' start.refreeewas.analysis
#' 
#' This function executes RefFreeCellMix for the specified CpGs and the number of cell types K.
#' 
#' @param meth.data A \code{matrix} or \code{data.frame} containing methylation information. If NULL, methylation information needs to be provided
#'                   through \code{rnb.set}
#' @param rnb.set An object of type \code{\link{RnBSet-class}} containing methylation and sample meta information.
#' @param cg.groups List of CpG indices used for the analysis. Can be computed by \code{\link{prepare.CG.subsets}}.
#' @param Ks The number of cell types to be tested. Can be a single numeric value or an array of numbers.
#' @param work.dir The working directory to be used.
#' @param factorviz.outputs Flag indicating, if outputs should be stored to be compatible with FactorViz for data exploration
#' @return An object of type \code{\link{MeDeComSet}} containing the results of the RefFreeCellMix experiment.
#' @author Michael Scherer
#' @export
start.refreeewas.analysis <- function(meth.data=NULL,
                                      rnb.set=NULL,
                                      cg.groups,
                                      Ks,
                                      work.dir=getwd(),
                                      factorviz.outputs=FALSE){
   stop('RefFreeEWAS is currently not available from CRAN!')
#  if(!requireNamespace("RefFreeEWAS")){
#    stop("Please install ReFreeEWAS from 'https://cran.r-project.org/web/packages/RefFreeEWAS/index.html'")
#  }else{
#    if(is.null(meth.data) && is.null(rnb.set)){
#      logger.error("No input methylation data provided")
#    }
#    if(is.null(meth.data)){
#      if(inherits(rnb.set,"RnBSet")){
#        meth.data <- meth(rnb.set)
#      }else{
#        logger.error("Invalid value for rnb.set")
#      }
#    }
#    res.all <- list()
#    devis.all <- list()
#    for(i.group in 1:length(cg.groups)){
#      logger.start(paste("Processing group:",i.group))
#      group <- cg.groups[[i.group]]
#      meth.sset <- meth.data[group,]
#      res.sset <- RefFreeCellMixArray(meth.sset,Klist=Ks)
#      devis <- tryCatch(RefFreeCellMixArrayDevianceBoots(res.sset,Y=meth.sset),error=function(e)e)
#      if(inherits(devis,"error")){
#        devis <- rep(NA,length(Ks))
#      }else{
#        devis <- colMeans(devis)
#      }
#      devis.all[[i.group]] <- devis
#      res.all[[i.group]] <- res.sset
#      logger.completed()
#    }
#    result <- as.MeDeComSet(res.all,cg_subsets=1:length(cg.groups),Ks=Ks,deviances=devis.all,m.orig=nrow(meth.data),n.orig=ncol(meth.data))
#    result@parameters$GROUP_LISTS <- cg.groups
#    if(factorviz.outputs){
#      store.path <- file.path(work.dir,"FactorViz_outputs")
#      if(!file.exists(store.path)){
#        dir.create(store.path)
#      }
#      if(!is.null(rnb.set)){
#        result@parameters$ASSEMBLY <- assembly(rnb.set)
#        ann.C <- annotation(rnb.set)
#        ann.S <- pheno(rnb.set)
#        save(ann.C,file=file.path(store.path,"ann_C.RData"))
#        save(ann.S,file=file.path(store.path,"ann_S.RData"))
#      }
#      medecom.set <- result
#      save(medecom.set,file=file.path(store.path,"medecom_set.RData"))
#      save(meth.data,file=file.path(store.path,"meth_data.RData"))
#    }
#    return(result)
#  }  
}

#' start.medecom.analysis
#' 
#' Wrapper for runMeDeCom, for data preprocessed through the DecompPipeline
#' 
#' @param meth.data A \code{matrix} or \code{data.frame} containing methylation information. If NULL, methylation information needs to be provided
#'                   through \code{rnb.set}
#' @param rnb.set An object of type \code{\link{RnBSet-class}} containing methylation and sample meta information.
#' @param work.dir Working directory for the analysis.
#' @param cg.groups List of CpG indices used for the analysis. Can be computed by \code{\link{prepare.CG.subsets}}.
#' @param Ks Vector of integers used as components in MeDeCom.
#' @param lambda.grid Vector of doubles representing the regularization parameter in MeDeCom.
#' @param sample.subset Vector of indices of samples to be included in the analysis. If \code{NULL}, all samples are included.
#' @param k.fixed Columns in the T matrix that should be fixed. If \code{NULL}, no columns are fixed.
#' @param write.files Flag indicating if intermediate results are to be stored.
#' @param factorviz.outputs Flag indicating, if outputs should be stored to be compatible with FactorViz for data exploration
#' @param opt.method Optimization method to be used. Either MeDeCom.quadPen or MeDeCom.cppTAfact (default).
#' @param startT Inital matrix for T.
#' @param startA Initial matrix for A.
#' @param trueT True value for the T matrix.
#' @param trueA True value for the A matrix.
#' @param analysis.name Name of the analysis.
#' @param folds Integer representing the number of folds used in the analysis.
#' @param cores Integer representing the number of cores to be used in the analysis.
#' @param itermax Maximum number of iterations
#' @param ninit Number if initialtions.
#' @param cluster.submit Flag indicating, if the jobs are to be submitted to a scientific compute cluster (only SGE supported).
#' @param cluster.R.dir Path to an executable version of R.
#' @param cluster.hostlist Regular expression, on which basis hosts are selected in the cluster environment.
#' @param cluster.memlimit the \code{memlimit} resource value of the cluster submission.
#' @param cleanup Flag indicating if temprary files are to be deleted.
#' @param analysis.info Information to be saved about the analysis. Just stored as info.
#' @param lambda.grid.type String represent the lambda grid that was chosen. Just stored as info.
#' @param analysis.token String specifying the type of analysis that was conducted. Just stored as info.
#' @return An object of type \code{\link{MeDeComSet}} containing the results of the MeDeCom experiment.
#' @export
#' @author Pavlo Lutsik, Michael Scherer
#' @import MeDeCom
start.medecom.analysis<-function(
    meth.data=NULL,
		rnb.set=NULL,
		work.dir=getwd(),
		cg.groups,
		Ks,
		lambda.grid,
		sample.subset=NULL,
		k.fixed=NULL,
		write.files=TRUE,
		factorviz.outputs=F,
		opt.method = "MeDeCom.cppTAfact",
		startT=NULL,
		startA=NULL,
		trueT = NULL,
		trueA = NULL,
		analysis.name="MeDeComRun",
		folds=10,
		cores=1,
		itermax=1000,
		ninit=100,
		cluster.submit=FALSE,
		cluster.R.dir=NA,
		cluster.hostlist="*",
		cluster.memlimit="5G",
		cleanup=FALSE,
		analysis.info=NULL,
		lambda.grid.type="standard",
		analysis.token="customAnalysis"
){

  
  if(!is.null(analysis.info)){
    ANALYSIS_ID<-paste(
      analysis.info$DATASET, 
      analysis.info$DATA_SUBSET,
      analysis.info$NORMALIZATION,
      analysis.info$QUALITY_FILTERING, 
      analysis.info$MARKER_SELECTION,
      lambda.grid.type,
      analysis.token,
      sep="_")
  }else{
    ANALYSIS_ID<-"customAnalysis"
    analysis.info<-list()
  }
  analysis.info$ANALYSIS<-ANALYSIS_ID
  
  work.dir <- file.path(work.dir,analysis.name)
  if(!file.exists(work.dir)){
    dir.create(work.dir)
  }
  log.file <- file.path(work.dir,"analysis.log")
  if(!file.exists(log.file)){
    if(logger.isinitialized()){
      logger.close()
      logger.start(fname=log.file)
    }else{
      logger.start(fname=log.file)
    }
  }
	
	if(is.na(cluster.R.dir)){
		cluster.R.dir="/usr/bin/"
	}
			
  if(is.null(meth.data)){
  	if(is.null(rnb.set)){
  		load(sprintf("%s/data.set.RData", work.dir))
  	}else{
  		meth.data<-meth(rnb.set)
  	}
  }
		
	if(write.files){
		saveRDS(lambda.grid, file=sprintf("%s/lambda.grid.RDS", work.dir))
		if(!is.null(sample.subset)){
			saveRDS(sample.subset, file=sprintf("%s/sample.subset.RDS", work.dir))
		}else{
			sample.subset<-1:ncol(meth.data)
		}
	}
	
	groups<-1:length(cg.groups)
	
	if(!is.null(k.fixed)){
		saveRDS(k.fixed, file=sprintf("%s/fixed_T_cols.RDS", work.dir))
	}else{
		k.fixed<-NULL
	}
		
	if(!is.null(trueT)){
	  if(is.character(trueT)){
	    if(!file.exists(trueT)){
	      logger.error(paste("File for trueT",trueT,"does not exist."))
	    }else{
		    load(trueT)
	    }
	  }else if(!is.matrix(trueT)){
	    logger.error("Invalid value for trueT")
	  }
	}
	
	if(!is.null(trueA)){
	  if(is.character(trueA)){
	    if(!file.exists(trueA)){
	      logger.error(paste("File for trueA",trueA,"does not exist."))
	    }else{
	      load(trueA)
	    }
	  }else if(!is.matrix(trueA)){
	    logger.error("Invalid value for trueA")
	  }
	}
	if(cluster.submit){
		cluster.settings=list(R_bin_dir=cluster.R.dir, host_pattern=cluster.hostlist, mem_limit=cluster.memlimit)
	}else{
		cluster.settings=NULL
	}
	
	result<-runMeDeCom(data=meth.data, 
			Ks=Ks,
			lambdas=lambda.grid,
			opt.method=opt.method,
			cg_subsets=cg.groups,
			sample_subset=sample.subset,
			startT=startT,
			startA=startA,
			trueT=trueT,
			trueA=trueA,
			fixed_T_cols=k.fixed,
			NINIT=ninit, 
			ITERMAX=itermax, 
			NFOLDS=folds,
			N_COMP_LAMBDA=4,
			NCORES=cores,
			analysis.name=analysis.name,
			use.ff=FALSE,
			cluster.settings=cluster.settings,
			temp.dir=work.dir,
			cleanup=cleanup,
			verbosity=1L,
			time.stamps=TRUE
	)
	
	result@parameters$ANALYSIS <- analysis.name
	result@parameters$GROUP_LISTS <- cg.groups
	result@parameters$ITERMAX<-itermax
	result@parameters$NFOLDS<-folds
	result@parameters$NINIT<-ninit
	
	analysis.info$GROUP_LISTS<-cg.groups
	analysis.info$ANALYSIS_DATE<-date()
	result@dataset_info<-c(result@dataset_info, analysis.info)
	
	if(write.files){
		saveRDS(result, file=file.path(work.dir, "collected.result.RDS"))
	}
	
	if(factorviz.outputs){
	  store.path <- file.path(work.dir,"FactorViz_outputs")
	  if(!file.exists(store.path)){
	    dir.create(store.path)
	  }
	  if(!is.null(rnb.set)){
	    result@parameters$ASSEMBLY <- assembly(rnb.set)
	    ann.C <- annotation(rnb.set)
	    ann.S <- pheno(rnb.set)
	    save(ann.C,file=file.path(store.path,"ann_C.RData"))
	    save(ann.S,file=file.path(store.path,"ann_S.RData"))
	  }
	  medecom.set <- result
	  save(medecom.set,file=file.path(store.path,"medecom_set.RData"))
	  save(meth.data,file=file.path(store.path,"meth_data.RData"))
	  if(!is.null(trueT)){
	    ref.meth <- trueT
	    save(ref.meth,file=file.path(store.path,"ref_meth.RData"))
	  }
	  if(!is.null(trueA)){
	    ref.props <- trueA
	    save(ref.props,file=file.path(store.path,"ref_props.RData"))
	  }
	}
	return(result)
}

#' start.decomp.pipeline
#' 
#' Main workhorse of the DecompPipeline R-package. Performs preprocessing (\code{\link{prepare.data}} or \code{\link{prepare.data.BS}}),
#' CpG subset selection (\code{\link{prepare.CG.subsets}}) and deconvolution (\code{\link{start.medecom.analysis}}, \code{\link{start.refreeewas.analysis}}, \code{\link{start.edec.analysis}})
#' 
#' @param rnb.set An object of type \code{\link[RnBeads]{RnBSet-class}} for which analysis is to be performed.
#' @param Ks Vector of integers used as components in MeDeCom.
#' @param lambda.grid Vector of doubles representing the regularization parameter in MeDeCom.
#' @param work.dir A path to a existing directory, in which the results are to be stored
#' @param analysis.name A string representing the dataset for which analysis is to be performed. Only used to create a folder with a 
#'                 descriptive name of the analysis.
#' @param sample.selection.col A column name in the phenotypic table of \code{rnb.set} used to selected a subset of samples for
#'                 analysis that contain the string given in \code{sample.selection.col}.
#' @param sample.selection.grep A string used for selecting samples in the column \code{sample.selection.col}.
#' @param pheno.cols Vector of column names in the phenotypic table of \code{rnb.set} that is kept and exported for further 
#'                 exploration.
#' @param id.column Sample-specific ID column name in \code{rnb.set}
#' @param normalization normalization method to be performed before employing MeDeCom. Can be one of \code{"none","dasen","illumina","noob"} (BeadChip only).
#' @param ref.ct.column Column name in \code{rnb.set} used to extract methylation information on the reference cell types.
#' @param ref.rnb.set An object of type \code{\link[RnBeads]{RnBSet-class}} containing methylation information on reference cell types (BeadChip only).
#' @param ref.rnb.ct.column Column name in \code{ref.rnb.set} used to extract methylation information on the reference cell types (BeadChip only).
#' @param prepare.true.proportions Flag indicating if true proportions are either available in \code{rnb.set} or to be estimated 
#'                          with Houseman's reference-based deconvolution approach (BeadChip only).
#' @param true.A.token String present in the column names of \code{rnb.set} used for selecting the true proportions of the corresponding
#'                      cell types.
#' @param houseman.A.token Similar to \code{true.A.token}, but not containing the true proportions, rather the estimated proportions
#'                      by Houseman's method (BeadChip only).
#' @param estimate.houseman.prop If neither \code{true.A.token} nor \code{houseman.A.token} are given, the proportions of the reference
#'                      cell type are estimated with Houseman's approach (BeadChip only).
#' @param filter.beads Flag indicating, if site-filtering based on the number of beads available is to be conducted (BeadChip only).
#' @param min.n.beads Minimum number of beads required in each sample for the site to be considered for adding to MeDeCom (BeadChip only).
#' @param filter.intensity  Flag indicating if sites should be removed according to the signal intensities (the lowest and highest quantiles
#'                      given by \code{min.int.quant} and \code{max.int.quant}) (BeadChip only).
#' @param min.int.quant Lower quantile of intensities which is to be removed (BeadChip only).
#' @param max.int.quant Upper quantile of intensities which is to be removed (BeadChip only).
#' @param filter.na Flag indicating if sites with any missing values are to be removed or not.
#' @param filter.context Flag indicating if only CG probes are to be kept (BeadChip only).
#' @param filter.cross.reactive Flag indicating if sites showing cross reactivity on the array are to be removed.
#' @param execute.lump Flag indicating if the LUMP algorithm is to be used for estimating the amount of immune cells in a particular sample.
#' @param remove.ICA Flag indicating if independent component analysis is to be executed to remove potential confounding factor.
#'             If \code{TRUE},conf.fact.ICA needs to be specified.
#' @param conf.fact.ICA Column name in the sample annotation sheet representing a potential confounding factor.
#' @param ica.setting Optional argument setting up ICA.
#' @param filter.snp Flag indicating if annotated SNPs are to be removed from the list of sites according to RnBeads' SNP list. 
#' @param snp.list Path to a file containing CpG IDs of known SNPs to be removed from the analysis, if \code{filter.snp} is \code{TRUE}.
#' @param filter.sex.chromosomes Flag indicating if only somatic probes are to be kept.
#' CPG FILTERING (BS)
#' @param filter.coverage Flag indicating, if site-filtering based on coverage is to be conducted (BS only).
#' @param min.coverage Minimum number of reads required in each sample for the site to be considered for adding to MeDeCom (BS only).
#' @param min.covg.quant Lower quantile of coverages. Values lower than this value will be ignored for analysis (BS only).
#' @param max.covg.quant Upper quantile of coverages. Values higher than this value will be ignored for analysis (BS only).
#' CG_SUBSET SELECTION
#' @param marker.selection A vector of strings representing marker selection methods. Available method are \itemize{
#'                                  \item{"\code{all}"} Using all sites available in the input.
#'                                  \item{"\code{pheno}"} Selected are the top \code{n.markers} site that differ between the phenotypic
#'                                         groups defined in data preparation or by \code{\link{rnb.sample.groups}}. Those are
#'                                         selected by employing limma on the methylation matrix.
#'                                  \item{"\code{houseman2012}"} The 50k sites reported as cell-type specific in the Houseman's reference-
#'                                         based deconvolution. See Houseman et.al. 2012.
#'                                  \item{"\code{houseman2014}"} Selects the sites said to be linked to cell type composition by \code{RefFreeEWAS},
#'                                         which is similar to surrogate variable analysis. See Houseman et.al. 2014.
#'                                  \item{"\code{jaffe2014}"} The sites stated as related to cell-type composition Jaffe et.al. 2014.
#'                                  \item{"\code{rowFstat}"} Markers are selected as those found to be associated to the reference cell
#'                                         types with F-statistics. If this option is selected, \code{ref.rnb.set} and \code{ref.pheno.column}
#'                                         need to be specified.
#'                                  \item{"\code{random}"} Sites are randomly selected.
#'                                  \item{"\code{pca}"} Sites are selected as those with most influence on the principal components.
#'                                  \item{"\code{var}"} Selects the most variable sites.
#'                                  \item{"\code{hybrid}"} Selects (n.markers/2) most variable and (n.markers/2) random sites.
#'                                  \item{"\code{range}"} Selects the sites with the largest difference between minimum and maximum
#'                                       across samples.
#'                                  \item{"\code{pcadapt}"} Uses principal component analysis as implemented in the \code{"bigstats"}
#'                                       R package to determine sites that are significantly linked to the potential cell types. This
#'                                       requires specifying K a priori (argument \code{K.prior}). We thank Florian Prive and Sophie
#'                                       Achard for providing the idea and parts of the codes.
#'                                  \item{"\code{edec_stage0}} Employs EDec's stage 0 to infer cell-type specific markers. By default
#'                                       EDec's example reference data is provided. If a specific data set is to be provided, it needs
#'                                       to be done through \code{ref.rnb.set}.
#'                                  \item{"\code{custom}"} Specifying a custom file with indices.
#'                         }
#' @param n.markers The number of sites to be selected. Defaults to 5000.
#' @param remove.correlated Flag indicating if highly correlated features are to be removed.
#' @param cor.threshold Numeric indicating a threshold above which sites are not to be considered in the feature selection.
#'          If \code{"quantile"}, sites correlated higher than the 95th quantile are removed.
#' @param write.files Flag indicating if the selected sites are to be stored on disk.
#' @param n.prin.comp Optional argument deteriming the number of prinicipal components used for selecting the most important sites.
#' @param range.diff Optional argument specifying the difference between maximum and minimum required.
#' @param custom.marker.file Optional argument containing a file that specifies the indices used for employing MeDeCom.
#' @param store.heatmaps Flag indicating if a heatmap of the selected input sites is to be create from the input methylation matrix.
#'                       The files are then stored in the 'heatmaps' folder in WD.
#' @param heatmap.sample.col Column name in the phenotypic table of \code{rnb.set}, used for creating a color scheme in the heatmap.
#' @param sample.subset Vector of indices of samples to be included in the analysis. If \code{NULL}, all samples are included.
#' @param k.fixed Columns in the T matrix that should be fixed. If \code{NULL}, no columns are fixed.
#' @param K.prior K determined from visual inspection. Only has an influence, if \code{MARKER_SELECTION="pcadapt"}.
#' @param factorviz.outputs Flag indicating, if outputs should be stored to be compatible with FactorViz for data exploration
#' @param opt.method Optimization method to be used. Either MeDeCom.quadPen or MeDeCom.cppTAfact (default).
#' @param startT Inital matrix for T.
#' @param startA Initial matrix for A.
#' @param folds Integer representing the number of folds used in the analysis.
#' @param cores Integer representing the number of cores to be used in the analysis.
#' @param itermax Maximum number of iterations
#' @param ninit Number if initialtions.
#' @param cluster.submit Flag indicating, if the jobs are to be submitted to a scientific compute cluster (only SGE supported).
#' @param cluster.Rdir Path to an executable version of R.
#' @param cluster.hostlist Regular expression, on which basis hosts are selected in the cluster environment.
#' @param cluster.memlimit the \code{memlimit} resource value of the cluster submission.
#' @param cleanup Flag indicating if temprary files are to be deleted.
#' @return An object of type \code{\link{MeDeComSet}} containing the results of the MeDeCom experiment.
#' @export
#' @author Michael Scherer

start.decomp.pipeline <- function(rnb.set,
                                  Ks,
                                  lambda.grid,
                                  work.dir=getwd(),
                                  factorviz.outputs=F,
                                  analysis.name="Analysis",
                                  sample.selection.col=NA,
                                  sample.selection.grep=NA,
                                  pheno.cols=NA,
                                  id.column=rnb.getOption("identifiers.column"),
                                  normalization="none",
                                  ref.ct.column=NA,
                                  ref.rnb.set=NULL,
                                  ref.rnb.ct.column=NA,
                                  prepare.true.proportions=F,
                                  true.A.token=NA,
                                  houseman.A.token=NA,
                                  estimate.houseman.prop=F,
                                  filter.beads=!is.null(rnb.set@covg.sites),
                                  min.n.beads=3,
                                  filter.intensity=inherits(rnb.set, "RnBeadRawSet"),
                                  min.int.quant = 0.001,
                                  max.int.quant = 0.999, 
                                  filter.na=TRUE,
                                  filter.context=TRUE,
                                  filter.cross.reactive=TRUE,
                                  execute.lump=FALSE,
                                  remove.ICA=FALSE,
                                  conf.fact.ICA=FALSE,
                                  ica.setting=NULL,
                                  filter.snp=TRUE,
                                  filter.sex.chromosomes=TRUE,
                                  snp.list=NULL,
                                  filter.coverage = hasCovg(rnb.set),
                                  min.coverage=5,
                                  min.covg.quant=0.05,
                                  max.covg.quant=0.95,
                                  marker.selection="var",
                                  n.markers=5000,
                                  remove.correlated=FALSE,
                                  cor.threshold="quantile",
                                  write.files=FALSE,
                                  n.prin.comp=10,
                                  range.diff=0.05,
                                  custom.marker.file="",
                                  store.heatmaps=F,
                                  heatmap.sample.col=NULL,
                                  sample.subset=NULL,
                                  k.fixed=NULL,
                                  K.prior=NULL,
                                  opt.method = "MeDeCom.cppTAfact",
                                  startT=NULL,
                                  startA=NULL,
                                  folds=10,
                                  cores=1,
                                  itermax=1000,
                                  ninit=100,
                                  cluster.submit=FALSE,
                                  cluster.Rdir=NA,
                                  cluster.hostlist="*",
                                  cluster.memlimit="5G",
                                  cleanup=FALSE
                                  ){
  if(inherits(rnb.set,"RnBeadSet")){
    data.prep <- prepare.data(rnb.set=rnb.set,
                              work.dir=work.dir,
                              analysis.name=analysis.name,
                              sample.selection.col=sample.selection.col,
                              sample.selection.grep=sample.selection.grep,
                              pheno.columns=pheno.cols,
                              id.column=id.column,
                              normalization=normalization,
                              ref.ct.column=ref.ct.column,
                              ref.rnb.set=ref.rnb.set,
                              ref.rnb.ct.column=ref.rnb.ct.column,
                              prepare.true.proportions=prepare.true.proportions,
                              true.A.token=true.A.token,
                              houseman.A.token=houseman.A.token,
                              estimate.houseman.prop=estimate.houseman.prop,
                              filter.beads=filter.beads,
                              min.n.beads=min.n.beads,
                              filter.intensity=filter.intensity,
                              min.int.quant = min.int.quant,
                              max.int.quant = max.int.quant, 
                              filter.na=filter.na,
                              filter.context=filter.context,
                              filter.cross.reactive=filter.cross.reactive,
                              execute.lump=execute.lump,
                              filter.snp=filter.snp,
                              filter.sex.chromosomes=filter.sex.chromosomes,
                              snp.list=snp.list,
                              remove.ICA=remove.ICA,
                              conf.fact.ICA=conf.fact.ICA,
                              ica.setting=ica.setting
    )
  }else if(inherits(rnb.set,"RnBiseqSet")){
    data.prep <- prepare.data.BS(rnb.set = rnb.set,
                                work.dir = work.dir,
                                analysis.name = analysis.name,
                                sample.selection.col = sample.selection.col,
                                sample.selection.grep = sample.selection.grep,
                                ref.ct.column=ref.ct.column,
                                pheno.columns=pheno.cols,
                                prepare.true.proportions=prepare.true.proportions,
                                true.A.token=true.A.token,
                                houseman.A.token=houseman.A.token,
                                id.column=id.column,
                                filter.coverage = filter.coverage,
                                min.coverage=min.coverage,
                                min.covg.quant=min.covg.quant,
                                max.covg.quant=max.covg.quant,
                                filter.na=filter.na,
                                filter.snp=filter.snp,
                                snp.list=snp.list,
                                filter.sex.chromosomes=filter.sex.chromosomes,
                                execute.lump=execute.lump
      
    )
  }
  cg_subsets <- prepare.CG.subsets(rnb.set=data.prep$rnb.set.filtered,
                                     marker.selection=marker.selection,
                                     n.markers=n.markers,
                                     remove.correlated = remove.correlated,
                                     cor.threshold = cor.threshold,
                                     write.files=write.files,
                                     out.dir=work.dir,
                                     ref.rnb.set=ref.rnb.set,
                                     ref.pheno.column=ref.rnb.ct.column,
                                     n.prin.comp=n.prin.comp,
                                     range.diff=range.diff,
                                     custom.marker.file=custom.marker.file,
                                     store.heatmaps=store.heatmaps,
                                     heatmap.sample.col=heatmap.sample.col,
                                     K.prior = K.prior
  )
  if("RefMeth" %in% names(data.prep)){
    trueT <- data.prep$RefMeth
  }else{
    trueT <- NULL
  }
  if("RefProps" %in% names(data.prep)){
    trueA <- data.prep$RefProps
  }else{
    trueA <- NULL
  }
  medecom.result <- start.medecom.analysis(rnb.set=data.prep$rnb.set.filtered,
                                             work.dir=work.dir,
                                             cg.groups=cg_subsets,
                                             Ks=Ks,
                                             lambda.grid=lambda.grid,
                                             sample.subset=sample.subset,
                                             k.fixed=k.fixed,
                                             write.files=write.files,
                                             factorviz.outputs=factorviz.outputs,
                                             opt.method = opt.method,
                                             startT = startT,
                                             startA = startA,
                                             trueT = trueT,
                                             trueA = trueA,
                                             analysis.name=analysis.name,
                                             folds=folds,
                                             cores=cores,
                                             itermax=itermax,
                                             ninit=ninit,
                                             cluster.submit=cluster.submit,
                                             cluster.R.dir=cluster.Rdir,
                                             cluster.hostlist=cluster.hostlist,
                                             cluster.memlimit=cluster.memlimit,
                                             cleanup=cleanup
  )
  return(medecom.result)
}
CompEpigen/DecompPipeline documentation built on Nov. 3, 2023, 5:35 p.m.