R/resampling.R
In ddgraph: Distinguish direct and indirect interactions with Graphical Modelling

Documented in calculateNCPCRobustnessStats makeNCPCRobustness ncpcResampling

#
# Resampling using jackknife and bootstrap for NCPC
#


#' Estimate the NCPC robustness using either jackknife or bootstrap resampling. 
#'
#' Estimate the robustness of NCPC predictions (i.e. variable types: direct, joint, indirect, no dependence)
#' using resampling. Two type of resampling are available: bootstrap (where the whole dataset is resampled with 
#' replacement), and jackknifing (where 1 or more observation are removed at each resampling step). 
#'
#' NCPC is run for the resampled datasets and statistics is produced about how many times is each variable
#' assigned one of the four types (direct, joint, indirect, no dependence). The final call for each variable is then
#' made according to the following algorithm (#direct is number of times variable is called direct):
#' \enumerate{
#'   \item if #no dependence > #direct+joint+indirect => "no dependence"
#'   \item else if #indirect > #direct+joint => "indirect"
#'   \item else if #joint > #direct => "joint"
#'   \item else "direct"
#' }
#'
#' @title NCPC Robustness from resampling
#' @param obj the DDDataSet object
#' @param method the method to use to estimate how robust is the feature selection 
#'        (valid values: "jackknife", or "bootstrap").
#' @param method.param the parameter to method, either number of data points to remove 
#'        for "jackknife" (default: 1) or number of boostrap runs for "bootstrap" (default: 100).
#' @param verbose if to print out the progress
#' @param ... other parameters to pass to ncpc()
#'
#' @return NCPCRobustness object with the raw results from resampling and summarized results
#' @export
#' @examples
#' \dontrun{
#' # load the example data
#' data(mesoBin)
#' 
#' # run bootstrap resampling for NCPC with alpha=0.05
#' ncpcResampling(mesoBin$VM_SM, "bootstrap", 100, alpha=0.05)
#' # run bootstrap resampling for NCPC* with alpha=0.05
#' ncpcResampling(mesoBin$VM_SM, "bootstrap", 100, alpha=0.05, star=TRUE)
#' 
#' # run jackknifing for NCPC
#' ncpcResampling(mesoBin$VM_SM, "jackknife", 1, alpha=0.05)
#' }
ncpcResampling = function(obj, method="bootstrap", method.param=NULL, verbose=TRUE, ...){
	# consistency checking
	if( !(method %in% c("bootstrap", "jackknife")) ){
		stop(paste("Unknown method \"", method, "\" selected, valid values are: jackknife, bootstrap", sep=""))
	}
	
	if(!is.null(method.param) && !is.numeric(method.param))
		stop("Error: parameter \"method.param\" needs to be either NULL or a numeric value")
	
	# record the input parameters for this function
	params = list()
	for(p in names(formals())){
		if( p == "" | p == "..." | p == "obj")
			next
		
		params[[p]] = eval(parse(text=p))
	}
	
	# raw dataframe with the variables
	d = rawData(obj)
	nm = names(d)
	
	# find out the method
	if(method == "bootstrap"){
		if( is.null(method.param) )
			method.param = 100
		runs = method.param
		# make the selection matrix for bootstrap
		sel.matrix = matrix(sample(nrow(d), nrow(d)*runs, replace=T), nrow=runs)
		
	} else if(method == "jackknife"){
		if( is.null(method.param) )
			method.param = 1
		if( nrow(d) %% method.param != 0)
			stop("The number of jackknife data points to remove need to be an exact multiple to number of data points")
		runs = nrow(d) / method.param
		# the selection matrix for jackknife
		sel.matrix = - matrix(sample(nrow(d), nrow(d), replace=F), ncol=method.param)
		
	}	
	
	## the results list
	res = list()
	
	# do the jackknife/boostrap runs
	for(i in 1:runs){
		if(verbose)
			message("\n\nRun ", i, " / ", runs)
		
		sel = sel.matrix[i,]			
		nd = makeDDDataSet(d[sel,], name=paste(datasetName(obj), method, "run", i))
		
		# consistency checking, expect certain number of rows of data..
		if(method == "bootstrap")
			stopifnot(nrow(rawData(nd)) == nrow(d))
		if(method == "jackknife")
			stopifnot(nrow(rawData(nd)) == (nrow(d)-method.param))
			
		# make DDGraphs!
		eg = ncpc(nd, verbose=verbose, ...)
		
		if(i == 1){
			# append the parameter sets
			params = c(params, eg$params)
		}
		
		# extract the direct/joint/indirect etc		
		if( is.null(eg) )
			res[[i]] = list("direct"=NULL, "joint"=NULL, "indirect"=NULL, "conditional"=NULL,
				"ps"=NULL, "snp"=NULL)
		else{
			if( length(eg$direct) == 0 )
				snp = eg$joint
			else
				snp = eg$direct
				
			res[[i]] = list("direct"=nm[eg$direct], "joint"=nm[eg$joint], "indirect"=nm[eg$indirect],
				"conditional"=nm[eg$conditional], "directAndJoint"=nm[eg$directAndJoint], "jointIfNotDirect"=nm[snp])
				
			if(dataType(nd) == "binary")
				res[[i]][["positiveClassSize"]] = sum(nd$class)
		}
				
	}
	
	# make the resulting object
	calculateNCPCRobustnessStats(  makeNCPCRobustness(obj, res, params) )
	
}

#' Make a new NCPCRobustness object 
#'
#' Make a new NCPCRobustness object just with the raw resampling data and parameters used to generate them.
#' Should never directly use this function, but only via \code{DDDataSet::NCPCRobustness()}.
#'
#' @param dataset the DDDataSet object
#' @param raw the list of raw resampling classification of variables (direct, joint, etc..)
#' @param params the parameters used to generate the data (only the non-default one are listed)
#'
#' @return a new NCPCRobustness object
makeNCPCRobustness = function(dataset, raw, params){
	new("NCPCRobustness", dataset=dataset, raw=raw, params=params, tables=list(), runs=length(raw),
		enriched.pss = data.frame(), enriched.ps=data.frame(), not.enriched=data.frame(), final.calls=data.frame())	
}

#' Calculate NCPCRobustness statistics 
#'
#' Calculate the statistics for the NCPCRobustness object - this is separate from object 
#' construction for convenience of testing, should always be called after object creation. 
#' Never use directly (except for testing), use instead via \code{DDDataSet::NCPCRobustness()}.
#'
#' @param obj NCPCRobustness object
#' @return the modified NCPCRobustness object with the statistics calculated
calculateNCPCRobustnessStats = function(obj){	
	# get the raw results
	res = obj@raw
	
	# make the sorted tables
	for(name in c("direct", "joint", "indirect", "directAndJoint", "jointIfNotDirect", "conditional")){
		obj@tables[[name]] = sort(table(unlist(lapply(res, function(x) x[[name]]))), decreasing=TRUE)
	}
	obj@tables[["positiveClassSize"]] = table(unlist(lapply(res, function(x) x[["positiveClassSize"]])))
	
	var.names = variableNames(obj@dataset)
	
	# extract the counts in consistent ordering
	direct = rep(0, length(var.names))
	indirect = rep(0, length(var.names))
	joint = rep(0, length(var.names))
	conditional = rep(0, length(var.names))
	
	direct[match( names(obj@tables$direct), var.names)] = obj@tables$direct / obj@runs
	joint[match( names(obj@tables$joint), var.names)] = obj@tables$joint / obj@runs
	indirect[match( names(obj@tables$indirect), var.names)] = obj@tables$indirect / obj@runs
	conditional[match( names(obj@tables$conditional), var.names)] = obj@tables$conditional / obj@runs		
	not.enriched = round(1 - direct - indirect - joint - conditional, 10) # weed out any numerical error with round()

	# split variables into enriched or not	
	freq.enriched = cbind(direct+joint+indirect+conditional, not.enriched)
	
	enrich.calls = apply(freq.enriched, 1, which.max)
	enrich.entropy = apply(freq.enriched, 1, entropyFromFreq)
	enrich.fc = apply(freq.enriched, 1, foldChangeFromFreq)	
	# selectors for enriched and non-enriched variables
	sel.enrich = (enrich.calls == 1)
	sel.not.enrich = (enrich.calls == 2)
	
	stopifnot( all(sel.enrich | sel.not.enrich) ) # consistency check
	
	# put together all the data and select the no dependence ones
	enrich.all = data.frame(freq.enriched, enrich.fc, enrich.entropy)
	names(enrich.all) = c("informative", "no dependence", "fold difference", "uncertainty")
	rownames(enrich.all) = var.names
	obj@not.enriched = enrich.all[sel.not.enrich,]
	
	# split the enriched ones by direct/indirect/joint
	freq.pss = cbind(direct, joint, indirect)
	pss.entropy = apply(freq.pss, 1, entropyFromFreq)
	pss.calls = c("direct", "joint", "indirect")[apply(freq.pss, 1, which.max)]
	
	freq.ps = cbind(direct+joint, indirect)
	ps.entropy = apply(freq.ps, 1, entropyFromFreq) 
	ps.fc = apply(freq.ps, 1, foldChangeFromFreq)
	ps.calls = c("direct and joint", "indirect")[apply(freq.ps, 1, which.max)]
	
	enriched.pss = data.frame(freq.pss, pss.calls, pss.entropy, enrich.fc, enrich.entropy)
	enriched.ps = data.frame(freq.ps, ps.calls, ps.fc, ps.entropy, enrich.fc, enrich.entropy)
	
	names(enriched.pss) = c("direct", "joint", "indirect", "final call", "p/s/s uncertainty", 
		"enrichment fold difference", "enrichment uncertainty")
	names(enriched.ps) = c("direct and joint", "indirect", "final call", "ps/s fold difference", 
		"ps/s uncertainty", "enrichment fold difference", "enrichment uncertainty")
	rownames(enriched.pss) = var.names
	rownames(enriched.ps) = var.names
	
	obj@enriched.pss = enriched.pss[sel.enrich,]
	obj@enriched.ps = enriched.ps[sel.enrich,]
	
	# order by entropy
	obj@not.enriched = obj@not.enriched[ order(obj@not.enriched[,4]), ]
	obj@enriched.pss = obj@enriched.pss[ order(obj@enriched.pss[,5]), ]
	obj@enriched.ps = obj@enriched.ps[ order(obj@enriched.ps[,5]), ]	
	
	# make the final calls according to following algorithm:
	# 1) if not.enriched > enriched => not.enriched
	# 2) else if indirect > direct+joint => indirect
	# 3) else if joint > direct => joint
	# 4) else direct
	
	final.calls = data.frame("name"=var.names, "type"="", "probability"=0, stringsAsFactors=F)
	
	for(i in 1:nrow(final.calls)){
		if( not.enriched[i] > (1-not.enriched[i]) ){
			final.calls$type[i] = "no dependence"
			final.calls$probability[i] = not.enriched[i]
		} else if( conditional[i] > direct[i] + indirect[i] + joint[i] ){
			final.calls$type[i] = "conditional"
			final.calls$probability[i] = conditional[i]
		} else if( indirect[i] > direct[i] + joint[i] ){
			final.calls$type[i] = "indirect"
			final.calls$probability[i] = indirect[i]
		} else if( joint[i] > direct[i] ){
			final.calls$type[i] = "joint"
			final.calls$probability[i] = joint[i]
		} else {
			final.calls$type[i] = "direct"
			final.calls$probability[i] = direct[i]
		}
	}
	
	obj@final.calls = final.calls
		
	obj
}

Any scripts or data that you put into this service are public.

ddgraph documentation built on Nov. 17, 2017, 10:50 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

ddgraph
Distinguish direct and indirect interactions with Graphical Modelling

R/resampling.R
In ddgraph: Distinguish direct and indirect interactions with Graphical Modelling

Defines functions ncpcResampling makeNCPCRobustness calculateNCPCRobustnessStats

Documented in calculateNCPCRobustnessStats makeNCPCRobustness ncpcResampling

Try the ddgraph package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ddgraph Distinguish direct and indirect interactions with Graphical Modelling

R/resampling.R In ddgraph: Distinguish direct and indirect interactions with Graphical Modelling

Defines functions ncpcResampling makeNCPCRobustness calculateNCPCRobustnessStats

Documented in calculateNCPCRobustnessStats makeNCPCRobustness ncpcResampling

Try the ddgraph package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ddgraph
Distinguish direct and indirect interactions with Graphical Modelling

R/resampling.R
In ddgraph: Distinguish direct and indirect interactions with Graphical Modelling