R/netinf.cv.R
In predictionet: Inference for predictive networks designed for (but not limited to) genomic data

### Function inferring a network
## data: matrix of continuous or categorical values (gene expressions for example); observations in rows, features in columns.
## categories: Either a single integer or a vector of integers specifying the number of categories used to discretize each variable (data are then discretized using equal-frequency bins) or a list of cutoffs to use to discretize each of the variables in 'data' matrix. If method='bayesnet', this parameter should be specified by the user.
## perturbations: matrix of {0, 1} specifying whether a gene has been pertubed in some experiments. Dimensions should be the same than data
## priors: matrix of prior information avalable for gene-gene interaction (parents in columns, children in rows). Values may be probabilities or any other weights (citations count for instance)
## priors.count: 'TRUE' if priors specified by the user are number of citations for each interaction, 'FALSE' if probabilities are reported instead
## priors.weight: real value in [0, 1] specifying the weight to put on the priors (0=only the data are used, 1=only the priors are used to infer the topology of the network). If 'priors.weight' is missing it will be optimized gene by hene in an automatic way.
## maxparents: maximum number of parents allowed for each gene
## subset: vector of indices to select only subset of the observations
## method: "bayesnet" for bayesian network inference with the catnet package, "regrnet" for regression-based network inference
## nfold: number of folds for the cross-validation
## causal: 'TRUE' if the causality should be inferred from the data, 'FALSE' otherwise }
## seed: set the seed to make the cross-validation and network inference deterministic
`netinf.cv` <- 
function(data, categories, perturbations, priors, predn, priors.count=TRUE, priors.weight=0.5, maxparents=3, subset, method=c("regrnet", "bayesnet"),ensemble=FALSE, ensemble.maxnsol=3, predmodel=c("linear", "linear.penalized", "cpt"), nfold=10, causal=TRUE, seed, bayesnet.maxcomplexity=0, bayesnet.maxiter=100, verbose=FALSE) {
	if(!missing(seed)) { set.seed(seed) }
	method <- match.arg(method)
	predmodel <- match.arg(predmodel)
	if(missing(perturbations) || is.null(perturbations)) {
		perturbations <- matrix(FALSE, nrow=nrow(data), ncol=ncol(data), dimnames=dimnames(data))
	} else {
		if(nrow(perturbations) == 1) {
			perturbations[1, ] <- as.logical(perturbations[1, ])
		} else { perturbations <- apply(perturbations, 2, as.logical) }
		dimnames(perturbations) <- dimnames(data)
	}	
	if(!missing(subset)) {
		## select subset of the data (observations)
		data <- data[subset, , drop=FALSE]
		perturbations <- perturbations[subset, , drop=FALSE]
	}
	if(missing(predn) || is.null(predn) || length(predn) == 0) { predn <- 1:ncol(data) } else { if(is.character(predn)) { predn <- match(predn, dimnames(data)[[2]]) } else { if(!is.numeric(predn) || !all(predn %in% 1:ncol(data))) { stop("parameter 'predn' should contain either the names or the indices of the variables to predict!")} } }
	nn <- dimnames(data)[[2]][predn] ## variables (genes) to fit during network inference
	if(!missing(categories)) {
		## discretize gene expression data
		if(is.numeric(categories)) {
			if(length(categories) == 1) {
				## use the same number of categories for each variable
				categories <- rep(categories, ncol(data))
				names(categories) <- dimnames(data)[[2]]
			}
			cuts.discr <- lapply(apply(rbind("nbcat"=categories, data), 2, function(x) { y <- x[1]; x <- x[-1]; return(list(quantile(x=x, probs=seq(0, 1, length.out=y+1), na.rm=TRUE)[-c(1, y+1)])) }), function(x) { return(x[[1]]) })
		} else { cuts.discr <- categories }
		## discretize the actual gene expression data
		data.discr <- data.discretize(data=data, cuts=cuts.discr)
		categories <- cuts.discr
	} else {
		data.discr <- data
		categories <- lapply(apply(data, 2, function(x) { return(list(sort(unique(x)))) }), function(x) { return(x[[1]]) })
		names(categories) <- colnames(data)
	}
	
	## infer network from the whole dataset
	mynetglobal <- netinf(data=data, categories=categories, perturbations=perturbations, priors=priors, predn=predn, priors.count=priors.count, priors.weight=priors.weight, maxparents=maxparents, method=method, ensemble=ensemble, causal=causal)
	## and the global topology
	mytopoglobal <- mynetglobal$topology

	## compute folds for cross-validation
	if (nfold == 1) {
		k <- 1
		nfold <- nrow(data)
	} else { k <- floor(nrow(data) / nfold) } 
	## randomize observations to prevent potential bias in the cross-validation
	smpl <- sample(nrow(data))
	
	## perform cross-validation
	mymcc <- mynrmse <- myr2 <- mytopo <- mynets <- NULL
	mytopo2 <- NULL
	edge.relevance.cv <- NULL
	
	for (i in 1:nfold) {
		if(ensemble & verbose) {
			message(paste("fold: ",i," out of ",nfold,"folds"))
		}
		## fold of cross-validation
		if (i == nfold) { s.ix <- smpl[c(((i - 1) * k + 1):nrow(data))] }	else { s.ix <- smpl[c(((i - 1) * k + 1):(i * k))] }
		## s.ix contains the indices of the test set
		
		## infer network from training data and priors

		mynet <- netinf(data=data[-s.ix, , drop=FALSE], categories=categories, perturbations=perturbations[-s.ix, , drop=FALSE], priors=priors, predn=predn, priors.count=priors.count, priors.weight=priors.weight, maxparents=maxparents, method=method, ensemble=ensemble, ensemble.maxnsol=ensemble.maxnsol, causal=causal, bayesnet.maxcomplexity=bayesnet.maxcomplexity, bayesnet.maxiter=bayesnet.maxiter)
		mynet <- net2pred(net=mynet, data=data[-s.ix, , drop=FALSE], categories=categories, predn=predn, perturbations=perturbations[-s.ix, , drop=FALSE], method=predmodel)
		mynets <- c(mynets, list(mynet))
		
		#mynet <- c(mynet,list("topology.coeff"=.regrnet2matrixtopo(net=mynet)))
		mytopo2 <- c(mytopo2, list(mynet$topology))

		## edge relevance score
		edge.relevance.cv <- c(edge.relevance.cv,list(mynet$edge.relevance))
		## compute predictions
		mynet.pred <- netinf.predict(net=mynet, data=data[s.ix, , drop=FALSE], categories=categories, perturbations=perturbations[s.ix, , drop=FALSE], predn=predn, method=predmodel)
		## combine the multiple predictions computed by the ensemble method by averaging
		uu <- !duplicated(colnames(mynet.pred))
		if(sum(uu) < ncol(mynet.pred)) {
			ttt <- matrix(NA, nrow=nrow(data[s.ix, , drop=FALSE]), ncol=ncol(data[s.ix, , drop=FALSE]), dimnames=dimnames(data[s.ix, , drop=FALSE]))
			ttt[ , colnames(mynet.pred)[uu]] <- mynet.pred[ , uu]
			uu <- sort(unique(colnames(mynet.pred)[duplicated(colnames(mynet.pred))]))
			for(u in 1:length(uu)) {
				ttt[ , uu[u]] <- apply(mynet.pred[ , is.element(colnames(mynet.pred), uu[u])], 1, mean, na.rm=TRUE)
			}
			mynet.pred <- ttt
		}
		resnull <- rep(NA, ncol(data))
		names(resnull) <- colnames(data)
		
		## performance estimation: R2
		mynet.r2 <- resnull
		if(predmodel %in% c("linear", "linear.penalized")) {
			mynet.r2 <- pred.score(data=data[s.ix, , drop=FALSE], pred=mynet.pred, method="r2")
		}
			
		## performance estimation: NRMSE
		mynet.nrmse <- resnull
		if(predmodel %in% c("linear", "linear.penalized")) {
			mynet.nrmse <- pred.score(data=data[s.ix, , drop=FALSE], pred=mynet.pred, method="nrmse")
		}
		## performance estimation: MCC
		pred.discr <- mynet.pred
		if(predmodel %in% c("linear", "linear.penalized") && !missing(categories)) {
			## discretize predicted gene expression data 
			pred.discr <- data.discretize(data=mynet.pred, cuts=cuts.discr[dimnames(mynet.pred)[[2]]])
		}
		if(predmodel %in% c("cpt") || (predmodel %in% c("linear", "linear.penalized") && !missing(categories))) { 
			mynet.mcc <- pred.score(data=data.discr[s.ix, dimnames(mynet.pred)[[2]], drop=FALSE], pred=pred.discr, method="mcc")
		} else { 
			mynet.mcc <- rep(NA, length(nn))
			names(mynet.mcc) <- nn
		}
		
		## adjacency matrix
		topol <- mynet$topology
		## save results
		myr2 <- rbind(myr2, mynet.r2)
		mynrmse <- rbind(mynrmse, mynet.nrmse)
		mymcc <- rbind(mymcc, mynet.mcc)
		mytopo <- c(mytopo, list(topol))
	}
	dimnames(myr2)[[1]] <- dimnames(mynrmse)[[1]] <- dimnames(mymcc)[[1]] <- names(mytopo) <- names(mynets) <- names(edge.relevance.cv) <- paste("fold", 1:nfold, sep=".")
	
	## compute stability for each edge
	edgestab <- edgestab2 <- matrix(0, nrow=nrow(mytopo[[1]]), ncol=ncol(mytopo[[1]]), dimnames=dimnames(mytopo[[1]]))

	if(ensemble){
			edgestab <- .topo2stab(mytopo)
			edgestab2 <- .stab.cv2stab(edgestab,mytopoglobal)
	}else{
		for(i in 1:length(mytopo)) {
			edgestab <- edgestab + mytopo[[i]]
		}
		edgestab <- edgestab / length(mytopo)
		edgestab2[mytopoglobal == 1] <- edgestab[mytopoglobal == 1]

	}
	## report stability of edges present in the global network
	names(mytopo2) <- paste("fold", 1:nfold, sep=".")
	return(list("method"=method, "topology"=mytopoglobal, "topology.cv"=mytopo, "prediction.score.cv"=list("r2"=(myr2), "nrmse"=(mynrmse), "mcc"=(mymcc)), "edge.stability"=edgestab2, "edge.stability.cv"=edgestab, "edge.relevance"=mynetglobal$edge.relevance, "edge.relevance.cv"=edge.relevance.cv))
}

Any scripts or data that you put into this service are public.

predictionet documentation built on Nov. 8, 2020, 7:48 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

predictionet
Inference for predictive networks designed for (but not limited to) genomic data

R/netinf.cv.R
In predictionet: Inference for predictive networks designed for (but not limited to) genomic data

Defines functions `netinf.cv`

Try the predictionet package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

predictionet Inference for predictive networks designed for (but not limited to) genomic data

R/netinf.cv.R In predictionet: Inference for predictive networks designed for (but not limited to) genomic data

Defines functions `netinf.cv`

Try the predictionet package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

predictionet
Inference for predictive networks designed for (but not limited to) genomic data

R/netinf.cv.R
In predictionet: Inference for predictive networks designed for (but not limited to) genomic data