R/InterSIM.R

Defines functions InterSIM

Documented in InterSIM

InterSIM <- function(n.sample=500,cluster.sample.prop=c(0.30,0.30,0.40),
						delta.methyl=2.0,delta.expr=2.0,delta.protein=2.0,
						p.DMP=0.2,p.DEG=NULL,p.DEP=NULL,
						sigma.methyl=NULL,sigma.expr=NULL,sigma.protein=NULL,
						cor.methyl.expr=NULL,cor.expr.protein=NULL,
						do.plot=FALSE, sample.cluster=TRUE, feature.cluster=TRUE)
	{
	#---------------------------------------------------------------------------------------------------------------
	# n.sample = Number of samples to simulate
	# cluster.sample.prop = Proportion of samples in the clusters. The number of proportions entered is
	#                       used to determine the number of clusters in the simulated data. e.g. if (0.3,0.4,0.3) is
	#						entered then the number of clusters will be 3.
	# delta.methyl = Cluster mean shift for methylation data
	# delta.expr = Cluster mean shift for expression data
	# delta.protein = Cluster mean shift for protein data
	# p.DMP = proportion of DE CpGs (DE = Differentially Expressed)
	# p.DEG = proportion of DE mRNA, if NULL (default) mRNAs mapped by DE CpGs will be selected
	# p.DEP = proportion of DE protein, if NULL (default) proteins mapped by DE mRNAs will be selected
	# do.plot = TRUE to generate heatmap, default is FALSE
	# sample.cluster = TRUE, if clustering should be done on samples
	# feature.cluster = TRUE, if clustering should be done on genomic features
	# sigma.methyl = Covariance structure methylation data, if NULL precomputed values will be used
	# sigma.expr = Covariance structure mRNA data, if NULL precomputed values will be used
	# sigma.protein = Covariance structure Protein data, if NULL precomputed values will be used
	# --sigma.methyl,sigma.expr and sigma.protein="indep" gives covariance structure with diagonal elements only (Independent features)
	# cor.methyl.expr = Correlation between methylation and mRNA, if NULL precomputed values will be used
	# cor.expr.protein = Correlation between mRNA and protein, if NULL precomputed values will be used
	# -------------------------------------------------------------------------------------------------------------
	# Notes:
	# 1. Precomputed default values: The following values have been precalculated in the InterSIM package using Methylation, Gene expression
	#        and Protein expression data from the TCGA Ovarian Cancer study. These values are used in the simulation
	#        function by default.
	# 2. Flexibility :
	#        (i) If user wishes to apply his/her own data from other cancer types, he/she can precompute such data and name
	#        them as mentioned below (to replace such existing data) before using InterSIM algorithm.
	#        (ii) Moreover, if user wishes to apply his/her own custom correlations between the data types and correlation
	#        structure within each data type (not derived from any cancer type), he/she can precumpute such data and name
	#        them as mentioned below (to replace such existing data) before using InterSIM algorithm
	# ---------------------------------------------------------------------------------------------------------------
	# Default values in the function:
	# rho.methyl.expr = Correlation between gene level summary of methylation data (logit transformed) and gene expression
	# rho.expr.protein = Correlation between protein and corrresponding mapped protein expression
	# mean.M = Mean of the methylation data (logit transformed) by probe
	# mean.expr = Mean of the expresion data
	# mean.protein = mean of the protein data
	# cov.M = Covariance structure of methylation data (logit transformed)
	# cov.expr = Covariance structure of Gene expression data
	# cov.protein = Covariance structure of Protein data
	# methyl.gene.level.mean = mean of the methylation data summarized at gene level
	# mean.expr.with.mapped.protein = mean of the gene expression that maps to each protein
	# CpG.gene.map.for.DEG = CpG - gene mapping information
	# protein.gene.map.for.DEP = Protein - gene mapping information
	#---------------------------------------------------------------------------------------------------------------

	if (sum(cluster.sample.prop)!=1) stop("The proportions must sum up to 1")
	if (!length(cluster.sample.prop)>1) stop("Number of proportions must be larger than 1")
	if (p.DMP<0 | p.DMP>1) stop("p.DMP must be between 0 to 1")
	if (!is.null(p.DEG) && (p.DEG<0 | p.DEG>1)) stop("p.DEG must be between 0 and 1")
	if (!is.null(p.DEP) && (p.DEP<0 | p.DEP>1)) stop("p.DEP must be between 0 and 1")

	n.cluster <- length(cluster.sample.prop) 			# Number of clusters
	n.sample.in.cluster <- c(round(cluster.sample.prop[-n.cluster]*n.sample),
						n.sample - sum(round(cluster.sample.prop[-n.cluster]*n.sample)))	# Number of samples in clusters
	cluster.id <- do.call(c,sapply(1:n.cluster, simplify = FALSE, function(x) rep(x,n.sample.in.cluster[x])))

	#-----------------
	# Methylation
	#-----------------
	n.CpG <- ncol(cov.M) 								# Number of CpG probes  in the data
	# Covariance structure
	if (!is.null(sigma.methyl)){
		if (sigma.methyl=="indep") cov.str <- diag(diag(cov.M))  # Independent features
		else cov.str <- sigma.methyl                             # User defined covariance structure
	} else cov.str <- cov.M   							    	 # Dependent features based on the original data
	# Differenatially methylated CpGs (DMP)
	DMP <- sapply(1:n.cluster,function(x) rbinom(n.CpG, 1, prob = p.DMP))
	rownames(DMP) <- names(mean.M)
	d <- lapply(1:n.cluster,function(i) {
			effect <- mean.M + DMP[,i]*delta.methyl
			mvrnorm(n=n.sample.in.cluster[i], mu=effect, Sigma=cov.str)})
	sim.methyl <- do.call(rbind,d)
	sim.methyl <- rev.logit(sim.methyl) 						 # Transform back to beta values between (0,1)

	#-------------------
	# Gene expression
	#-------------------
	n.gene <- ncol(cov.expr) 									 # Number of genes in the data
	# Covariance structure
	if (!is.null(sigma.expr)){
		if (sigma.expr=="indep") cov.str <- diag(diag(cov.expr)) # Independent features
		else cov.str <- sigma.expr                               # User defined covariance structure
	} else cov.str <- cov.expr   							   	 # Dependent features based on the original data
	# Correlation between methylation and gene expression
	if (!is.null(cor.methyl.expr)){
		rho.m.e <- cor.methyl.expr 								 # User defined correlation, single value or vector
	} else rho.m.e <- rho.methyl.expr                            # Real corrrelation between methyl and mRNA
	# Differenatially expressed genes (DEG)
	if (!is.null(p.DEG)){
		DEG <- sapply(1:n.cluster,function(x) rbinom(n.gene, 1, prob = p.DEG))
		rownames(DEG) <- names(mean.expr)
	} else { DEG <- sapply(1:n.cluster,function(x){
			cg.name <- rownames(subset(DMP,DMP[,x]==1))
			gene.name <- as.character(CpG.gene.map.for.DEG[cg.name,]$tmp.gene)
			as.numeric(names(mean.expr) %in% gene.name)})
		rownames(DEG) <- names(mean.expr)}
	if(delta.expr==0) rho.m.e <- 0
	d <- lapply(1:n.cluster,function(i) {
			effect <- (rho.m.e*methyl.gene.level.mean+sqrt(1-rho.m.e^2)*mean.expr) + DEG[,i]*delta.expr
			mvrnorm(n=n.sample.in.cluster[i], mu=effect, Sigma=cov.str)})
	sim.expr <- do.call(rbind,d)

	#---------------------
	# Protein Expression
	#---------------------
	n.protein <- ncol(cov.protein) 							       # Number of genes in the data
	# Covariance structure
	if (!is.null(sigma.protein)){
		if (sigma.protein=="indep") cov.str <- diag(diag(cov.protein))# Independent features
		else cov.str <- sigma.protein                                 # User defined covariance structure
	} else cov.str <- cov.protein   							      # Dependent features based on the original data
	# Correlation between gene expression and protein expression
	if (!is.null(cor.expr.protein)){
		rho.e.p <- cor.expr.protein 							   # User defined correlation, single value or vector
	} else rho.e.p <- rho.expr.protein                             # Real corrrelation between mRNA and protein
	# Differenatially expressed proteins (DEP)
	if (!is.null(p.DEP)){
		DEP <- sapply(1:n.cluster,function(x) rbinom(n.protein, 1, prob = p.DEP))
		rownames(DEP) <- names(mean.protein)
	} else { DEP <- sapply(1:n.cluster, function(x){
			 gene.name <- rownames(subset(DEG,DEG[,x]==1))
			 protein.name <- rownames(protein.gene.map.for.DEP[protein.gene.map.for.DEP$gene %in% gene.name,])
			 as.numeric(names(mean.protein) %in% protein.name)})
		rownames(DEP) <- names(mean.protein)}
	if(delta.protein==0) rho.e.p <- 0
	d <- lapply(1:n.cluster,function(i) {
			effect <- (rho.e.p*mean.expr.with.mapped.protein+sqrt(1-rho.e.p^2)*mean.protein) + DEP[,i]*delta.protein
			mvrnorm(n=n.sample.in.cluster[i], mu=effect, Sigma=cov.str)})
	sim.protein <- do.call(rbind,d)

	# Randomly order the samples in the data
	indices <- sample(1:n.sample)
	cluster.id <- cluster.id[indices]
	sim.methyl <- sim.methyl[indices,]
	sim.expr <- sim.expr[indices,]
	sim.protein <- sim.protein[indices,]
	rownames(sim.methyl) <- paste("subject",1:nrow(sim.methyl),sep="")
	rownames(sim.expr) <- paste("subject",1:nrow(sim.expr),sep="")
	rownames(sim.protein) <- paste("subject",1:nrow(sim.protein),sep="")
	d.cluster <- data.frame(rownames(sim.methyl),cluster.id)
	colnames(d.cluster)[1] <- "subjects"
	# Heatmaps
	if(do.plot){
	  hmcol <- colorRampPalette(c("blue","deepskyblue","white","orangered","red3"))(100)
	  if (dev.interactive()) dev.off()
	  if(sample.cluster && feature.cluster) {
	    dev.new(width=15, height=5)
	    par(mfrow=c(1,3))
	    aheatmap(t(sim.methyl),color=hmcol,Rowv=FALSE, Colv=FALSE, labRow=NA, labCol=NA,annLegend=T,main="Methylation",fontsize=10,breaks=0.5)
	    aheatmap(t(sim.expr),color=hmcol,Rowv=FALSE, Colv=FALSE, labRow=NA, labCol=NA,annLegend=T,main="Gene expression",fontsize=10,breaks=0.5)
	    aheatmap(t(sim.protein),color=hmcol,Rowv=FALSE, Colv=FALSE, labRow=NA, labCol=NA,annLegend=T,main="Protein expression",fontsize=10,breaks=0.5)}
	  else if(sample.cluster) {
	    dev.new(width=15, height=5)
	    par(mfrow=c(1,3))
	    aheatmap(t(sim.methyl),color=hmcol,Rowv=NA, Colv=FALSE, labRow=NA, labCol=NA,annLegend=T,main="Methylation",fontsize=8,breaks=0.5)
	    aheatmap(t(sim.expr),color=hmcol,Rowv=NA, Colv=FALSE, labRow=NA, labCol=NA,annLegend=T,main="Gene expression",fontsize=8,breaks=0.5)
	    aheatmap(t(sim.protein),color=hmcol,Rowv=NA, Colv=FALSE, labRow=NA, labCol=NA,annLegend=T,main="Protein expression",fontsize=8,breaks=0.5)}
	  else if(feature.cluster){
	    dev.new(width=15, height=5)
	    par(mfrow=c(1,3))
	    aheatmap(t(sim.methyl),color=hmcol,Rowv=FALSE, Colv=NA, labRow=NA, labCol=NA,annLegend=T,main="Methylation",fontsize=8,breaks=0.5)
	    aheatmap(t(sim.expr),color=hmcol,Rowv=FALSE, Colv=NA, labRow=NA, labCol=NA,annLegend=T,main="Gene expression",fontsize=8,breaks=0.5)
	    aheatmap(t(sim.protein),color=hmcol,Rowv=FALSE, Colv=NA, labRow=NA, labCol=NA,annLegend=T,main="Protein expression",fontsize=8,breaks=0.5)}
	  else {
	    dev.new(width=15, height=5)
	    par(mfrow=c(1,3))
	    aheatmap(t(sim.methyl),color=hmcol,Rowv=NA, Colv=NA, labRow=NA, labCol=NA,annLegend=T,main="Methylation",fontsize=8,breaks=0.5)
	    aheatmap(t(sim.expr),color=hmcol,Rowv=NA, Colv=NA, labRow=NA, labCol=NA,annLegend=T,main="Gene expression",fontsize=8,breaks=0.5)
	    aheatmap(t(sim.protein),color=hmcol,Rowv=NA, Colv=NA, labRow=NA, labCol=NA,annLegend=T,main="Protein expression",fontsize=8,breaks=0.5)}
	}
	return(list(dat.methyl=sim.methyl,dat.expr=sim.expr,dat.protein=sim.protein,clustering.assignment=d.cluster))
	}

Try the InterSIM package in your browser

Any scripts or data that you put into this service are public.

InterSIM documentation built on April 3, 2025, 8:52 p.m.