Nothing
sim.data <- function(n = 256, ng = 1000, nsg = 100, p.n.ratio = 0.5,
sg.pos.factor= 1, sg.neg.factor= -1,
# correlation info:
corr=FALSE, corr.factor=0.8,
# block info:
blocks=FALSE, n.blocks=6, nsg.block=1, ng.block=5,
seed=123, ...){
####################################################################################
# simulate microarray data with
# n - number of samples, logistic regression works well if n>200!
# ng - number of genes
# nsg - number of significant genes
# p.n.ratio - ratio between positive and negative significant genes (default 0.5)
# sg.pos.factor - impact factor of _positive_ significant genes on the classifaction (default 1)
# sg.neg.factor - impact factor of _negative_ significant genes on the classifaction (default -1)
# all other non-significant genes have in both classes balanced ratios
# for correlated blocks of genes
# n.blocks - number of correlated blocks of genes
# nsg.block - number of significant genes per block
# ng.block - number of genes per block
# if no blockes (n.blocks=0) are defined and corr=TRUE
# create covarance matrix for all genes! with decrease of correlation
####################################################################################
#require(MASS)
set.seed(seed)
# assumption I: intercept = 0, beta0=0
b0<- 0
# effect for positive genes = +log2(2) = +1
# negative genes = log2(1/2)= -1
m.pos<-m.neg<-m.bal<- 0
sd.pos<-sd.neg<-sd.bal<- 1
if (!( (p.n.ratio>=0) & (p.n.ratio<=1))) stop("rato between positive and negative significant genes should be in [0;1]")
pos.nsg<- floor (nsg * p.n.ratio)
neg.nsg<- nsg - pos.nsg
# covariance matrix
sigma<- .create.covariance.matrix(sd.pos, sd.neg, sd.bal,
ng,nsg, pos.nsg, neg.nsg,
corr,corr.factor,
blocks, n.blocks, nsg.block, ng.block)
# better to see
# sigma.see<-sigma; sigma.see[ sigma.see==0]<- ""; rm(sigma.see)
# 1 if pos, -1 by neg
bX<-rep(0,ng); bX[grep("pos",rownames(sigma))] <- sg.pos.factor
bX[grep("neg",rownames(sigma))] <- sg.neg.factor
bX = matrix(bX, ncol=1)
# means
means<- rep(0,ng); means[grep("pos",rownames(sigma))] <- m.pos; means[grep("neg",rownames(sigma))] <- m.neg
X <-t(mvrnorm(n, means, sigma))
colnames(X)<-c(1:n)
# Outcome
L <- t(X) %*% bX
# error of the model
Y <- ifelse(runif(n) < plogis(L), 1, -1)
return(list("x"=X,"y"=Y[,1], "seed"=seed))
}
#####################################################################################
`.create.covariance.matrix` <-
function(sd.pos, sd.neg, sd.bal,
ng,nsg, pos.nsg, neg.nsg,
corr,corr.factor,
blocks, n.blocks, nsg.block, ng.block){
# initialise covariance matrix sigma
sigma<- diag(ng)
dimnames(sigma)<- list(c(1:ng), c(1:ng))
# 1. no correlation: sigma:= I
if (!corr){
# first pos, then neg sig. genes, then balanced
rownames(sigma)<- c( paste("pos",c(1:pos.nsg),sep=""),
paste("neg",c(1:neg.nsg),sep=""),
paste("bal",c(1:(ng-nsg)),sep="") )
}
# 2. correlation & no blocks (sign + non-sig genes)
if (corr & !blocks){
# correlation, no blocks with non-sg genes
# --> first "positive block", then "negative" then "balanced"
# each block: diag=1, cov(i,j)=cov(j,i):=cor.factor^|i-j|
# first pos, then neg sig. genes, then balanced
rownames(sigma)<- c( paste("pos",c(1:pos.nsg),sep=""),
paste("neg",c(1:neg.nsg),sep=""),
paste("bal",c(1:(ng-nsg)),sep="") )
.help.cor.block<- function(len, corr.factor){
# each block: diag=1, cov(i,j)=cov(j,i):=cor.factor^|i-j|
tmp.sigma<-diag(len)
diag(tmp.sigma)[grep("pos",rownames(tmp.sigma))]<- sd.pos^2
diag(tmp.sigma)[grep("neg",rownames(tmp.sigma))]<- sd.neg^2
diag(tmp.sigma)[grep("bal",rownames(tmp.sigma))]<- sd.bal^2
for(row.i in 1:(len-1))
for(col.j in (row.i+1):len)
tmp.sigma[row.i,col.j] <- tmp.sigma[col.j,row.i] <- corr.factor^abs(row.i-col.j) * sqrt(tmp.sigma[row.i,row.i]) * sqrt(tmp.sigma[col.j,col.j])
return(tmp.sigma)
}
# pos sig genes
if (pos.nsg>0){
sigma[1:pos.nsg, 1:pos.nsg]<- .help.cor.block(len=pos.nsg, corr.factor = corr.factor )
rownames(sigma)[1:pos.nsg]<- paste("pos",c(1:pos.nsg),sep="")
}
# neg sig genes
if (neg.nsg>0){
sigma[(pos.nsg+1):(pos.nsg+neg.nsg), (pos.nsg+1):(pos.nsg+neg.nsg)]<- .help.cor.block(len=neg.nsg, corr.factor = corr.factor )
rownames(sigma)[(pos.nsg+1):(pos.nsg+neg.nsg) ]<- paste("neg",c(1:neg.nsg),sep="")
}
# bal genes
if ((ng-nsg)>0){
sigma[((pos.nsg+neg.nsg)+1): ng, (pos.nsg+neg.nsg+1): ng]<- .help.cor.block(len=(ng-nsg), corr.factor = corr.factor )
rownames(sigma)[((pos.nsg+neg.nsg)+1): ng ]<- paste("bal",c(1:(ng-nsg) ),sep="")
}
}
# 3. correlation & blocks (sign + non-sig genes)
if (corr & blocks){
# ith block: first nsg.block sign. genes then (ng.block-nsg.block) bal genes
# diag = 1, rest = corr.factor
# rest = diag 1
# cov(i,j)= cor.factor* sd(i)*sd(j)
# tricky if sd_pos , sd_neg and sd_bal are different
# 1. first find position of each gene
for (i in 1:n.blocks){
bl.start<-(i-1)*ng.block+1
bl.end<- i*ng.block
# add rownames
rownames(sigma)[bl.start : bl.end]<- c( paste("sig",c( ((i-1)*nsg.block + 1) : (i*nsg.block) ) ,sep=""),
paste("bal",c( ((i-1)*(ng.block-nsg.block) + 1) : (i*(ng.block-nsg.block)) ),sep="") )
}
# do we have some sg left? if not -> 0.
sg.rest<- max (nsg - nsg.block * n.blocks, 0 )
if (sg.rest > 0 ) rownames(sigma)[((ng.block * n.blocks) +1): (ng.block * n.blocks+sg.rest)]<-
paste("sig",c( ( nsg.block * n.blocks + 1) : (nsg) ) ,sep="")
# do we have some bal left?
bal.rest<- max( ( (ng-nsg) - ((ng.block-nsg.block) * n.blocks) ), 0 )
if (bal.rest > 0 ) rownames(sigma)[((ng.block * n.blocks)+sg.rest +1): (ng)]<- paste("bal",c( (((ng.block-nsg.block) * n.blocks) + 1) : (ng-nsg) ) ,sep="")
# correct the names for sig genes: sig --> first pos then neg
rownames(sigma)[grep("sig",rownames(sigma)) ]<- c( paste("pos",c(1:pos.nsg),sep=""),
paste("neg",c(1:neg.nsg),sep="") )
colnames(sigma)<- rownames(sigma)
# 2. fill the blocks
for (i in 1:n.blocks){
bl.start<-(i-1)*ng.block+1
bl.end<- i*ng.block
block.i<- sigma[bl.start : bl.end, bl.start : bl.end]
# fill the diagonal with sd_pos, sd_neg, sd_bal
diag(block.i)[grep("pos",rownames(block.i))]<- sd.pos^2
diag(block.i)[grep("neg",rownames(block.i))]<- sd.neg^2
diag(block.i)[grep("bal",rownames(block.i))]<- sd.bal^2
# fill the rest
# cov(i,j)= cor.factor* sd(i)*sd(j)
for (row.i in 1:(nrow(block.i)-1))
for (col.j in (row.i+1):ncol(block.i)){
block.i[row.i, col.j]<- block.i[col.j,row.i]<- corr.factor * sqrt(block.i[row.i,row.i]) * sqrt(block.i[col.j,col.j])
}
sigma[bl.start : bl.end, bl.start : bl.end]<- block.i
}
# user-fiendly reading: sigma.read<-sigma; sigma.read[sigma.read==0]<- ""; sigma.read[1:50,1:50]; rm(sigma.read)
}
# do we need this simplification?
# # if cov < 10^-3 set it to 0! (memory space reduction!)
# sigma[sigma< 10^-3]<- 0
return(sigma)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.