ARPPA: An R Package for Protoarray Analysis

####################
##this is r developing code for running simulation to generate 
##the DNA/protein microarray data
## the model (ref: smyth 2004, Bayesian hierarchical model and linear model)
##  linear model:
##             Y_ijk=alpha_i+beta_j+ gamma_ij+ epsilon_ijk
##		see the doc named: "Microarray data simulation.doc"
##    feng@BU   09/03/2016
##
##	Note: Oct 1st 2016, the code listed here is the old one that before 
##		writing them up into package functions. We keep them for reference
##
####################################
library(limma)

set.seed(2004);
#define the variables
numTreatment<-2   #number of different beta
numGene<-1000 #number of alpha
sampleSize<-100 ##this is the number of repeats for each group

alpha0_sigma<-2  #variance for alpha prior
beta0_sigma<-3   #variance for beta prior

#v0<-10  # the unscaled factor for gamma given gamma<=>0
gamma0_diff_sigma<-3

#the effects are assumed to be non negative and also fixed effect
#but following a normal distribution
alpha0<-(rnorm(numGene, 0,alpha0_sigma))
beta0<-(rnorm(numTreatment,0,beta0_sigma))

# 
p_diff<-0.01
numGene_diff<-floor(numGene*p_diff)

#priors for variance distribution
d0<-5
sSquare0<-2


gamma0<-matrix(rep(0,numGene*numTreatment),nrow=numGene, byrow=T)

##in this following code snip, we randomly distribute the differential gamma into
##different positions across different treatment group
##other part is 
for(i in c(1:numTreatment))
{
	pos_diff<-sample(numGene, size=numGene_diff, replace=T)
	gamma0[pos_diff,i]<-(rnorm(numGene_diff,0,gamma0_diff_sigma))
}

##now ready to generate variance
gamma0_var<-matrix(rchisq(numGene*numTreatment, df=d0),nrow=numGene, byrow=T)
gamma0_var<-1/(d0*sSquare0*sSquare0)*gamma0_var
gamma0_var<-1/gamma0_var

#now we have everything ready, do the observation values
#put them into matrix first
groupInfo<-rep(0,sampleSize*numTreatment)
Y_ijk<-matrix(rep(0,numGene*numTreatment*sampleSize),nrow=numGene,byrow=T)
for(j in c(1:numTreatment)) #for different treatment
{
	samplePos<-c(1:sampleSize)+(j-1)*sampleSize;
	groupInfo[samplePos]<-j;
	for(k in c(1:numGene))
	{
		#write the meta data first
		Y_ijk[k,samplePos]<-alpha0[k]+beta0[j]+gamma0[k,j]
		Y_ijk[k,samplePos]<-Y_ijk[k,samplePos]+rnorm(sampleSize,0,sqrt(gamma0_var[k,j]))
	}
}


##==========>now testing the function code in the module
library(ARPPA)

nGene<-2000
nTreatment<-numTreatment   #number of different beta
sampleSize<-20 ##this is the number of repeats for each group

alpha.mean<-0  #variance for alpha prior
alpha.sigma<-alpha0_sigma
beta.mean<-0
beta.sigma<-beta0_sigma   #variance for beta prior

#v0<-10  # the unscaled factor for gamma given gamma<=>0
gammaN0.sigma<-gamma0_diff_sigma

p_diff<-0.01

#priors for variance distribution
d0<-10
s0<-2

#call it
set.seed(2004);
dataExp<-simulateExpression(nGene, nTreatment, sampleSize,
					alpha.mean=alpha.mean, alpha.sigma=alpha.sigma,
					beta.mean=beta.mean, beta.sigma=beta.sigma,
					prob.nonZero=0.01, gamma.sigma=gammaN0.sigma,
					epsilon.d0=d0, epsilon.s0=s0
					)

##now start fitting the model to estimate the EBayes parameters
#defining a S3 function for calculating the prior variance and df 
#based on the observed variance
calculatePriorVariance<-function (x,df)
{
	eg<-log(x)-digamma(df/2)+log(df/2)
	eg_bar<-mean(eg)
	tg_d02<-mean((eg-eg_bar)*(eg-eg_bar)*length(x)/(length(x)-1)-trigamma(df/2))
	d0<-trigammaInverse(tg_d02)*2
	s02<-exp(eg_bar+digamma(d0/2)-log(d0/2))
	
	prior<-list(d0=d0, s02=s02)
	
} 

scaledChiSq_Pdf<-function(x, d0, s0)
{
	x_t<-d0*s0*s0/(x*x)
	#x_t<-sqrt(x_t)
	1/(2^(d0/2)*gamma(d0/2))*x_t^(d0/2-1)*exp(-1*x_t/2)
}

####testing code
###following the example for R help for eBayes
#  See also lmFit examples

#  Simula2te gene expression data,
#  6 microarrays and 100 genes with one gene differentially expressed
set.seed(2004); invisible(runif(100))
M <- matrix(rnorm(100000*60,sd=0.3),100000,60)
M[1,] <- M[1,] + 1
fit <- lmFit(M)

#  Moderated t-statistic
fit <- eBayes(fit)

x<-fit$sigma*fit$sigma
df<-fit$df.residual

r<-sampleVariancePrior(x,df)

#testing code 2
#testing the data generated by the Y_ijk model above
#first got the variance 

Y_ijk<-dataExp$exp
tY<-data.frame(t(Y_ijk))
tY$group<-c(rep(1,sampleSize),rep(2,sampleSize))
varY<-aggregate(tY,by=list(tY$group),FUN=var) #by doing aggregate, the new dataframe is arranged to have first column for "by" criteria.
		#that is why in the next step, we need to get rid of the first column and last one, since the last one is the one we added as the group info.
varK<-varY[,c(-1,-nGene-2)]

x<-as.numeric(c(varK[1,],varK[2,]))
df<-rep(sampleSize-1,length(x))

r2<-calculatePriorVariance(x,df)