#' Simulation for RVMMAT test
#'
#' This function use pre-defined parameters to make the simulation data for the RVMMAT test (including type I and power test)
#'
#' @param n.sample Numeric, sample size, number of individuals
#' @param n.time Numeric, number of measurements for each individual
#' @param par List, the parameters for the phenotype traits, including covariates and individual specific time dependent random effects
#' @param genetic.effect Numeric, genetic effect size (gamma) in genetic effect function
#' @param time_cov Logical variable, indicating whether time effect is included in phenotypic traits
#' @param snp.count Numeric, number of SNPs
#' @param intercept Logical variable, indicating whether intercept is used in phenotypic traits
#' @param disease.para List, the parameters for disease allele and its effect size for power simulation
#' @param onlypower Logical variable, indicating whether include disease SNPs in the generated SNPs
#' @param phe.model String, the phenotype model, two optional values: 'logistic', 'liability'
#' @param oversampling String, the ascertainment scheme, three optional value: 'random', 'baseline', 'sum'
#'
#'
#'
#' @return A list object is returned to be used as object for RVMMAT test
#' @export
rvmmat_simu<-function(n.sample=1000, n.time=5, par=list(),genetic.effect=0.6,
time_cov=TRUE,snp.count = 100, intercept=TRUE, disease.para = list(), onlypower = FALSE, phe.model = 'logistic', oversampling = "random")
{
if(missing(par) || length(par)==0 )
{
par <- list(b0 = -1.8, b1 = 0.5, b2= 0.5, btime = 1,
sig.a = 0.8, sig.b =0.8, sig.e = 1.4,
rho=0.7);
if(phe.model == "logistic"){
par$b0 = -1.9;gammat=logitcurve(n.time,genetic.effect)
} else if(phe.model == "liability"){
gammat=logitcurve(n.time,genetic.effect)
}else{
gammat=logitcurve(n.time,genetic.effect)
}
}
if(missing(disease.para)||length(disease.para)==0)
{
disease.para<- list(model = "dominance", gamma =gammat ,p1=0.5, p2=0.1, D=0)
}
if(!onlypower){
snp.mat <- simu_snp(n.sample, snp.count);
}else{
snp.mat<- simu_snp(n.sample, snp.count);
}
#### Ascertainment sampling #####
if(oversampling =="random")
{
cat('* No ascertainment, random sampling:\n')
phe <- simu.binary.phe(n.sample*30, n.time, par, intercept, time_cov, disease.para, phe.model);#changed 12/07
index <- sample(1:(n.sample*30), n.sample)
ID.select <- paste("id", index, sep = "")
phe$y <- phe$y[index,];phe$cov <- phe$cov[which(rownames(phe$cov)%in%ID.select),]; phe$causal.snp<-phe$causal.snp[index,];
cat('Disease Prevalence: ', apply(phe$y, 2, mean), "\n")
}else if(oversampling =="baseline")
{
cat('* Ascertainment based on baseline:\n')
phe <- simu.binary.phe(n.sample*30, n.time, par, intercept, time_cov, disease.para, phe.model);
cat('Disease Prevalence: ', apply(phe$y, 2, mean), "\n")
index.g1 <-which(phe$y[,1] ==1);
index.g0 <- which(phe$y[,1]==0);
index.select.0 <- index.g0[sample(n.sample*0.5)];
index.select.1 <- index.g1[sample(n.sample*0.5)];
index.select<-c(index.select.0, index.select.1)
ID.select <- paste("id", index.select, sep = "")
phe$y <- phe$y[index.select,];phe$cov <- phe$cov[which(rownames(phe$cov)%in%ID.select),];
phe$causal.snp<-phe$causal.snp[index.select,];
cat("After Ascertainment: ", apply(phe$y, 2, mean), "\n")
}else if(oversampling == "sum"){
cat('* Ascertainment based on sum:\n')
phe<- simu.binary.phe(n.sample*80, n.time, par, intercept, time_cov, disease.para, phe.model);
cat('Disease Prevalence: ', apply(phe$y, 2, mean), "\n")
row.sum<-apply(phe$y, 1, sum)
index.g2 <- which(row.sum==n.time);
index.g0 <- which(row.sum==0);
index.g1 <- which(row.sum<n.time& row.sum>0);
index.select.0 <- index.g0[sample(n.sample*0.05)];
index.select.1 <- index.g1[sample(n.sample*0.9)];
index.select.2 <- index.g2[sample(n.sample*0.05)];
index.select <- c(index.select.0, index.select.1, index.select.2);
ID.select <- paste("id", index.select, sep = "")
phe$y <- phe$y[index.select,];phe$cov <- phe$cov[which(rownames(phe$cov)%in%ID.select),]; phe$causal.snp<-phe$causal.snp[index.select,];
cat("After Ascertainment: ", apply(phe$y, 2, mean), "\n")
}else if(oversampling == "sd1"){
cat('* Ascertainment based on sd1:\n')
phe<- simu.binary.phe(n.sample*80, n.time, par, intercept, time_cov, disease.para, phe.model);
cat('Disease Prevalence: ', apply(phe$y, 2, mean), "\n")
row.mean<-apply(phe$y, 1,mean)
index.g0<- which(order(row.mean)<(length(row.mean)*0.01))
index.g2<- which(order(row.mean)>(length(row.mean)*0.99))
index.g1 <-(1:length(row.mean))[-c(index.g0,index.g2)]
index.select.0 <- index.g0[sample(n.sample*0.05)];
index.select.1 <- index.g1[sample(n.sample*0.9)];
index.select.2 <- index.g2[sample(n.sample*0.05)];
index.select <- c(index.select.0, index.select.1, index.select.2);
ID.select <- paste("id", index.select, sep = "")
phe$y <- phe$y[index.select,];phe$cov <- phe$cov[which(rownames(phe$cov)%in%ID.select),]; phe$causal.snp<-phe$causal.snp[index.select,];
# }
cat("After Ascertainment: ", apply(phe$y, 2, mean), "\n")
}else if(oversampling == "sd1.5"){
cat('* Ascertainment based on sd1.5:\n')
phe<- simu.binary.phe(n.sample*80, n.time, par, intercept, time_cov, disease.para, phe.model);
cat('Disease Prevalence: ', apply(phe$y, 2, mean), "\n")
row.mean<-apply(phe$y, 1,mean)
index.g0<- which(order(row.mean)<(length(row.mean)*0.005))
index.g2<- which(order(row.mean)>(length(row.mean)*0.995))
index.g1 <-(1:length(row.mean))[-c(index.g0,index.g2)]
index.select.0 <- index.g0[sample(n.sample*0.05)];
index.select.1 <- index.g1[sample(n.sample*0.9)];
index.select.2 <- index.g2[sample(n.sample*0.05)];
index.select <- c(index.select.0, index.select.1, index.select.2);
ID.select <- paste("id", index.select, sep = "")
phe$y <- phe$y[index.select,];phe$cov <- phe$cov[which(rownames(phe$cov)%in%ID.select),]; phe$causal.snp<-phe$causal.snp[index.select,];
# }
cat("After Ascertainment: ", apply(phe$y, 2, mean), "\n")
} else if(oversampling == "sd2"){
cat('* Ascertainment based on sd2:\n')
phe<- simu.binary.phe(n.sample*80, n.time, par, intercept, time_cov, disease.para, phe.model);
cat('Disease Prevalence: ', apply(phe$y, 2, mean), "\n")
row.mean<-apply(phe$y, 1,mean)
index.g0<- which(order(row.mean)<(length(row.mean)*0.001))
index.g2<- which(order(row.mean)>(length(row.mean)*0.999))
index.g1 <-(1:length(row.mean))[-c(index.g0,index.g2)]
index.select.0 <- index.g0[sample(n.sample*0.05)];
index.select.1 <- index.g1[sample(n.sample*0.9)];
index.select.2 <- index.g2[sample(n.sample*0.05)];
index.select <- c(index.select.0, index.select.1, index.select.2);
ID.select <- paste("id", index.select, sep = "")
phe$y <- phe$y[index.select,];phe$cov <- phe$cov[which(rownames(phe$cov)%in%ID.select),]; phe$causal.snp<-phe$causal.snp[index.select,];
# }
cat("After Ascertainment: ", apply(phe$y, 2, mean), "\n")
} else {
print('Error! Choose oversampling methods from random, baseline and sum')
}
colnames(phe$y) <- paste("Y", 1:(NCOL(phe$y)), sep="") ;
rownames(phe$y) <- paste("ID", 1:NROW(phe$y), sep="");
colnames(phe$cov) <- paste("X", 1:(NCOL(phe$cov)), sep="") ;
rownames(phe$cov) <- rep(paste("ID", 1:n.sample, sep=""), each = n.time);
y.long <- c(t(phe$y));
y.time <- cbind(rep(1:nrow(phe$y), each =n.time), rep((1:n.time)/n.time, nrow(phe$y)))
# cov.long <- phe$cov;
causal.snp <- phe$causal.snp
snp.mat <- cbind(snp.mat, causal.snp);
colnames(snp.mat)[c(ncol(snp.mat)-1, ncol(snp.mat))] <- paste("CAUSAL",1:2, sep="")
# rownames(cov.long) = NULL
return(list(phe.wide = phe$y, phe.long = y.long, phe.time = y.time, phe.cov.long=phe$cov, snp.mat = snp.mat, phe.model = phe.model))
}
# generate random effect
f.simu<-function( sample, n.time, par)
{
ncol <- n.time;
AR1 <- array(0,dim=c(ncol,ncol));
for(i in 1:ncol)
for(j in 1:ncol)
AR1[i,j] <- par$rho^abs(i-j);
sigma.b <- par$sig.b^2*AR1;
r <- stats::rnorm( sample, 0, par$sig.a ) %*% array(1, dim=c(1,ncol))+
mvtnorm::rmvnorm( sample, rep(0, ncol), sigma.b ) ;
return(r);
}
simu_snp<-function(n.sample, snp.count = 1000)
{
file.snp.hap1 <- system.file("extdata", "skat-test-1.hap.gz", package="RVMMAT");
snp.hap <- utils::read.table(file.snp.hap1, header=F);
p.sel<-sample(3:ncol(snp.hap))
snp.mat1<-snp.hap[ sample(n.sample), p.sel];
snp.mat2 <- snp.hap[ sample(n.sample), p.sel];
snp.mat <- snp.mat1 + snp.mat2 -2;
#check var(g) !=0
maf <- colMeans(snp.mat)/2;m.same <- which( maf==1 | maf==0 );
if (length(m.same)>0) snp.mat <- snp.mat[, -m.same, drop=F ];
#check MAF >0.05
maf <- colMeans(snp.mat)/2; m.small<-which(maf<0.05|maf>0.95);
if(length(m.small)>0) snp.mat <- snp.mat[, -m.small, drop=F ];
snp.mat <-snp.mat[,1:snp.count]
rownames(snp.mat)<-paste("ID", 1:nrow(snp.mat), sep = "");
colnames(snp.mat)<-paste("SNP", 1:ncol(snp.mat), sep = "")
return(snp.mat)
}
#generate null phenotype
simu.binary.phe<-function( n.sample, n.time, par, intercept, time_cov, disease.para, phe.model){
sigma.c=matrix(0.5,n.time,n.time);diag(sigma.c)=rep(1,n.time)
cov.mat <- cbind( c(t(mvtnorm::rmvnorm( n.sample, rep(0, n.time), sigma.c ))), rep(ifelse(stats::runif(n.sample)>0.5, 0, 1), each = n.time));
if(intercept){
mu <- f.simu(n.sample, n.time, par) + matrix(cbind(1, cov.mat )%*%c( par$b0, par$b1, par$b2 ), n.sample, n.time, byrow = TRUE)
}else{
mu <- f.simu(n.sample, n.time, par) + cov.mat %*% c(par$b1, par$b2 );
}
if(time_cov == TRUE){
time.effect <- rep(1, n.sample)%*%(t(seq(1, n.time)*par$btime/n.time));
mu = time.effect+mu;
}
#two causal SNPs per simulation:
model <- disease.para$model;gamma <-disease.para$gamma;
p1<-disease.para$p1;p2<-disease.para$p2;D<-disease.para$D;
hap1<-simu_hap(p1, p2, D, n.sample );
hap2<-simu_hap(p1, p2, D, n.sample );
causal.snp<-hap1+hap2;
if(model =="dominance"){
f_g<-apply(causal.snp, 1, function(x) x[1]>0 && x[2]>0)
}else if(model =="additive"){
f_g<-apply(causal.snp, 1, sum)
}
disease.effect<-f_g%*%t(gamma);
mu<-mu + disease.effect;
#prepare output
mu = c(t(mu));
if(phe.model=='logistic')
{
print('logistic phenotypes')
y <- matrix(stats::rbinom(n.sample*n.time, 1, inv.logit(mu)), n.sample,n.time, byrow = T)
}else if (phe.model=='liability'){#liability model
print('liability phenotypes')
#add sigma.e to the mu;
# print(head(mu))
mu <- mu + stats::rnorm( n.sample*n.time, 0, par$sig.e);
# print(head(mu))
y<- matrix(ifelse(mu>0, 1, 0), n.sample, n.time, byrow =T)
}else if (phe.model=='Gaussian') {#gaussian model
print('Gaussian phenotypes')
mu <- mu + stats::rnorm( n.sample*n.time, 0, par$sig.e);
# print(head(mu))
y<- matrix(mu, n.sample, n.time, byrow =T)
}else {
print('Wrong parameter, please choose from logistic, liability and Gaussian\n')
}
causal.snp<-causal.snp;
rownames(cov.mat) <- rep(paste("id", 1:n.sample, sep=""), each = n.time);
return(list(y=y, cov=cov.mat, causal.snp=causal.snp, phe.model = phe.model));
}
inv.logit=function(x){
return(exp(x)/(1+exp(x)))
}
logit=function(x){
return(log(x/(1-x)))
}
simu_hap<-function(p1, p2, D, n.sample){
if(p1*p2<D){print("invalid setting")}
aa<-p1*p2+D;bb<-(1-p1)*(1-p2)+D;ab<-p1*(1-p2)-D;ba<-(1-p1)*p2-D
type<-sample(1:4, n.sample, replace = T, prob = c(aa,ab,ba, bb));
out = matrix(0, n.sample, 2)
for(i in 1:n.sample){
if(type[i] ==1){out[i,]<-c(1,1)}
if(type[i] ==2){out[i,]<-c(1,0)}
if(type[i] ==3){out[i,]<-c(0,1)}
if(type[i] ==4){out[i,]<-c(0,0)}
}
return(out)
}
logitcurve=function(ntime,genetic.effect){
timeseq=(1:ntime)/(ntime)
ef=genetic.effect/(1+genetic.effect*exp(8-12*timeseq))
return(ef)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.