# tests/testthat/test_10snps.R

# Samp.size: N  100
# Number of CpGs: 123 CpGs
# Number of SNPs: 20,60,100, 500, 1000, 5000
# Prop of significant SNPs:         5%
# Number:  1, 3, 5, 25, 50, 250

# Evaluate TPR




# Scenario 4:

# Simulate (Beta)-Binomial Outcomes

# the true methylated counts depends on 25 SNPs


# generate the non-zero betas

# 1. all with the exact same shape

# 2. varying shapes with either positive or negative effects on methylation




# Simulation dimensions ------------------------------------------------------

# number of samples
my.samp <- 50

# total number of candidate SNPs
n.snp <- 5

# number of SNPs with a true (non-zero) effect: 20% of all SNPs
n.sig.snp <- round(0.2 * n.snp)

library(magrittr)

# Simulate bisulfite-sequencing read counts from a beta-binomial model.
#
# Args:
#   n          number of samples (rows of X and Z).
#   posit      genomic positions of the CpG sites (length = number of CpGs).
#   theta.0    intercept on the link scale, one value per CpG.
#   beta       matrix of covariate effects; rows = CpGs, columns = covariates.
#   phi        per-CpG dispersion; phi = 1 reduces to an ordinary binomial.
#   random.eff if TRUE, add a N(mu.e, sigma.ee) subject-level random intercept.
#   mu.e       mean of the (random or fixed) subject intercept.
#   sigma.ee   variance of the random intercept (used only if random.eff).
#   p0         probability a truly unmethylated read is observed as methylated.
#   p1         probability a truly methylated read is observed as methylated.
#   X          read-depth matrix, n x number-of-CpGs.
#   Z          covariate matrix, n x ncol(beta).
#   binom.link link name passed to binomial().
#
# Returns a list with S (true methylated counts), Y (observed methylated
# counts after sequencing error), theta (linear predictor) and pi
# (methylation probabilities), each n x number-of-CpGs.
BSMethSim_bbinom <- function(n, posit, theta.0, beta, phi, random.eff = FALSE,
                             mu.e = 0, sigma.ee = 1, p0 = 0.003, p1 = 0.9,
                             X, Z, binom.link = "logit") {
  # Input validation. These were message() calls before, which do NOT stop
  # execution, so dimension mismatches sailed through and either crashed
  # later or silently recycled values; abort early instead.
  if (!is.matrix(Z)) {
    stop("covariate Z is not a matrix", call. = FALSE)
  }
  if (!(nrow(X) == nrow(Z) && nrow(X) == n)) {
    stop("Both X and Z should have n rows", call. = FALSE)
  }
  if (!(ncol(X) == length(theta.0) && ncol(X) == nrow(beta) &&
        ncol(X) == length(posit))) {
    stop("ncol(X) must equal length(theta.0), nrow(beta) and length(posit); all equal the number of CpGs",
         call. = FALSE)
  }
  if (ncol(beta) != ncol(Z)) {
    stop("beta and Z should have the same dimensions", call. = FALSE)
  }

  # Subject-level intercept term: random if requested, otherwise fixed at mu.e.
  if (isTRUE(random.eff)) {
    my.e <- rnorm(n, mean = mu.e, sd = sqrt(sigma.ee))
  } else {
    my.e <- rep(mu.e, n)
  }

  # Linear predictor theta[i, ] = theta.0 + sum_j Z[i, j] * beta[, j] + e[i].
  my.theta <- t(sapply(seq_len(n), function(i) {
    theta.0 +
      rowSums(sapply(seq_len(ncol(Z)), function(j) Z[i, j] * beta[, j])) +
      my.e[i]
  }))

  # Map the linear predictor to methylation probabilities via the inverse link.
  linkinv <- binomial(link = binom.link)$linkinv
  my.pi <- t(apply(my.theta, 1, linkinv))

  # True methylated counts S[i, j] ~ BetaBinomial(size = X[i, j], prob, rho).
  my.S <- my.pi
  for (i in seq_len(nrow(my.S))) {
    for (j in seq_len(ncol(my.S))) {
      # rho is the within-site correlation implied by dispersion phi.
      # The original formula (phi[j]-1)/(X[i,j]-1) is 0/0 = NaN at read
      # depth 1; a single read cannot be over-dispersed, so use rho = 0
      # whenever the depth is <= 1.
      rho.ij <- if (X[i, j] > 1) (phi[j] - 1) / (X[i, j] - 1) else 0
      my.S[i, j] <- VGAM::rbetabinom(1, size = X[i, j],
                                     prob = my.pi[i, j], rho = rho.ij)
    }
  }

  # Observed counts Y: each truly methylated read is seen as methylated with
  # probability p1, each unmethylated read is misread with probability p0.
  # (Kept as sums of Bernoulli draws so the RNG stream matches the original.)
  my.Y <- my.S
  for (i in seq_len(nrow(my.Y))) {
    for (j in seq_len(ncol(my.Y))) {
      my.Y[i, j] <- sum(rbinom(my.S[i, j], size = 1, prob = p1)) +
        sum(rbinom(X[i, j] - my.S[i, j], size = 1, prob = p0))
    }
  }

  # Explicit (visible) return; the original ended with `out = list(...)`,
  # which returned the list invisibly.
  list(S = my.S, Y = my.Y, theta = my.theta, pi = my.pi)
}

# Load the EM fitting helper (exported-Q, known-error-rate variant with a
# quasi-likelihood M-step for phi and the Fletcher correction).
# NOTE(review): absolute paths to a private scratch drive make this script
# non-portable; consider packaging these helpers or using relative paths.
source("/mnt/GREENWOOD_JBOD1/GREENWOOD_SCRATCH/kaiqiong.zhao/Projects/SOMNiBUS_RE_Simu/functions/BSMethEM_April_update_scale_estp0p1_export_Q_known_error_with_phi_Mstep_quasi_correct_Fletcher.R")

#-------------------
# Step1 : Load a real data set 
#---------------------
load("/mnt/GREENWOOD_JBOD1/GREENWOOD_SCRATCH/kaiqiong.zhao/Projects/SOMNiBUS_RE_Simu/functions/BANK1data.RData")


# Objects in "BANK1data.RData" used below to simulate methylation data:
#   methMat:  matrix of methylated counts; rows are CpG sites, columns samples
#   totalMat: matrix of read-depths (same layout as methMat)
#   pos:      genomic positions of the CpG sites in methMat/totalMat
#   my.pheno: covariate file for the samples in methMat
#   RAdat:    input object for running SOMNiBUS; not used for simulation


#-----------------------------------------------------------------------------------------------
# Step2 : Specify the methylation patterns through functional parameters beta.0(t), beta.1(t), ...
#-------------------------------------------------------------------------------------------------

# Functional parameters taken from a real-data SOMNiBUS analysis: loads
# beta.0, beta.1, beta.2 (intercept, cell-type effect, disease effect).
load("/mnt/GREENWOOD_JBOD1/GREENWOOD_SCRATCH/kaiqiong.zhao/Projects/SOMNiBUS_RE_Simu/functions/BANK1betas.RData")


# Rescale the real-data intercept curve so baseline methylation sits in a
# reasonable range on the link scale.
beta.0 <- beta.0/4 + 2
#beta.0 <- beta.0-3.5

# Every causal SNP gets the same effect curve beta.1 (scenario: "all with the
# exact same shape"); matrix() recycles beta.1 down each of the n.sig.snp
# columns, just as the original per-column assignment loop did.
betas_non_zeros <- matrix(beta.1, nrow = length(beta.0), ncol = n.sig.snp)

# First column = intercept curve; remaining columns = causal-SNP effect curves.
beta_all <- cbind(beta.0, betas_non_zeros)

#beta_all[,1] <- beta_all[,1]-30
#beta_all[,1] <- 0

# Dispersion parameters fixed at 1 (i.e. plain binomial counts).
phi <- rep(1, length(pos))

# Extra reads added uniformly to the read-depth matrix (none in this scenario).
add_read_depth <- 0



#--------


#-------------------
# Step3 : Simulation
#---------------------

#---------------------------
# Step 3.1: simulate the genotype matrix Z
#---------------------------

# One column per SNP. Kept as a data.frame first so the columns get the
# default names X1, X2, ... — these names are reused later (via samp.Z) as
# the covariate column names of the analysis data set.
Z <- data.frame(matrix(NA, nrow = my.samp, ncol = n.snp))

set.seed(32314242)

# Minor allele frequencies drawn uniformly on [0.1, 0.5]; genotypes sampled
# per SNP with Hardy-Weinberg proportions.
mafs <- runif(n.snp, 0.1, 0.5)
for (j in seq_len(ncol(Z))) {
  hw.probs <- c(mafs[j]^2, 2 * mafs[j] * (1 - mafs[j]), (1 - mafs[j])^2)
  Z[, j] <- sample(c(0, 1, 2), size = my.samp, prob = hw.probs, replace = TRUE)
}

Z <- as.matrix(Z)
rownames(Z) <- NULL

# Copy (with column names intact) used when building the long-format data.
samp.Z <- Z


#---------------------------
# Step 3.2: Read depth matrix X
#---------------------------
# Build a read-depth matrix that roughly preserves the spatial dependence
# of read depth observed in the real data.

# Start from 0/1 noise for each (sample, CpG) cell...
my.X <- matrix(sample(0:1, my.samp * length(pos), replace = TRUE),
               nrow = my.samp, ncol = length(pos))

# ...then add a smooth position-dependent depth profile, estimated by
# spline-smoothing the median real-data read depth at each CpG.
ff <- smooth.spline(pos, apply(totalMat, 1, median), nknots = 10)
spatial_shape <- round(predict(ff, pos)$y)

# Add the profile to every sample's row.
my.X <- sweep(my.X, 2, spatial_shape, FUN = "+")

# Optional uniform depth boost (0 in this scenario).
my.X <- my.X + add_read_depth


#---------------------------
# Step 3.3: Simulate the methylated count matrix
#---------------------------

# Fix: the original wrapped the subsets in as.matrix(..., ncol = n.sig.snp);
# as.matrix() has no ncol argument (it was silently ignored), and the matrix
# subsets drop to vectors when n.sig.snp == 1. drop = FALSE keeps both
# arguments as matrices for any n.sig.snp, as BSMethSim_bbinom requires.
sim.dat <- BSMethSim_bbinom(n = my.samp, posit = pos,
                            theta.0 = beta_all[, 1],
                            beta = beta_all[, -1, drop = FALSE],
                            phi = phi, X = my.X,
                            Z = Z[, 1:n.sig.snp, drop = FALSE],
                            p0 = 0, p1 = 1, random.eff = FALSE)

# Visual sanity check: methylation probability across position, all samples.
plot(pos, sim.dat$pi[1, ], ylim = c(0, 1))
for (i in seq_len(my.samp)) {
  points(pos, sim.dat$pi[i, ], pch = 19, cex = 0.5, col = i)
}



#-------------------------------------
# Generate loss function; proximal gradient descent; and etc.


#-- Step 1: generate the matrix of Omega1 and Omega2.


# n.k: K number of knots ---- 
# we choose the same k and same basis function B for all predictors 
# so, the same mat_omega2 for all p = 1, 2, ... P
# Number of CpG sites in the region (auto-printed).
length(pos)

# Currently no smoothness penalty is placed on the intercept; n.k knots are
# used for the non-intercept predictors.
n.k <- 10

#--- Organize the data into long format before EM-smooth ---#
X <- my.X
Y <- sim.dat$Y
samp.size <- nrow(Y)
my.p <- ncol(Y)

# One row per (sample, CpG) observation.
dat.use <- data.frame(Meth_Counts = as.vector(t(Y)),
                      Total_Counts = as.vector(t(X)),
                      Position = rep(pos, samp.size),
                      ID = rep(seq_len(samp.size), each = my.p))

# Append one column per SNP, replicating each sample's genotype across
# its my.p CpG rows; then restore the SNP names (X1, X2, ...).
covs_use <- colnames(samp.Z)
for (j in seq_along(covs_use)) {
  dat.use <- data.frame(dat.use, rep(samp.Z[, j], each = my.p))
}
colnames(dat.use)[-(1:4)] <- covs_use

# Drop observations with zero read depth.
dat.use <- dat.use[dat.use$Total_Counts > 0, ]

#my.span.dat<- data.frame(my.span.dat, null = sample(c(0,1), size = nrow(my.span.dat), replace = T))

#Z <- dat.use[,-c(1:4)]


# pre-set parameters for the sparse smooth fit
dat = dat.use
# NOTE(review): n.k is set to 10 here and reassigned to 5 below (before
# initTheta is built) -- confirm which value the fit is meant to use.
n.k = 10
numCovs = ncol(dat)-4
# NOTE(review): shrinkScale is assigned twice (1/2 here, 0.5 below) -- same
# value, but one of the assignments is redundant.
shrinkScale=1/2


# Written relative to the current working directory at this point, i.e.
# BEFORE the setwd() call below.
saveRDS(dat, file = "datSnp5nsig1.RDS")

n.k = 5
# lambda1: sparsity penalty weight; lambda2: smoothness penalty weight.
lambda1 = 20

lambda2 = 0.1
stepSize=2
# Initial spline coefficients: n.k per (intercept + numCovs) term.
theta <- initTheta <- rep(0, n.k*(ncol(dat)-3))
shrinkScale=0.5

maxInt = 20
epsilon = 1E-6
printDetail = TRUE

accelrt = FALSE
# NOTE(review): setwd() in a script is fragile (changes global state and is
# not restored); required here so sourceCpp() finds sparseOmegaCr.cpp.
setwd("/mnt/GREENWOOD_JBOD1/GREENWOOD_SCRATCH/kaiqiong.zhao/Projects/SOMNiBUS_SNP_selection/Rcpppackage/sparseSOMNiBUS/src")
library(Rcpp)
# Compiles and exposes sparseOmegaCr() (used further below).
sourceCpp("sparseOmegaCr.cpp")
# Package-internal helpers: smoothConstructExtract, extractDesignMat1,
# sparseSmoothFit and the per-iteration update routine.
source("/mnt/GREENWOOD_JBOD1/GREENWOOD_SCRATCH/kaiqiong.zhao/Projects/SOMNiBUS_SNP_selection/Rcpppackage/sparseSOMNiBUS/R/utils.R")
source("/mnt/GREENWOOD_JBOD1/GREENWOOD_SCRATCH/kaiqiong.zhao/Projects/SOMNiBUS_SNP_selection/Rcpppackage/sparseSOMNiBUS/R/sparseSmoothFit.R")
source("/mnt/GREENWOOD_JBOD1/GREENWOOD_SCRATCH/kaiqiong.zhao/Projects/SOMNiBUS_SNP_selection/Rcpppackage/sparseSOMNiBUS/R/oneUpdate.R")

# Sparse smooth fit without acceleration (small iteration budget, maxInt = 20).
fit_out <- sparseSmoothFit(dat, n.k = n.k, stepSize = stepSize,
                           lambda1 = lambda1, lambda2 = lambda2,
                           maxInt = maxInt, epsilon = 1E-6,
                           printDetail = FALSE, initTheta = initTheta,
                           shrinkScale = shrinkScale, accelrt = accelrt)

# Total loss (column 1 + column 2 of lossVals) per iteration.
lossTotal <- fit_out$lossVals[, 1] + fit_out$lossVals[, 2]
plot(lossTotal, type = "l",
     main = paste0("step size = ", stepSize, " MaxInt = ", maxInt),
     xlab = "Iteration", ylab = "Binomial Loss")

# Estimated spline coefficients.
plot(fit_out$thetaEst)


# Re-fit with acceleration and a much larger iteration budget.
maxInt = 1000
accelrt = TRUE
fit_out_accelrt_correct <- sparseSmoothFit(dat, n.k=n.k, stepSize=stepSize,lambda1=lambda1, lambda2=lambda2, maxInt = maxInt,
                                    epsilon = 1E-6, printDetail = FALSE, initTheta=initTheta, shrinkScale = shrinkScale,
                                    accelrt = accelrt)

#fit_out_accelrt # the usual line search for unaccelerated approach

#fit_out_accelrt_correct # the line search for accelerated approach 

# NOTE(review): assumes the fit object has a lossSum component; the
# unaccelerated fit above only used lossVals -- confirm against
# sparseSmoothFit().
plot(fit_out_accelrt_correct$lossSum[500:1000])

# NOTE(review): fit_out_accelrt is never defined in this script (only
# fit_out_accelrt_correct is) -- this line will error. Probably a leftover
# from an earlier version of the comparison.
points(fit_out_accelrt$lossVals[500:1000,1] + fit_out_accelrt$lossVals[500:1000,2], col = 3)

# NOTE(review): fit_out was run with maxInt = 20, so rows 500:1000 of its
# lossVals are out of range -- verify intended indices.
points(fit_out$lossVals[500:1000,1]+fit_out$lossVals[500:1000,2], col = 2)

# NOTE(review): fit_out_accelrtF is also undefined -- this plot will error.
plot(fit_out_accelrtF$lossVals[,1]+fit_out_accelrtF$lossVals[,2], type = "l", 
     main= paste0("step size = ", stepSize, " MaxInt = ", maxInt),
     xlab = "Iteration", ylab = "Binomial Loss")

# Save the small (maxInt = 20) fit for later comparison.
# Fix: the destination path string was split across two source lines, which
# embedded a literal newline plus indentation spaces inside the path, so
# saveRDS targeted a nonexistent directory. The path is now a single string.
saveRDS(fit_out,
        "/mnt/GREENWOOD_JBOD1/GREENWOOD_SCRATCH/kaiqiong.zhao/Projects/SOMNiBUS_SNP_selection/Rcpppackage/snpSOMNiBUS/tests/testthat/nSNP10_10minus4.RDA")
fit_outsmall <- fit_out

# Build the design matrices by hand to inspect the step-size threshold of
# the proximal gradient updates.

# Spline construction: constrained basis for the intercept, unconstrained
# for the covariate terms.
out0 <- smoothConstructExtract(n.k, dat$Position, constrains = TRUE)
out1 <- smoothConstructExtract(n.k, dat$Position, constrains = FALSE)

basisMat0 <- out0$basisMat
basisMat1 <- out1$basisMat
smoOmega1 <- out1$smoothOmegaCr
# Same penalty matrix for intercept and non-intercept terms (Rcpp routine).
sparOmega <- sparseOmegaCr(out1$myh, n.k, out1$matF)

numCovs <- ncol(dat) - 4
designMat1 <- extractDesignMat1(numCovs, basisMat1, dat)

# Fix: the original cbind hard-coded designMat1[[1]] .. designMat1[[10]],
# but extractDesignMat1() was called with numCovs (= 5 here) covariates, so
# designMat1[[6]] onward is a subscript-out-of-bounds error. Bind however
# many covariate design matrices exist.
mydesignMat <- cbind(basisMat0, do.call(cbind, designMat1))

# Centred/scaled copy; the intercept column (1) is left untouched.
mydesignMatScaled <- mydesignMat
mydesignMatScaled[, -1] <- scale(mydesignMat[, -1], scale = TRUE)

# Step-size threshold from the largest eigenvalue of X'X.
# NOTE(review): here L = 1/(4*lambda_max) so stepSize_threshold = 4*lambda_max;
# the usual Lipschitz bound for binomial loss is L = lambda_max/4 with
# threshold 4/lambda_max -- confirm the intended formula.
xtx <- crossprod(mydesignMat)
maxEig <- max(eigen(xtx)$values)
maxEig
L <- 1 / maxEig / 4
stepSize_threshold <- 1 / L
stepSize_threshold

# Same check on the scaled design matrix.
xtx <- crossprod(mydesignMatScaled)
maxEigScaled <- max(eigen(xtx)$values)
maxEigScaled
L <- 1 / maxEigScaled / 4
stepSize_threshold <- 1 / L
stepSize_threshold

# Sanity check: scaled columns should have mean ~0 and sd 1.
apply(mydesignMatScaled, 2, mean)
apply(mydesignMatScaled, 2, sd)
# kaiqiong/sparseSOMNiBUS documentation built on Dec. 7, 2020, 4:40 a.m.