R/source1.R

Defines functions generate_data_s2 get_null_scores mamba generate_data_re2 generate_data_re1 generate_data_be generate_data_fe generate_data_mamba

Documented in generate_data_be generate_data_fe generate_data_mamba generate_data_re1 generate_data_re2 mamba

#loadtmb<-function(dir=getwd()){
#  compile(paste0(dir, "/tau2f.cpp"))
#  dyn.load(dynlib(paste0(dir, "/tau2f")))
#  data("res2ref_cpd")
#
#}


#M<-50000; alpha<-10
#lambda<-0.95; tau2<-2.5e-4; p<-0.01; resref=NULL;n=rep(50*10^3,10)
#generate_data_mamba(M,p,tau2,lambda,alpha,n=rep(50*10^3,10))

#' Simulate summary statistics according to the MAMBA model.
#' @param M number of SNPs for which to simulate summary stats.
#' @param p proportion of SNPs with real non-zero effects.
#' @param tau2 variance of real associated non-zero SNPs.
#' @param lambda proportion of NON-outlier studies for non-replicable SNPs. 
#' @param n vector of sample sizes from the contributing studies. 
#' @param alpha variance inflation factor of outlier studies.
#' @param resref an optional vector of residual variances to sample from (with replacement) 
#'   when generating the standard errors for the summary stats.  Default is NULL, in which case the residual variances calculated from a Cigarretes Per Day GWAS (Liu, Jiang 2019)  are used.
#' @return 
#'  A list containing 
#'  An Mxk matrix of effect size estimates \code{betajk}, 
#'  An Mxk matrix of effect size estimate  variances \code{sjk2}, 
#'  M-length vector inverse-variance weighted meta-analysis z-scores \code{meta.z}, 
#'  an M-length binary vector indicating real / non-real effect at each SNP \code{Rj},
#'  an M-length binary vector indicating true effect-size at each SNP \code{muj}.  
#'  Binary matrix \code{Ojk} indicating whether study was normal(Ojk=1) or outlier (Ojk=0).
#' @examples
#' generate_data_mamba(M=100, p=0.01, tau2=2.5e-4, n=rep(10^4, 10), lambda=0.975, alpha=15) 
#' @export


generate_data_mamba<-function(M=50*10^3, p=0.01, tau2=2.5e-4, resref=NULL, n=rep(50*10^3,10), 
			     lambda=0.975, alpha=5){

  if(missing(resref)){
	 data(res2ref_cpd, package="mamba",envir=environment())
	 resref<-sqrt(res2ref_cpd$res2);
   } 
  Rj<-rbinom(M, size=1, prob=p)
  muj<-Rj*rnorm(M, 0, sqrt(tau2))
  k<-length(n)
  res.sd<-lapply(1:M, function(j){ sample(resref, k, replace = TRUE) })
  sjk2<-lapply(1:M, function(j){res.sd[[j]]^2 / (n)})
  Ojk<-lapply(1:M, function(j){rbinom(k, size=1, prob=lambda)})
  betajk<-lapply(1:M, function(j){
	Rj[j] * rnorm(k, muj[j], sd=sqrt(sjk2[[j]])) + 
      (1-Rj[j]) * ((Ojk[[j]])*rnorm(k, 0, sqrt(sjk2[[j]])) + 
	(1-Ojk[[j]])*rnorm(k, 0, sqrt(alpha)*sqrt(sjk2[[j]])))})
  
  meta.z<-sapply(1:length(betajk), function(j){
    sum(betajk[[j]]/sjk2[[j]]) / sqrt(sum(1/sjk2[[j]]))
  })

  betajk<-matrix(unlist(betajk), ncol=k, byrow=TRUE)
  sjk2<-matrix(unlist(sjk2), ncol=k, byrow=TRUE)
  Ojk<-matrix(unlist(Ojk), ncol=k, byrow=TRUE)
 
  return(list(betajk=betajk,
              sjk2=sjk2,
              meta.z=meta.z, 
              Rj=Rj, 
              Ojk=Ojk,
              muj=muj))
}

#' Simulate summary statistics according to the (inverse-variance weighted) fixed effects model. 
#' @param M number of SNPs for which to simulate summary stats.
#' @param p proportion of SNPs with real non-zero effects.
#' @param tau2 variance of real associated non-zero SNPs.
#' @param n vector of sample sizes from the contributing studies. 
#' @param resref an optional vector of residual variances to sample from (with replacement) 
#'   when generating the standard errors for the summary stats.  Default is NULL, in which case the residual variances calculated from a Cigarretes Per Day GWAS (Liu, Jiang 2019)  are used.
#' @return
#'  A list containing:
#'  An Mxk matrix of effect size estimates \code{betajk}, 
#'  An Mxk matrix of effect size estimate  variances \code{sjk2}, 
#'  M-length vector inverse-variance weighted meta-analysis z-scores \code{meta.z}, 
#'  an M-length binary vector indicating real / non-real effect at each SNP \code{Rj},
#'  an M-length binary vector indicating true effect-size at each SNP \code{muj}.  
#' @examples
#' generate_data_fe(M=100, p=0.01, tau2=2.5e-4, resref=NULL, n=rep(50*10^3,10)) 
#' @export


generate_data_fe<-function(M=50*10^3, p=0.01, tau2=2.5e-4, resref=NULL, n=rep(50*10^3,10)){
  
  if(missing(resref)){
	 data(res2ref_cpd, package="mamba",envir=environment())
	 resref<-sqrt(res2ref_cpd$res2);
   } 
  Rj<-rbinom(M, size=1, prob=p)
  muj<-Rj*rnorm(M, 0, sqrt(tau2))
  k<-length(n)
  res.sd<-lapply(1:M, function(j){ sample(resref, k, replace = TRUE) })
  sjk2<-lapply(1:M, function(j){res.sd[[j]]^2 / (n)})
  betajk<-lapply(1:M, function(j){
    rnorm(k, muj[j], sd=sqrt(sjk2[[j]]))
  })
  
  meta.z<-sapply(1:length(betajk), function(j){
    sum(betajk[[j]]/sjk2[[j]]) / sqrt(sum(1/sjk2[[j]]))
  })

  betajk<-matrix(unlist(betajk), ncol=k, byrow=TRUE)
  sjk2<-matrix(unlist(sjk2), ncol=k, byrow=TRUE)
  
  return(list(betajk=betajk,
              sjk2=sjk2,
              meta.z=meta.z, 
              Rj=Rj, 
              muj=muj))
}

#' Simulate summary statistics according to the binary-effects (be) model.
#' @param M number of SNPs for which to simulate summary stats.
#' @param p proportion of SNPs with real non-zero effects.
#' @param tau2 variance of real associated non-zero SNPs.
#' @param n vector of sample sizes from the contributing studies. 
#' @param resref an optional vector of residual variances to sample from (with replacement) 
#'   when generating the standard errors for the summary stats.  Default is NULL, in which case the residual variances calculated from a Cigarretes Per Day GWAS (Liu, Jiang 2019)  are used.
#' @return
#'  A list containing:
#'  \itemize{
#'    \item An Mxk matrix of effect size estimates \code{betajk}, 
#'    \item An Mxk matrix of effect size estimate  variances \code{sjk2}, 
#'     \item M-length vector inverse-variance weighted meta-analysis z-scores \code{meta.z}, 
#'     \item an M-length binary vector indicating real / non-real effect at each SNP \code{Rj},
#'  \item an M-length binary vector indicating true effect-size at each SNP \code{muj}.  
#'  \item an M-length vector indicating the number of studies at each SNP with non-zero effects .
#'  \item an Mxk binary matrix indicating which studies at which SNP had non-zero effects.
#' }
#' @examples
#' generate_data_mamba(M=100, p=0.01, tau2=2.5e-4, n=rep(10^4, 10), lambda=0.975, alpha=15) 
#' @export

generate_data_be<-function(M=50*10^3, p=0.01, tau2=2.5e-4, resref=NULL, n=rep(50*10^3,10)){
  if(missing(resref)){
	 data(res2ref_cpd, package="mamba",envir=environment())
	 resref<-sqrt(res2ref_cpd$res2);
   } 
  Rj<-rbinom(M, size=1, prob=p)
  muj<-Rj*rnorm(M, 0, sqrt(tau2))

  k<-length(n)
  res.sd<-lapply(1:M, function(j){ sample(resref, k, replace = TRUE) })
  sjk2<-lapply(1:M, function(j){res.sd[[j]]^2 / (n)})
  Nej<-sapply(1:M, function(j){ Rj[j]*sample(1:k, 1) })
  bej<-lapply(1:M, function(j){
    be<-rep(0, k)
    if(Nej[j] > 0){
      be[sample(1:k, Nej[j])]<-1
    }
     return(be)
  })
  
  betajk<-lapply(1:M, function(j){bej[[j]] * rnorm(k, muj[j], sd=sqrt(sjk2[[j]])) + 
      (1-bej[[j]]) *rnorm(k, 0, sqrt(sjk2[[j]]))
  })
  
  meta.z<-sapply(1:length(betajk), function(j){
    sum(betajk[[j]]/sjk2[[j]]) / sqrt(sum(1/sjk2[[j]]))
  })

  betajk<-matrix(unlist(betajk), ncol=k, byrow=TRUE)
  sjk2<-matrix(unlist(sjk2), ncol=k, byrow=TRUE)
  bej<-matrix(unlist(bej), ncol=k, byrow=TRUE)

  return(list(betajk=betajk,
              sjk2=sjk2,
              meta.z=meta.z, 
              Rj=Rj, 
              muj=muj, 
              Nej=Nej,
              bej=bej))
}

#' Simulate summary statistics according to random effects model.
#' @param M number of SNPs for which to simulate summary stats.
#' @param p proportion of SNPs with real non-zero effects.
#' @param tau2 variance of real associated non-zero SNPs.
#' @param n vector of sample sizes from the contributing studies. 
#' @param alpha variance inflation factor of outlier studies.
#' @param resref an optional vector of residual variances to sample from (with replacement) 
#'   when generating the standard errors for the summary stats.  Default is NULL, in which case the residual variances calculated from a Cigarretes Per Day GWAS (Liu, Jiang 2019)  are used.
#' @param I2 the I2 heterogeneity statistic for each SNP. 
#'  The variance of study-level effects around population level effect at each SNP 
#'  is specified given I2 level (between 0,1) and the simulated standard errors.   
#' @return
#'  A list containing:
#'  An Mxk matrix of effect size estimates \code{betajk}, 
#'  An Mxk matrix of effect size estimate  variances \code{sjk2}, 
#'  M-length vector inverse-variance weighted meta-analysis z-scores \code{meta.z}, 
#'  an M-length binary vector indicating real / non-real effect at each SNP \code{Rj},
#'  an M-length binary vector indicating true effect-size at each SNP \code{muj}.  
#'  an Mxk matrix of the true study-level effects \code{etajk}
#'  an M-length vector of the variance of the study-level effects around the SNP's population level effect \code{omega2j}
#' @examples
#'   generate_data_re1(M=100, p=0.01, tau2=2.5e-4, n=rep(10^4, 10), I2=0.2) 
#' @export

generate_data_re1<-function(M=50*10^3, p=0.01, tau2=2.5e-4, resref=NULL, n=rep(50*10^3,10),I2=0.3){
  
  if(missing(resref)){
	 data(res2ref_cpd, package="mamba",envir=environment())
	 resref<-sqrt(res2ref_cpd$res2);
   } 
  Rj<-rbinom(M, size=1, prob=p)
  muj<-Rj*rnorm(M, 0, sqrt(tau2))
  k<-length(n)
  res.sd<-lapply(1:M, function(j){ sample(resref, k, replace = TRUE) })
  sjk2<-lapply(1:M, function(j){res.sd[[j]]^2 / (n)})
  S2j<-sapply(1:M, function(j){
    ((k-1)*sum(1/sjk2[[j]]))/(sum(1/sjk2[[j]])^2 - sum(1/sjk2[[j]]^2))
  })
  omega2j<-sapply(1:M, function(j){
    I2/(1-I2) * S2j[j]
  })
  etajk<-lapply(1:M, function(j){
    rnorm(k, muj[j], sd=sqrt(omega2j[j])) 
  })
  
  betajk<-lapply(1:M, function(j){
    rnorm(k, etajk[[j]], sd=sqrt(sjk2[[j]]))
  })
  
  meta.z<-sapply(1:length(betajk), function(j){
    sum(betajk[[j]]/sjk2[[j]]) / sqrt(sum(1/sjk2[[j]]))
  })


  betajk<-matrix(unlist(betajk), ncol=k, byrow=TRUE)
  sjk2<-matrix(unlist(sjk2), ncol=k, byrow=TRUE)
  etajk<-matrix(unlist(etajk), ncol=k, byrow=TRUE)

  return(list(betajk=betajk,
              sjk2=sjk2,
              meta.z=meta.z, 
              etajk=etajk,
              omega2j=omega2j,
              S2j=S2j,
              Rj=Rj, 
              muj=muj))
}

#' Simulate summary statistics according to the Han and Eskin's RE2 model.  (Heterogeneity only when the population mean effect is non-zero)
#' @param M number of SNPs for which to simulate summary stats.
#' @param p proportion of SNPs with real non-zero effects.
#' @param tau2 variance of real associated non-zero SNPs.
#' @param lambda proportion of NON-outlier studies for non-replicable SNPs. 
#' @param n vector of sample sizes from the contributing studies. 
#' @param alpha variance inflation factor of outlier studies.
#' @param resref an optional vector of residual variances to sample from (with replacement) 
#'   when generating the standard errors for the summary stats.  Default is NULL, in which case the residual variances calculated from a Cigarretes Per Day GWAS (Liu, Jiang 2019)  are used.
#' @return 
#'  A list containing:
#'  An Mxk matrix of effect size estimates \code{betajk}, 
#'  An Mxk matrix of effect size estimate  variances \code{sjk2}, 
#'  M-length vector inverse-variance weighted meta-analysis z-scores \code{meta.z}, 
#'  an M-length binary vector indicating real / non-real effect at each SNP \code{Rj},
#'  an M-length binary vector indicating true effect-size at each SNP \code{muj}.  
#'  an Mxk matrix of the true study-level effects \code{etajk}
#'  an M-length vector of the variance of the study-level effects around the SNP's population level effect \code{omega2j}
#' @examples
#' generate_data_re2(M=100, p=0.01, tau2=2.5e-4, n=rep(10^4, 10), I2=0.2) 
#' @export

generate_data_re2<-function(M=50*10^3, p=0.01, tau2=2.5e-4, resref=NULL, n=rep(50*10^3,10),I2=0.3){
  
  if(missing(resref)){
	 data(res2ref_cpd, package="mamba",envir=environment())
	 resref<-sqrt(res2ref_cpd$res2);
   } 
  Rj<-rbinom(M, size=1, prob=p)
  muj<-Rj*rnorm(M, 0, sqrt(tau2))
  k<-length(n)
  res.sd<-lapply(1:M, function(j){ sample(resref, k, replace = TRUE) })
  sjk2<-lapply(1:M, function(j){res.sd[[j]]^2 / (n)})
  S2j<-sapply(1:M, function(j){
    ((k-1)*sum(1/sjk2[[j]]))/(sum(1/sjk2[[j]])^2 - sum(1/sjk2[[j]]^2))
  })
  omega2j<-sapply(1:M, function(j){
    I2/(1-I2) * S2j[j]
  })
  etajk<-lapply(1:M, function(j){
   Rj[j] * rnorm(k, muj[j], sd=sqrt(omega2j[j])) 
  })
  
  betajk<-lapply(1:M, function(j){
    rnorm(k, etajk[[j]], sd=sqrt(sjk2[[j]]))
  })
  
  
  meta.z<-sapply(1:length(betajk), function(j){
    sum(betajk[[j]]/sjk2[[j]]) / sqrt(sum(1/sjk2[[j]]))
  })


  betajk<-matrix(unlist(betajk), ncol=k, byrow=TRUE)
  sjk2<-matrix(unlist(sjk2), ncol=k, byrow=TRUE)
  etajk<-matrix(unlist(etajk), ncol=k, byrow=TRUE)
  
  return(list(betajk=betajk,
              sjk2=sjk2,
              meta.z=meta.z, 
              etajk=etajk,
              omega2j=omega2j,
              S2j=S2j,
              Rj=Rj, 
              muj=muj))
}

#d<-generate_data_mamba()
#betajk<-matrix(unlist(d$betajk),
#		byrow=TRUE,
#		nrow=length(d$betajk))
#sjk2<-matrix(unlist(d$sjk2),
#		byrow=TRUE,
#		nrow=length(d$betajk))
#parcores=1; 
#                   p=0.003;
#                   f=0.96;
#                   tau2=0.0002;
#                   alpha=3;
#                   #conv.eps=1e-3;
#                   rel.eps=1e-8;
#                   verbose=1;
#                   snpids=NA;
#                   maxIter=10^4L


#betajk<-betaij
#sjk2<-sij2
#
#parcores=1; 
#p=0.003;
#lambda=0.96;
#tau2=0.0002;
#alpha=3;
##conv.eps=1e-3;
#rel.eps=1e-8;
#verbose=1;
#snpids=NA;
#maxIter=10^4L


#' Fit the MAMBA model.
#' @param betajk Mxk matrix of effect size estimates, where row j corresponds to j-th SNP, and column k corresponds to k-th study .  Missing values can be represented with NA.
#' @param sjk2 Mxk matrix of effect size estimate variances, where row j corresponds to SNP j, and column k corresponds to study k.  Missing values can be represented with NA.
#' @param p initial value for EM algorithm, for the proportion of non-zero SNPs
#' @param lambda initial value for EM algorithm, for the proportion of non-replicable SNPs which are well behaved, or non-outliers.
#' @param tau2 initial value for EM algorithm, for the variance of replicable non-zero effect SNPs.
#' @param alpha initial value for EM algorithm, for the variance inflation of outlier summary statistics at non-replicable SNPs.
#' @param rel.eps threshold for when to end EM algorithm. rel.eps = (ll[i] - ll[i-1])/ll[i], where ll[i] is the log-likelihood at iteration i.
#' @param snpids an optional vector of SNP id names.  If not provided, the ID's will be 1:M, corresponding to the order of the matrix betajk.
#' @param maxIter maximum # of EM iterations.
#' @return
#'     A list containing 
#' @examples 
#'   d<-generate_data_mamba()
#'   mod<-mamba(betajk=d$betajk, sjk2=d$sjk2)
#' @export 


mamba<-function(betajk, sjk2, 
                   parcores=1, 
                   p=0.003,
                   lambda=0.96,
                   tau2=0.0002,
                   alpha=3,
                   #conv.eps=1e-3,
                   rel.eps=1e-8,
                   verbose=1,
                   snpids=NA,
                   maxIter=10^4L,
		   minIter=100){

  betajk<-as.matrix(betajk)
  sjk2<-as.matrix(sjk2)

## Replace any summary stats with standard error = 0 with missing
  zeroL<-mclapply(1:nrow(sjk2), function(j){
    which(sjk2[j,]==0)
  }, mc.cores = parcores)
 chk<- which(sapply(zeroL, length) > 0)
  if(length(chk) > 0){
    for(j in chk){
      sjk2[j,zeroL[[j]]]<-NA
      betajk[j,zeroL[[j]]]<-NA
    }
  }

## Replace any summary stats with standard error = inf or beta=inf with missing
 infL<-mclapply(1:nrow(sjk2), function(j){
   which(is.infinite(sjk2[j,]))
 }, mc.cores = parcores)
 chk<- which(sapply(infL, length) > 0)
 if(length(chk) > 0){
   for(j in chk){
     sjk2[j,infL[[j]]]<-NA
     betajk[j,infL[[j]]]<-NA
   }
 }
 infL<-mclapply(1:nrow(betajk), function(j){
   which(is.infinite(betajk[j,]))
 }, mc.cores = parcores)
 chk<- which(sapply(infL, length) > 0)
 if(length(chk) > 0){
   for(j in chk){
     sjk2[j,infL[[j]]]<-NA
     betajk[j,infL[[j]]]<-NA
   }
 }

### Identify indices of studies with non-missing summary stats at each SNP
  mis.inds<-mclapply(1:nrow(sjk2), function(j){
    which(!is.na(sjk2[j,]))
  }, mc.cores = parcores)
  
  
  b2s2<-sapply(1:nrow(betajk), function(j){
    sum(betajk[j,mis.inds[[j]]]^2/sjk2[j,mis.inds[[j]]])
  })
  bs22<-sapply(1:nrow(betajk), function(j){
    sum(betajk[j,mis.inds[[j]]]/sjk2[j,mis.inds[[j]]])^2
  })
  os22<-sapply(1:nrow(betajk), function(j){
    sum(1/sjk2[j,mis.inds[[j]]])
  })
  
  
  ll<-NA
  
  k_j<-sapply(mis.inds, length)
  
  MM<-nrow(betajk)
  
  mu.hat<-sapply(1:MM, function(j){
    sum(betajk[j,mis.inds[[j]]]/sjk2[j,mis.inds[[j]]])/(sum(1/sjk2[j,mis.inds[[j]]]) + 1/(tau2))
  })
  
  strt<-Sys.time()
  for(iter in 1:(maxIter)){
    
    deltajk<- mclapply(1:nrow(betajk), function(j){
	mamba:::deltis(betajk_j=betajk[j,], sjk2_j=sjk2[j,], alpha=alpha, lambda=lambda)
    })
    deltajk<-matrix(unlist(deltajk), ncol=MM, byrow=FALSE)
    
    missingdelta<-unlist(mclapply(1:MM, function(j){ 
			mean(is.na(deltajk[mis.inds[[j]],j]))
	      }, mc.cores = parcores))
    if(sum(missingdelta) > 0) {
      print("na's in deltajk")
      break
    }
    if(verbose > 1){
      print("deltajk calculated.")
    }
    if(iter==1){
      llbR1<-unlist(mclapply(1:nrow(betajk), function(j){
        mamba:::llbR1_j(betajk_j = betajk[j,mis.inds[[j]]], 
		sjk2_j=sjk2[j,mis.inds[[j]]], 
		tau2inv = 1/tau2)
      }, mc.cores=parcores)) - (k_j/2)*log(2*pi)
      
      llbR0<-unlist( mclapply(1:nrow(betajk), function(j){
        mamba:::llbR0_j(betajk_j = betajk[j,mis.inds[[j]]], 
		sjk2_j = sjk2[j,mis.inds[[j]]], 
		alpha = alpha, 
		lambda=lambda)
      }, mc.cores = parcores))
    }
    
    gammaj<-unlist(mclapply(1:MM, function(j){
      gam<- p*exp(llbR1[j])/(exp(mamba:::logsumexp(c(log(p) + llbR1[j], log(1-p) + llbR0[j]))))
      if(is.na(gam)){
        m<-which.max(c(log(1-p) + llbR0[j],
                       log(p) + llbR1[j]))                      
        gam<-(m-1)
      }
      gam
    }, mc.cores = parcores))
    if(verbose > 1){
      print(data.table(gammaj)[order(-gammaj)])
    }
    
    if(verbose > 0){ 
      print("e step finished.")
    } 
    
    if(sum(is.na(gammaj)) > 0) break
    if(sum(1-gammaj)==0) break
    fn<-sum(unlist(mclapply(1:MM, function(j){
      (1-gammaj[j])*sum(deltajk[mis.inds[[j]],j])
    }, mc.cores = parcores)))
    fd<-sum((1-gammaj)*k_j)
    lambda<-fn/fd
    
    alpha<-sum((1-gammaj)*
                 unlist(mclapply(1:MM, function(j){
                   sum(
   (1-deltajk[mis.inds[[j]],j])*(betajk[j,mis.inds[[j]]]^2 / sjk2[j,mis.inds[[j]]])
   )
    }, mc.cores = parcores)))/
      sum((1-gammaj)*unlist(mclapply(1:MM, function(j){
        sum(1-deltajk[mis.inds[[j]],j])}, mc.cores = parcores)))
    
    nllk <- MakeADFun ( data = list ( b2s2=b2s2, bs22=bs22, os22=os22, gammaj=gammaj) , 
                        parameters = list ( logTau2=log(tau2)) , 
			DLL="tau2f",silent=TRUE)
    fit <- nlminb ( start = nllk $par , objective = nllk $fn , gradient = nllk $gr ,
                    lower =c(- Inf ,0) , upper =c( Inf , Inf ), control = list(trace=0))
    
    tau2<-exp(fit$par)
    tau2<-max(tau2, 1e-17)
    
    if(is.na(tau2)) break 
    
    mu.hat<-unlist(mclapply(1:MM, function(j){
      sum(betajk[j,mis.inds[[j]]]/sjk2[j,mis.inds[[j]]])/(sum(1/sjk2[j,mis.inds[[j]]]) + 1/tau2)
    }, mc.cores = parcores))
    
    p<-sum(gammaj)/MM
    if(verbose > 0){
      print("m step finished.")
      print(paste0("p=", round(p, 3), 
		   " lambda=", round(lambda, 3), 
		   " tau2=", round(tau2, 8), 
		   " alpha=", round(alpha, 3)))
    }
    
    llbR1<-unlist(mclapply(1:nrow(betajk), function(j){
	mamba:::llbR1_j(betajk_j = betajk[j,mis.inds[[j]]], 
	      sjk2_j=sjk2[j,mis.inds[[j]]], 
	      tau2inv = 1/tau2)
    }, mc.cores=parcores)) - (k_j/2)*log(2*pi)
    
    llbR0<-unlist( mclapply(1:nrow(betajk), function(j){
	    mamba:::llbR0_j(betajk_j = betajk[j,mis.inds[[j]]], 
	      sjk2_j = sjk2[j,mis.inds[[j]]], 
	      alpha = alpha, 
	      lambda=lambda)
    }, mc.cores = parcores))
    
    
    ll[iter]<-sum(unlist(mclapply(1:MM, function(j){
	mamba:::logsumexp(c(log(p) + llbR1[j], log(1-p) + llbR0[j]))
    }, mc.cores = parcores)))
    
    if(iter > 1){
      llMat<-rbindlist(list(llMat,
                            data.table(p=p,lambda=lambda,tau2=tau2,alpha=alpha,ll=ll[iter])))
    } else{
      llMat<-data.table(p=p,lambda=lambda,tau2=tau2,alpha=alpha,ll=ll[iter])
    } 
   
   # convergence criteria # 
    if(iter >=2){
      if(verbose > 0){
        print(paste0("relative ll increase: ", (ll[iter]-ll[iter-1])/ll[iter]))
        print(paste0("ll increase: ", (ll[iter]-ll[iter-1])))
      }
      if(!is.na(ll[iter] - ll[iter-1])){
        if(((ll[iter] - ll[iter-1])/ll[iter] < rel.eps && ll[iter] - ll[iter-1] < 0.01) || 
		((ll[iter]-ll[iter-1]) < 0.001 && iter > minIter) || 
		((ll[iter] - ll[iter-1])/ll[iter] < rel.eps && iter > minIter)) break
      }
      #if(ll[iter] - ll[iter-1] < 0) break
    }
    if(verbose > 0){
      print(paste0("iteration ", iter, " complete."))
    } 
    #print(llMat[1:iter])
  }
  end<-Sys.time()
  se.hat<-unlist(mclapply(1:nrow(betajk), function(j){
    sqrt(1/(sum(1/sjk2[j,mis.inds[[j]]]) + 1/tau2))
  }, mc.cores = parcores))
  outliermat<-data.table::data.table(snp=1:ncol(deltajk),
                         t(deltajk))
  outliermat<-outliermat[,lapply(.SD, function(x) 1-x),.SDcols=paste0("V", 1:nrow(deltajk)),by=snp]
  colnames(outliermat)[2:ncol(outliermat)]<-paste0("Oj_",1:dim(deltajk)[1])
  outliermat[,ppr:=gammaj]
  #outlierprobs<-outliermat[,#(paste0("outlier_prob", 1:dim(deltajk)[1])):=
  #                         lapply(.SD, function(x){(1-gammaj)*(1-x)}),
  #                         .SDcols=paste0("deltaj_",1:dim(deltajk)[1]), by=.(snp)]
  #colnames(outlierprobs)[2:ncol(outlierprobs)]<-paste0("outlier_prob", 1:dim(deltajk)[1])
  # if(!is.null(colnames(betajk))){
  #   dnames<-gsub("beta.*\\_","",colnames(betajk))
  #   
  # }
  if(!is.na(snpids) && length(snpids)==nrow(outliermat)) {
    outliermat[,snp:=snpids]
  #  outlierprobs[,snp:=snpids]
  }
  return(list(ll=ll,
              p=p,
              ppr=gammaj,
              alpha=alpha,
              lambda=lambda,
              tau2=tau2, 
              mu.hat=mu.hat, 
              se.hat=se.hat,
              post.means=mu.hat*gammaj,
              outliermat=outliermat,
              #outlierprobs=outlierprobs, 
              time=end-strt,
              param_est_log=llMat[,`:=`(eps=ll-shift(ll,1),
                                rel.eps=((ll-shift(ll,1))/ll))]))
}


#s2<-sjk2[,-c("chr", "bp", "ref", "alt", "snp"),with=F]
#total_null_scores<-10^6
#out<-"mamba_pvals/chr14"
#system("mkdir mamba_pvals/")

#s2<-sjk2
#total_null_scores<-10^6
# model<-mod
get_null_scores<-function(model, 
		   s2, 
		   total_null_scores, 
                   out,
		   parcores=1,
                   seed=NULL,
		   save_all=FALSE,
                   clean_any_existing_scores=FALSE,
		   nullSNPsPerModel=NULL 
                   ){
  
  if(missing(seed)) seed<-sample.int(10^6,1)
  set.seed(seed)
 
  if(clean_any_existing_scores){
    outd<-strsplit(out, "\\/")[[1]]
    slsh<-ifelse(grepl("\\/$", out), "/", "")
    pref<-outd[length(outd)]
    outdir<-strsplit(out, pref)[[1]][1]
    rmfiles<-grep(paste0(pref, slsh, ".nullscores.*"), list.files(outdir),value=T)
    if(length(rmfiles) > 0){
     for(f in rmfiles){
        system(paste0("rm ", outdir, f))
      } 
    }
  }


  if(missing(nullSNPsPerModel)){
    nullSNPsPerModel<-ceiling(nrow(s2)*(1-model$p))
  } 

  system(paste0("touch ", out, ".nullscores.txt")) 

  total_score_count<-as.numeric(system(paste0("wc -l ", out, ".nullscores.txt| awk '{print $1}'"),
                                       intern=T))

  print(paste0(total_score_count, " scores already generated in ", out, ".nullscores.txt."))

  stopifnot(total_score_count < total_null_scores) 
    
  nModels<- ceiling( (total_null_scores-total_score_count) / nullSNPsPerModel)

  pdat<-nullMod<-list()
  print(paste0("Fitting ", nModels, 
               " models to generate approximately ", total_null_scores ," non-replicable SNPs."))


  stop=FALSE
  for(j in 1:nModels){
    pdat[[j]]<-mamba:::generate_data_s2(model, s2, Mnull=nullSNPsPerModel)
    nullMod[[j]]<-mamba::mamba(pdat[[j]]$betajk, 
                        pdat[[j]]$s2_sample, 
                        parcores,
                        p=model$p, 
                        lambda=model$lambda, 
                        alpha=model$alpha, 
                        tau2=model$tau2,
                        verbose =0)
    nullscoresj<-nullMod[[j]]$ppr
    nullRij<-pdat[[j]]$Rj
    nullscoresj<-nullscoresj[nullRij==0] 
    fwrite(data.table(nullscoresj),
           file=paste0(out, ".nullscores.",pdat[[j]]$seed,".txt"),
           col.names = FALSE)
   
    system(paste0("cat ",
		  paste0(out, ".nullscores.",pdat[[j]]$seed,".txt")," >> ",
		  paste0(out, ".nullscores.txt")))
    system(paste0(" echo $(wc -l ", 
		  paste0(out, ".nullscores.txt) seed ",pdat[[j]]$seed, " >> ",
			 paste0(out, ".nullscores.log"))))
    
    system(paste0("rm ",paste0(out, ".nullscores.",pdat[[j]]$seed,".txt")))
    
    print(paste0("Model ",j, " of ", nModels, " complete.")) 

    total_score_count<-as.numeric(system(paste0("wc -l ", out, ".nullscores.txt| awk '{print $1}'"),intern=T))

    if(as.numeric(total_score_count) > total_null_scores ) {
      print(paste0(total_null_scores, " total_null_scores generated."))
      stop=TRUE
     }
     if(stop) break
  }
  
  if(save_all){
  
  nullscores<-unlist(lapply(nullMod, "[[", "ppr"))
  nullRi<-unlist(lapply(pdat, "[[", "Rj"))
  nullscores<-nullscores[nullRi==0]
 
   return(list(nullscores=nullscores,
                nullRi=nullRi,
                pdat=pdat,
                nullMod=nullMod,
                seed=seed))
  } else{
 
   nullscores<-fread(paste0(out, ".nullscores.txt"),header=F)[,V1]
  
    return(list(nullscores=nullscores,
                seed=seed))
  }
}


## Mnull<-ceiling(nrow(s2)*(1-model$p))
## s2<-as.matrix(sjk2[,-c("chr", "bp", "ref", "alt","snp"),with=F])
## model<-mod
generate_data_s2<-function(model, s2, parcores=1, seed=NULL, Mnull=NULL){

  if(missing(Mnull)){
    Mnull<-ceiling(nrow(s2)*(1-model$p))
  }
  if(missing(seed)){
   seed<-sample.int(10^6,1)
  }
 
  s2<-as.matrix(s2) 
  zeroL<-mclapply(1:nrow(s2), function(j){
    which(s2[j,]==0)
  }, mc.cores = parcores)
  chk<- which(sapply(zeroL, length) > 0)
  if(length(chk) > 0){
    for(j in chk){
      s2[j,zeroL[[j]]]<-NA
    }
  }
  infL<-mclapply(1:nrow(s2), function(j){
    which(is.infinite(s2[j,]))
  }, mc.cores = parcores)
  chk<- which(sapply(infL, length) > 0)
  if(length(chk) > 0){
    for(j in chk){
      s2[j,infL[[j]]]<-NA
    }
  }

  p<-model$p
  lambda<-model$lambda
  tau2<-model$tau2
  alpha<-model$alpha
  

  M<-ceiling(Mnull/(1-p))
  sample_inds<-sample(nrow(s2),M,replace = TRUE)
  s2_sample<-s2[sample_inds,]
  mis.inds<-lapply(1:nrow(s2_sample), function(j){
    which(!is.na(s2_sample[j,]))
  })
  Rj<-rbinom(M, size=1, prob=p)
  mu<-Rj*rnorm(M, 0, sqrt(tau2))
  k_j<-sapply(1:nrow(s2_sample), function(j){sum(!is.na(s2_sample[j,]))})
  sj2<-lapply(1:M, function(j){s2_sample[j,]})
  Ojk<-lapply(1:M, function(j){
    vec<-rep(NA, dim(s2)[2])
    vec[mis.inds[[j]]]<-rbinom(k_j[j], size=1, prob=lambda)
    vec
  })
  
  betaj<-lapply(1:M, function(j){
    vec<-rep(NA, dim(s2)[2])
    vec[mis.inds[[j]]]<-
      Rj[j] * rnorm(k_j[j], mu[j], sd=sqrt(sj2[[j]][mis.inds[[j]]])) + 
      (1-Rj[j]) * ((Ojk[[j]][mis.inds[[j]]])*rnorm(k_j[j], 0, sqrt(sj2[[j]][mis.inds[[j]]])) + 
                     (1-Ojk[[j]][mis.inds[[j]]])*rnorm(k_j[j], 0, sqrt(alpha)*sqrt(sj2[[j]][mis.inds[[j]]])))
    
    vec
  })
  
  meta.z<-sapply(1:length(betaj), function(j){
    sum(betaj[[j]][mis.inds[[j]]]/sj2[[j]][mis.inds[[j]]]) / sqrt(sum(1/sj2[[j]][mis.inds[[j]]]))
  })
  betajk<-matrix(unlist(betaj), ncol=dim(s2)[2], byrow=TRUE)
  return(list(betajk=betajk,
              s2_sample=s2_sample,
              sample_inds=sample_inds,
              meta.z=meta.z, 
              Rj=Rj, 
              Ojk=Ojk,
              muj=mu,
              seed=seed))
}
dan11mcguire/mamba documentation built on Nov. 10, 2020, 12:37 a.m.