R/simulate.R

Defines functions simulate_study cov_beta simulate_beta vcf2snpmatrix mvs_sigma mvs_perm

Documented in cov_beta mvs_perm mvs_sigma simulate_beta simulate_study vcf2snpmatrix

library(snpStats)
library(corpcor)
library(Matrix)
library(mvtnorm)

#' Code to sample multivariate norm
#' \code{mvs_perm} sample from a multivariate normal distribution
#'
#' @param m a vector -  mean values
#' @param sigma a matrix  - a positive definite covariance matrix
#' @param n a scalar - number of samples to generate (default 2)
#' @return a matrix of sampled values

mvs_perm<-function(m,sigma,n=2){
  if(!is.matrix(sigma))
    stop("sigma parameter is not a matrix")
  if(!corpcor::is.positive.definite(sigma,,method="chol"))
    stop("sigma is not positive definite")
  rd<-mvtnorm::rmvnorm(n,mean=m,sigma=sigma,method="chol")
  t(rd)
}

#' Code to compute sigma - genotype covariance matrix
#' \code{mvs_sigma} create a geneotype convariance matrix
#'
#' @param r a matrix of pairwise \eqn{r^2} measures generated by snpStats
#' @param quiet boolean whether to return warnings messages default=TRUE
#' @return a positive definite approximation of covariance matrix

mvs_sigma<-function(r,quiet=TRUE){
  diag(r)<-1
  if(any(is.na(r)))
    if(!quiet)
      message(sprintf("Found %s where R^2 is NA",sum(is.na(r))))
    r[is.na(r)]<-0
  return(methods::as(corpcor::make.positive.definite(r),"Matrix"))
}

#' convert a vcf file to snpMatrix object
#' \code{vcf2snpmatrix} convert a vcf file to snpMatrix object
#'
#' @param vcf a scalar - path to vcf file
#' @param bcft a scalar - path to bcftools binary
#' @param region_file a scalar - path to a file satisfying -R bcftools criteria (optional)
#' @param quiet a boolean - if set to false then debug information shown (default = FALSE)
#' @return a list with two slots. sm is the snpMatrix object and info is a data.table describes SNPs
#' @export

vcf2snpmatrix <- function(vcf,bcft,region_file,quiet=TRUE){
  CHROM <- POS <- pid <- NULL
  header_cmd <- sprintf("%s view -h %s",bcft,vcf)
  if(!quiet)
    message(header_cmd)
  my.pipe<-pipe(header_cmd)
  header<-utils::tail(scan(my.pipe,what=character(),sep="\n",quiet=TRUE),n=1)
  close(my.pipe)
  cnames<-unlist(strsplit(header,"\t"))
  if(!missing(region_file)){
    vcftools_cmd<-sprintf("%s view -R %s -O v %s | grep -v '^#'",bcft,region_file ,vcf)
  }else{
    vcftools_cmd<-sprintf("%s view -O v %s | grep -v '^#'",bcft,vcf)
  }
  if(!quiet)
    message(vcftools_cmd)
  tmp<-fread(vcftools_cmd)
  setnames(tmp,cnames)
  gt<-tmp[,10:ncol(tmp),with=FALSE]
  if(nrow(gt)==0)
    return(NA)
  info<-tmp[,1:9,with=FALSE]
  setnames(info,'#CHROM','CHROM')
  if(!quiet)
    message("Creating snpMatrix obj")
  sm<-apply(gt,1,function(x) sub("0\\|0","1",x))
  sm<-apply(sm,1,function(x) sub("(0\\|1)|(1\\|0)","2",x))
  sm<-apply(sm,1,function(x) sub("1\\|1","3",x))
  ## set anything else to a missing value
  sm<-t(apply(sm,1,function(x) as.raw(sub("[0-9]\\|[0-9]","0",x))))
  info[,pid:=paste(CHROM,POS,sep=':')]
  colnames(sm)<-info$pid
  rownames(sm)<-colnames(gt)
  return(list(sm=methods::new("SnpMatrix", sm),info=info))
}

#' simulate betas for an ld block
#' \code{simulate_beta} use the multivariate normal to simulate realistic betas
#'
#' @param sm a snpMatrix object
#' @param lor a vector - observed betas
#' @param se_lor a vector - standard error of betas
#' @param lor_shrink a vector - shrinkage values to use to adjust betas (default 1 no shrinkage)
#' @param n_sims a scalar - number of simulations to conduct
#' @return a matrix of simulated betas

simulate_beta <- function(sm,lor,se_lor,lor_shrink=1,n_sims){
  beta_hat<-lor * lor_shrink
  if(length(lor)==1)
    return(t(stats::rnorm(n_sims,mean=beta_hat,sd=se_lor)))
  # compute R statistic
  r<-snpStats::ld(sm,sm,stats="R")
  # compute closest pos-def covariance matrix
  r<-as.matrix(mvs_sigma(Matrix::Matrix(r)))
  ## for beta the covariance matrix is estimates by sigma x SE * SE^T
  cov_se<-tcrossprod(se_lor)
  cov.beta<-cov_se * r
  ## simulate beta
  return(mvs_perm(beta_hat,cov.beta,n=n_sims))
}

#' covariance matrix of betas for an ld block
#' \code{cov_beta}
#'
#' @param sm a snpMatrix object
#' @param se_lor a vector - standard error of betas
#' @return a covariance matrix for beta

cov_beta <- function(sm,se_lor){
  #beta_hat <- lor * lor_shrink
  if(length(se_lor)==1)
    return(se_lor^2)
  # compute R statistic
  #r<-ld(sm,sm,stats="R.squared")
  r<-snpStats::ld(sm,sm,stats="R")
  # compute closest pos-def covariance matrix
  r<-as.matrix(mvs_sigma(Matrix::Matrix(r)))
  ## for beta the covariance matrix is estimates by sigma x SE * SE^T
  cov_se<-tcrossprod(se_lor)
  return(cov_se * r)
}


#' simulate betas for a study
#' \code{simulate_study} use the multivariate normal to simulate realistic betas for a study
#'
#' @param DT a data.table - as returned by \code{\link{get_gwas_data}}
#' @param ref_gt_dir scalar - path to a dir of R objects named CHR_1kg.RData containing reference GT in snpMatrix format
#' @param shrink_beta scalar - Whether to use Bayesian shrinkage of betas (default = TRUE)
#' @param n_sims a scalar - number of simulations (default 10)
#' @param quiet a scalar - boolean whether to show progress messages
#' @return a DT of n_sims simulated studies for projection
#' @export

simulate_study <- function(DT,ref_gt_dir,shrink_beta=TRUE,n_sims=10,quiet=TRUE){
  pid <- value <- trait <- variable <- NULL
  s.DT <- split(DT,DT$chr)
  all.chr <- lapply(names(s.DT),function(chr){
    if(!quiet)
      message(sprintf("Processing %s",chr))
    ss.file<-file.path(ref_gt_dir,sprintf("%s.RDS",chr))
     message(file.path(ref_gt_dir,sprintf("%s.RDS",chr)))
    sm <- readRDS(ss.file)
    #sm<-get(load(ss.file))
    ## there are sometimes duplicates that we need to remove
    info <- data.table(pid=colnames(sm),order=1:ncol(sm))
    dup.idx <- which(duplicated(info$pid))
    #dup.idx<-which(duplicated(obj$info$pid))
    if(length(dup.idx)>0){
      if(!quiet)
        message(sprintf("Warning removing %d duplicated SNPs",length(dup.idx)))
      #sm$info<-sm$info[-dup.idx,]
      #sm$sm <- sm$sm[,-dup.idx]
      info <- info[-dup.idx,]
      sm <- sm[,-dup.idx]
    }
    info$order <- 1:nrow(info)
    #sm$info$order<-1:nrow(sm$info)
    # by ld block
    by.ld <- split(s.DT[[chr]],s.DT[[chr]]$ld.block)
    chr.sims <- lapply(names(by.ld),function(block){
      if(!quiet)
        message(sprintf("Processing %s",block))
      dat <- by.ld[[block]]
      setkey(dat,pid)
      #info <-sm$info[pid %in% dat$pid ,.(pid,order)]
      linfo <- info[pid %in% dat$pid ,list(pid,order)]
      setkey(linfo,pid)
      dat <- dat[linfo][order(order),]
      if(shrink_beta){
        ## compute beta shrinkage
        shrink <- with(dat,wakefield_null_pp(p.val,maf,n,n1/n))
        #M <- with(dat,simulate_beta(sm$sm[,order],log(or),emp_se,shrink,n_sims))
        M <- with(dat,simulate_beta(sm[,order],log(or),beta_se,shrink,n_sims))
      }else{
        #M <- with(dat,simulate_beta(sm$sm[,order],log(or),emp_se,1,n_sims))
        M <- with(dat,simulate_beta(sm[,order],log(or),beta_se,1,n_sims))
      }
      sims <- cbind(dat,M)
      sims <- melt(sims,id.vars=names(dat))
      return(sims[,c('or','trait'):=list(exp(value),paste(trait,variable,sep='_'))][,names(dat),with=FALSE])
    })
    chr.sims<-rbindlist(chr.sims)
  })
  all.chr<-rbindlist(all.chr)
  setkey(all.chr,pid)
}
ollyburren/cupcake documentation built on July 30, 2020, 9:58 a.m.