R/main_geno.R

Defines functions SimGenoPower

Documented in SimGenoPower

#' Power Inference for GWAS Discovery in Admixed Populations
#'
#' Infers statistical power for discovery of variants in a 2-way admixed group, and the corresponding populations of ancestry origin.
#' @param n_snps Total number of SNPs (causal and non-causal). SNPs are assumed independent from each other. Default is 1000.
#' @param popsizes A vector of sample sizes for the admixed group, Pop 1 (ancestral origin from which the major component of ancestry comes), Pop 2 (ancestral origin from which the minor component of ancestry comes). Default is 1000 each.
#' @param mean_ganc Mean global ancestry from Pop 1 in the admixed group. Default is 0.75.
#' @param sd_ganc Standard deviation of the global ancestry distribution from Pop 1 in the admixed group. Default is 0.1.
#' @param prop_causal Proportion of causal variant among total SNPs. Default is 0.1.
#' @param hg2 Heritability of the trait. Default is 0.5.
#' @param fst The baseline fixation index FST among causal variants between Pop 1 and Pop2. Default is 0.2.
#' @param fst_constant True or False: Is FST among causal variants a constant value equaling the baseline (True) or have an increment with the rarity of ancestral minor allele frequency (False)? Default is True.
#' @param bg_fst Genomic background Fst value between 0 and 1 for non-causal variants. Default 'FALSE' to be set as the same as \code{fst}. If set to a value here, Fst of non-causal variants will be under this constant value.
#' @param freqdist Frequency distribution of ancestral variants (prior to the divergence of Pop 1 and 2). "unif" for a uniform distribution modeling (default), "inverse" for an inverse distribution modeling.
#' @param liability A vector of liability threshold for dichotomous traits in the admixed group, Pop 1, Pop2, respectively. If set as NA (default), the trait is assumed to be quantitative.
#' @param aaf Limits (min, max) on the frequency distribution of ancestral variants. Default is c(0.001, 0.999).
#' @param envanc Options to model environment by ancestry interactions: 0 means no interaction. 1 means interation is modeled as the sum of Gaussian sampling. 2 means the interaction is modeled as linear dependence on the ancestry proportion. Default is 1.
#' @param envvar percentage of the environment by ancestry interaction over the total phenotypic variance. Only effective when \code{envanc} is 2.
#' @param alpha Type I error rate. Default is 0.05.
#' @return A list of powers and the re-estimated heritability from simulation.\tabular{ll}{
#' \code{admixed_power} \tab Power for discovery in the admixed group\cr
#' \tab \cr
#' \code{pop1_power} \tab Power for discovery in Pop 1\cr
#' \tab \cr
#' \code{pop2_power} \tab Power for discovery in Pop 2\cr
#' \tab \cr
#' \code{h2} \tab Estimated heritability from simulated phenotypes and genotypes \cr
#' \tab \cr}
#'
#' @export
# this is the main function for genotype mediated simulation
SimGenoPower <-function(n_snps=1000,
                        popsizes=c(1e3, 1e3, 1e3),
                        mean_ganc=0.75,
                        sd_ganc=0.1,
                        prop_causal=0.1,
                        hg2=0.5,
                        fst=0.2,
                        fst_constant=T,
                        bg_fst=F,
                        freqdist="unif",
                        liability=NA,
                        aaf=c(1e-3, 0.999),
                        envanc=1,
                        envvar=0,
                        alpha=0.05){

  ptm <- proc.time()

  ###########################
  #       check input       #
  ###########################

  n.pop <- try(length(popsizes),silent=T)
  if(n.pop != 3) stop("Need sample size of two ancestral populations and the admixed population." )
  if (freqdist!="unif" & freqdist!="inverse") stop("Ancestral frequencies are drawn from either uniform distribution or inverse distribution.")
  if (!is.na(liability[1]) & sum(liability>0 & liability<1)!=3) stop("Liability for 3 populations should be a vector of 3 values (adx, pop1, pop2) between 0 and 1.
                                                                   Leave it as NA if for quantitative traits.")
  if (!envanc%in%c(0,1,2)) stop("Specify if environment by ancestry is modeled. 0: no. 1: gaussian. 2: linear.")
  if (envanc==2 & !(envvar>=0 & envvar<1)) stop("Specify the proportion of phenotypic variance explained by environment by ancestry (linear).")
  if (bg_fst!=F & !(bg_fst>=0 & bg_fst<=1)) stop("Background Fst can either be a value between 0 and 1, or turned off as 'FALSE' to be the same as causal variants")
  ######################################
  #     draw ancestry and genotypes    #
  ######################################
  print("Simulating ancestry and genotypes.")

  pop1_idvs <- popsizes[1]
  pop2_idvs <- popsizes[2]
  adx_idvs <- popsizes[3]

  #global
  ganc <- GAncBeta(mean_ganc, sd_ganc^2, adx_idvs)

  #local ancestry per marker: 0/1/2 copies of pop1 ancestry
  lanc_p <- matrix(rbinom(adx_idvs*n_snps, 1, ganc), nrow=adx_idvs, ncol=n_snps)
  lanc_m <- matrix(rbinom(adx_idvs*n_snps, 1, ganc), nrow=adx_idvs, ncol=n_snps)
  lanc_pop1 <- lanc_p + lanc_m

  #set freq per ancestral population at each locus, given fst and human ancestral allele maf
  pop1.frqs <- rep(NA, n_snps)
  pop2.frqs <- rep(NA, n_snps)
  causal_index <- sample(c(1:n_snps), size=round(n_snps*prop_causal), replace=F)
  for (s in 1:n_snps) {
    # set allele frequencies for ancestries
    p.pop1 <- 0
    p.pop2 <- 0
    if(freqdist=="unif"){
      anc.frq <- runif(1, min=aaf[1], max=aaf[2])
    }else if(freqdist=="inverse"){
      anc.frq <- rinverse(1, minMAF=1e-3)
    }

    anc.maf <- 0.5 - abs(0.5-anc.frq)

    if(fst_constant){
      Fst <- fst
    }else{
      Fst <- fst + (1-anc.maf)*0.3 # the smaller maf, the bigger fst
    }

    if(bg_fst==F) bg_fst <- Fst # if causal and non-causal have the same fst
    fst_snp <- c(bg_fst, Fst)[as.numeric(s%in%causal_index)+1] # Fst of this SNP
    
    #Balding-Nichol model
    while((p.pop1 <= 0 || p.pop1 >= 1) || p.pop2 <= 0 || p.pop2 >= 1){
      p.pop1 <- rbeta(1, anc.frq*(1-fst_snp)/fst_snp,(1-anc.frq)*(1-fst_snp)/fst_snp)
      p.pop2 <- rbeta(1, anc.frq*(1-fst_snp)/fst_snp,(1-anc.frq)*(1-fst_snp)/fst_snp)
    }
    pop1.frqs[s] <- p.pop1
    pop2.frqs[s] <- p.pop2
  }


  # set genotypes based on frequency
  #1. pop1 and pop2
  #pop1_genos <- matrix(rbinom(n_snps*pop1_idvs, 1, pop1.frqs), byrow=T, nrow=pop1_idvs, ncol=n_snps) +
  #  matrix(rbinom(n_snps*pop1_idvs, 1, pop1.frqs), byrow=T, nrow=adx_idvs, ncol=n_snps)
  pop1_genos <- matrix(rbinom(n_snps*pop1_idvs, 1, pop1.frqs), byrow=T, nrow=pop1_idvs, ncol=n_snps) +
    matrix(rbinom(n_snps*pop1_idvs, 1, pop1.frqs), byrow=T, nrow=pop1_idvs, ncol=n_snps)
  
  pop2_genos <- matrix(rbinom(n_snps*pop2_idvs, 1, pop2.frqs), byrow=T, nrow=pop2_idvs, ncol=n_snps) +
    matrix(rbinom(n_snps*pop2_idvs, 1, pop2.frqs), byrow=T, nrow=pop2_idvs, ncol=n_snps)

  #2. admixed population
  adx_genos <- t(sapply(c(1:adx_idvs), FUN=LAnc2Geno, lanc_p=lanc_p,lanc_m=lanc_m, frq_pop1=pop1.frqs, frq_pop2=pop2.frqs))
  adx.frqs <- apply(adx_genos, MARGIN=2, FUN=af)

  ##############################################
  #  simulating phenotype based on prs and h2  #
  ##############################################
  print("Simulating phenotypes.")

  # effect size is tied to fst+ancestral maf
  # only causal variants are assigned with weights; assuming trait is inverse normalized
  #weights <- rbinom(n_snps,1,prop_causal) * rnorm(n_snps, 0, 1) * ((rbinom(n_snps, 1, pop1.frqs/(pop1.frqs+pop2.frqs))-0.5)*2)
  
  if_causal <- rep(0, n_snps)
  if_causal[causal_index]<-1
  weights <- if_causal * rnorm(n_snps, 0, 1) * ((rbinom(n_snps, 1, pop1.frqs/(pop1.frqs+pop2.frqs))-0.5)*2)

  adx_prs <- apply(adx_genos,MARGIN=1,FUN=indiv_prs,weights=weights)
  pop1_prs <- apply(pop1_genos, MARGIN=1,FUN=indiv_prs,weights=weights)
  pop2_prs <- apply(pop2_genos, MARGIN=1,FUN=indiv_prs,weights=weights)

  # add environmental noise # including gaussian and ancestry associated
  pop1_phenos <- pop1_prs + rnorm(pop1_idvs, 0, 1) * sqrt(var(pop1_prs)*(1/hg2-1))
  pop2_phenos <- pop2_prs + rnorm(pop2_idvs, 0, 1) * sqrt(var(pop2_prs)*(1/hg2-1))
  if(envanc==1){# including gaussian and ancestry associated
    # make sure the h2 is not changed
    #scaling <- sqrt(var(adx_prs) / (var(adx_prs) + var(pop1_prs)*ganc^2 + var(pop2_prs)*((1-ganc)^2)))
    m_ganc <- mean(ganc)
    v_ganc <- var(ganc)
    scaling <- sqrt(var(adx_prs)/(var(adx_prs) + var(pop1_prs)*(m_ganc^2 + v_ganc) + var(pop2_prs)*(1-2*m_ganc + m_ganc^2 + v_ganc)))
    adx_phenos <- adx_prs +
      (rnorm(adx_idvs, 0, 1) * sqrt(var(adx_prs)*(1/hg2-1)) +
       rnorm(pop1_idvs, 0, 1) * sqrt(var(pop1_prs)*(1/hg2-1))*ganc +
       rnorm(pop2_idvs, 0, 1) * sqrt(var(pop2_prs)*(1/hg2-1))*(1-ganc))*scaling
  }else if(envanc==0){
    adx_phenos <- adx_prs + rnorm(adx_idvs, 0, 1) * sqrt(var(adx_prs)*(1/hg2-1))
  }else if(envanc==2){
    scaling <- sqrt((1-(hg2+envvar))/(1-hg2))
    if(scaling==0){# when p = 1-hg2; the original equation doesn't stand
      beta_theta <- sqrt(var(adx_prs)*(1-hg2)/(var(ganc)*hg2))
      adx_phenos <- adx_prs + beta_theta*ganc
    }else{
      beta_theta <- sqrt((envvar*(1-hg2)*var(adx_prs))/(hg2*(1-(hg2+envvar))*var(ganc)))
      adx_phenos <- adx_prs + scaling * (rnorm(adx_idvs, 0, 1) * sqrt(var(adx_prs)*(1/hg2-1)) +
                                           beta_theta*ganc)
    }
  }
  h2 <- var(adx_prs)/var(adx_phenos)

  # if the trait is dichotomous, converting from
  if(!is.na(liability[1])){
    adx_phenos <- q2b(adx_phenos, liability[1])
    pop1_phenos <- q2b(pop1_phenos, liability[2])
    pop2_phenos <- q2b(pop2_phenos, liability[3])
  }


  #############################################
  #  perform correlations on causal variants  #
  #############################################

  p_adx <-c()
  p_1<-c()
  p_2<-c()
  index_causal <- which(weights!=0)

  pb<- progress::progress_bar$new(format="Performing associations [:bar] :percent", total=length(index_causal),clear=F, width=80)
  for(i in index_causal){

    if(!is.na(liability[1])){#dichotomous

      if(var(adx_genos[,i])!=0){
        glm_temp <- summary(glm(adx_phenos~ganc + adx_genos[,i], family="binomial"))
        p_adx <- c(p_adx, glm_temp$coefficients[3,4])
      }else{
          p_adx <- c(p_adx, 1)
      }


      if(var(pop1_genos[,i])!=0){
          glm_temp <-summary(glm(pop1_phenos~ pop1_genos[,i], family="binomial"))
          p_1 <- c(p_1, glm_temp$coefficients[2, 4])
      }else{
          p_1 <- c(p_1, 1)
      }


      if(var(pop2_genos[,i])!=0){
        glm_temp <-summary(glm(pop2_phenos~ pop2_genos[,i], family="binomial"))
        p_2 <- c(p_2, glm_temp$coefficients[2, 4])
      }else{
        p_2 <- c(p_2, 1)
      }

    }else{# quantitative
      if(var(adx_genos[,i])==0){
        p_adx <- c(p_adx, 1)
      }else{
        lm_temp <- anova(lm(adx_phenos~ganc + adx_genos[,i]))
        p_adx <- c(p_adx, lm_temp$`Pr(>F)`[2])
      }

      if(var(pop1_genos[,i])==0){
        p_1<- c(p_1,1)
      }else{
        lm_temp <-anova(lm(pop1_phenos~ pop1_genos[,i]))
        p_1 <- c(p_1, lm_temp$`Pr(>F)`[1])
      }

      if(var(pop2_genos[,i])==0){
        p_2<- c(p_2,1)
      }else{
        lm_temp <-anova(lm(pop2_phenos~ pop2_genos[,i]))
        p_2 <- c(p_2, lm_temp$`Pr(>F)`[1])
      }
    }
    pb$tick()
  }

  admixed_power <- powers(p_adx, alpha)
  pop1_power <- powers(p_1, alpha)
  pop2_power <- powers(p_2, alpha)

  print(paste("Elapsed time: ", as.numeric((proc.time() - ptm)[3]), "s", sep=""))
  return(list(admixed_power=admixed_power, pop1_power=pop1_power, pop2_power=pop2_power, h2_adx=h2))

}
menglin44/APRICOT documentation built on July 1, 2023, 4:01 p.m.