knitr::opts_chunk$set( collapse = TRUE,comment = "#>", echo = TRUE,cache = TRUE, dev = "png")
\def\T{\text{T}} \newcommand{\bf}[1]{\mathbf{#1}} \newcommand{\bigPar}[1]{\left(#1\right)} \newcommand{\bigCur}[1]{\left{#1\right}} \newcommand{\bcSqu}[2]{\left[#1 \middle| #2\right]} \newcommand{\cE}[2]{E\bcSqu{#1}{#2}} \newcommand{\nexp}[1]{\exp\bigCur{#1}} \newcommand{\ind}[1]{1\bigCur{#1}} \newcommand{\w}[1]{\widehat{#1}}
Assuming all software dependencies and CSeQTL [@little2023computational] installation are installed, we can begin.
# Load libraries req_packs = c("ggplot2","smarter","CSeQTL") for(pack in req_packs){ library(package = pack,character.only = TRUE) } # List package's exported functions ls("package:CSeQTL") # Fix seed set.seed(2)
We will simulate a dataset with three cell types, reference allele differential expression, and no cell type-specific eQTLs.
# sample size NN = 3e2 # fold-change between q-th and 1st cell type true_KAPPA = c(1,3,1) # eQTL effect size per cell type, # fold change between B and A allele true_ETA = c(1,1,1) # cis/trans effect size true_ALPHA = c(1,1,1) # count number of cell types QQ = length(true_KAPPA) # TReC model overdispersion true_PHI = 0.1 # ASReC model overdispersion true_PSI = 0.05 # Simulate cell type proportions tRHO = gen_true_RHO(wRHO = 1,NN = NN,QQ = QQ) plot_RHO(RHO = tRHO) boxplot(tRHO,xlab = "Cell Type",ylab = "Proportion") # Simulate a data object for a gene and SNP sim = CSeQTL_dataGen(NN = NN,MAF = 0.3,true_BETA0 = log(1000), true_KAPPA = true_KAPPA,true_ETA = true_ETA,true_PHI = true_PHI, true_PSI = true_PSI,prob_phased = 0.05,true_ALPHA = true_ALPHA, RHO = tRHO,cnfSNP = TRUE,show = FALSE) names(sim) # TReC, ASReC, haplotype 2 counts sim$dat[1:3,] # Batch covariates including the intercept sim$XX[1:3,] sim$dat$SNP = sim$true_SNP sim$dat$SNP = factor(sim$dat$SNP, levels = sort(unique(sim$dat$SNP)), labels = c("AA","AB","BA","BB")) # TReC vs SNP ggplot(data = sim$dat, mapping = aes(x = SNP,y = log10(total + 1))) + geom_violin(aes(fill = SNP)) + geom_jitter(width = 0.25) + geom_boxplot(width = 0.1) + xlab("Phased Genotype") + ylab("log10(TReC + 1)") + theme(legend.position = "right", axis.title = element_text(size = 15), axis.text = element_text(size = 12)) # TReC vs ASReC ggplot(data = sim$dat, mapping = aes(x = total,y = total_phased,color = SNP)) + geom_point() + xlab("Total Read Count (TReC)") + ylab("Total Allele-specific Read Count (ASReC)") + theme(legend.position = "right", axis.title = element_text(size = 15), axis.text = element_text(size = 12))
Based on the above TReC boxplot, the phased SNP genotype appears to be an eQTL. However, this has yet to account for batch effects. Let us test for a bulk eQTL accounting for batch covariates sim$XX. Make sure to center continuous covariates. Notice that CSeQTL, treating bulk TReC as sourced from a single cell type, corresponds to the TReCASE [@sun2012statistical] method.
summary(sim$XX) mRHO = matrix(1,NN,1) colnames(mRHO) = "Bulk" cistrans_thres = 0.01 res = c() for(trec_only in c(TRUE,FALSE)){ for(neg_binom in c(TRUE,FALSE)){ for(beta_binom in c(TRUE,FALSE)){ cat(sprintf("%s: trec_only = %s, neg_binom = %s, beta_binom = %s ...\n", date(),trec_only,neg_binom,beta_binom)) PHASE = sim$dat$phased * ifelse(trec_only,0,1) upPHI = ifelse(neg_binom,1,0) upPSI = ifelse(beta_binom,1,0) sout = CSeQTL_smart(TREC = sim$dat$total,hap2 = sim$dat$hap2, ASREC = sim$dat$total_phased,PHASE = PHASE, SNP = sim$true_SNP,RHO = mRHO,XX = sim$XX,upPHI = upPHI, upKAPPA = 1,upETA = 1,upPSI = upPSI,upALPHA = 1, iFullModel = FALSE,trim = FALSE,thres_TRIM = 10, hypotest = TRUE,swap = FALSE,numAS = 5,numASn = 5, numAS_het = 5,cistrans_thres = cistrans_thres) # sout$HYPO res = rbind(res,smart_df(Model = ifelse(trec_only,"TReC-only","TReCASE"), TReC_Dist = ifelse(neg_binom,"Negative Binomial","Poisson"), ASReC_Dist = ifelse(beta_binom,"Beta-Binomial","Binomial"), sout$res)) rm(sout) }}} num_vars = which(unlist(lapply(res,function(xx) class(xx))) == "numeric") res[,num_vars] = apply(res[,num_vars],2,function(xx) smart_SN(x = xx,digits = 2)) res$Interpret = apply(res[,c("cistrans","PVAL_eQTL")],1,function(xx){ ct = xx[1] pval = as.numeric(xx[2]) ifelse(pval < 0.05,sprintf("%s eQTL",ct),"no eQTL") }) res$Correct_Model = apply(res[,c("TReC_Dist","ASReC_Dist")],1,function(xx){ chk_trec = (( true_PHI > 0 & xx[1] == "Negative Binomial" ) | ( true_PHI == 0 & xx[1] == "Poisson" )) chk_trec chk_asrec = (( true_PSI > 0 & xx[2] == "Beta-Binomial" ) | ( true_PSI == 0 & xx[2] == "Binomial" )) chk_trec = ifelse(chk_trec,"TReC-Yes","TReC-No") chk_asrec = ifelse(chk_asrec,"ASReC-Yes","ASReC-No") sprintf("%s;%s",chk_trec,chk_asrec) }) # Simplified output smart_rmcols(res,c("LRT_trec","LRT_trecase","LRT_cistrans","cistrans"))
Thus a bulk approach to eQTL testing without accounting for cell type variation leads to a false positive association. Model misspecification can also lead to inflated type 1 error.
The code below adjusts for cell type proportions and tests for cell type-specific eQTLs.
cistrans_thres = 0.01 res = c() for(trec_only in c(TRUE,FALSE)){ for(neg_binom in c(TRUE,FALSE)){ for(beta_binom in c(TRUE,FALSE)){ cat(sprintf("%s: trec_only = %s, neg_binom = %s, beta_binom = %s ...\n", date(),trec_only,neg_binom,beta_binom)) PHASE = sim$dat$phased * ifelse(trec_only,0,1) upPHI = ifelse(neg_binom,1,0) upPSI = ifelse(beta_binom,1,0) upKAPPA = rep(1,QQ) upETA = upKAPPA upALPHA = upETA sout = CSeQTL_smart(TREC = sim$dat$total,hap2 = sim$dat$hap2, ASREC = sim$dat$total_phased,PHASE = PHASE, SNP = sim$true_SNP,RHO = sim$true_RHO,XX = sim$XX,upPHI = upPHI, upKAPPA = upKAPPA,upETA = upETA,upPSI = upPSI,upALPHA = upALPHA, iFullModel = FALSE,trim = FALSE,thres_TRIM = 10, hypotest = TRUE,swap = FALSE,numAS = 5,numASn = 5, numAS_het = 5,cistrans_thres = cistrans_thres) # sout$HYPO res = rbind(res,smart_df(Model = ifelse(trec_only,"TReC-only","TReCASE"), TReC_Dist = ifelse(neg_binom,"Negative Binomial","Poisson"), ASReC_Dist = ifelse(beta_binom,"Beta-Binomial","Binomial"), sout$res)) rm(sout) }}} num_vars = which(unlist(lapply(res,function(xx) class(xx))) == "numeric") res[,num_vars] = apply(res[,num_vars],2,function(xx) smart_SN(x = xx,digits = 2)) res$Interpret = apply(res[,c("cistrans","PVAL_eQTL")],1,function(xx){ ct = xx[1] pval = as.numeric(xx[2]) ifelse(pval < 0.05,sprintf("%s eQTL",ct),"no eQTL") }) res$Correct_Model = apply(res[,c("TReC_Dist","ASReC_Dist")],1,function(xx){ chk_trec = (( true_PHI > 0 & xx[1] == "Negative Binomial" ) | ( true_PHI == 0 & xx[1] == "Poisson" )) chk_trec chk_asrec = (( true_PSI > 0 & xx[2] == "Beta-Binomial" ) | ( true_PSI == 0 & xx[2] == "Binomial" )) chk_trec = ifelse(chk_trec,"TReC-Yes","TReC-No") chk_asrec = ifelse(chk_asrec,"ASReC-Yes","ASReC-No") sprintf("%s;%s",chk_trec,chk_asrec) }) # Simplified output smart_rmcols(res,c("LRT_trec","LRT_trecase","LRT_cistrans","cistrans"))
From above, we see that modeling cell types through proportion and reference allele expression and correctly specifying the distribution of TReC and ASReC leads to no association or no cell type specific eQTLs.
We benchmark CSeQTL against OLS, an ordinary least squares model. This approach corresponds to Matrix eQTL [@shabalin2012matrix].
For a bulk eQTL, the model adjusts for genotype and confounders. The code below fits and tests for a bulk eQTL using the simulated dataset.
ols_out = CSeQTL_linearTest(input = sim$dat,XX = sim$XX, RHO = sim$true_RHO,SNP = sim$true_SNP,MARG = TRUE) names(ols_out) # Model estimates summary(ols_out$lm_out) # Effect sizes and hypothesis test ols_out$out_df
Similar to CSeQTL's bulk testing, we have a false positive bulk eQTL.
The code below will test for cell type-specific eQTLs with OLS.
ols_out = CSeQTL_linearTest(input = sim$dat,XX = sim$XX, RHO = sim$true_RHO,SNP = sim$true_SNP,MARG = FALSE) names(ols_out) # Model estimates summary(ols_out$lm_out) # Effect sizes and hypothesis tests ols_out$out_df
sessionInfo()
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.