Introduction to CSeQTL"

knitr::opts_chunk$set(
    collapse = TRUE,comment = "#>",
    echo = TRUE,cache = TRUE,
    dev = "png")

\def\T{\text{T}} \newcommand{\bf}[1]{\mathbf{#1}} \newcommand{\bigPar}[1]{\left(#1\right)} \newcommand{\bigCur}[1]{\left{#1\right}} \newcommand{\bcSqu}[2]{\left[#1 \middle| #2\right]} \newcommand{\cE}[2]{E\bcSqu{#1}{#2}} \newcommand{\nexp}[1]{\exp\bigCur{#1}} \newcommand{\ind}[1]{1\bigCur{#1}} \newcommand{\w}[1]{\widehat{#1}}

Overview

Assuming all software dependencies and CSeQTL [@little2023computational] installation are installed, we can begin.

# Load libraries
req_packs = c("ggplot2","smarter","CSeQTL")
for(pack in req_packs){
  library(package = pack,character.only = TRUE)
}

# List package's exported functions
ls("package:CSeQTL")

# Fix seed
set.seed(2)

Simulation: No eQTL

We will simulate a dataset with three cell types, reference allele differential expression, and no cell type-specific eQTLs.

# sample size
NN = 3e2

# fold-change between q-th and 1st cell type
true_KAPPA  = c(1,3,1)

# eQTL effect size per cell type, 
#   fold change between B and A allele
true_ETA    = c(1,1,1)

# cis/trans effect size
true_ALPHA  = c(1,1,1)

# count number of cell types
QQ = length(true_KAPPA)

# TReC model overdispersion
true_PHI = 0.1

# ASReC model overdispersion
true_PSI = 0.05

# Simulate cell type proportions
tRHO = gen_true_RHO(wRHO = 1,NN = NN,QQ = QQ)
plot_RHO(RHO = tRHO)
boxplot(tRHO,xlab = "Cell Type",ylab = "Proportion")

# Simulate a data object for a gene and SNP
sim = CSeQTL_dataGen(NN = NN,MAF = 0.3,true_BETA0 = log(1000),
  true_KAPPA = true_KAPPA,true_ETA = true_ETA,true_PHI = true_PHI,
  true_PSI = true_PSI,prob_phased = 0.05,true_ALPHA = true_ALPHA,
  RHO = tRHO,cnfSNP = TRUE,show = FALSE)

names(sim)

# TReC, ASReC, haplotype 2 counts
sim$dat[1:3,]

# Batch covariates including the intercept
sim$XX[1:3,]

sim$dat$SNP = sim$true_SNP
sim$dat$SNP = factor(sim$dat$SNP,
  levels = sort(unique(sim$dat$SNP)),
  labels = c("AA","AB","BA","BB"))

# TReC vs SNP
ggplot(data = sim$dat,
  mapping = aes(x = SNP,y = log10(total + 1))) +
  geom_violin(aes(fill = SNP)) + geom_jitter(width = 0.25) +
  geom_boxplot(width = 0.1) +
  xlab("Phased Genotype") + ylab("log10(TReC + 1)") +
  theme(legend.position = "right",
    axis.title = element_text(size = 15),
    axis.text = element_text(size = 12))

# TReC vs ASReC
ggplot(data = sim$dat,
  mapping = aes(x = total,y = total_phased,color = SNP)) +
  geom_point() + xlab("Total Read Count (TReC)") + 
  ylab("Total Allele-specific Read Count (ASReC)") +
  theme(legend.position = "right",
    axis.title = element_text(size = 15),
    axis.text = element_text(size = 12))

CSeQTL

Bulk Test

Based on the above TReC boxplot, the phased SNP genotype appears to be an eQTL. However, this has yet to account for batch effects. Let us test for a bulk eQTL accounting for batch covariates sim$XX. Make sure to center continuous covariates. Notice that CSeQTL, treating bulk TReC as sourced from a single cell type, corresponds to the TReCASE [@sun2012statistical] method.

summary(sim$XX)

mRHO = matrix(1,NN,1)
colnames(mRHO) = "Bulk"
cistrans_thres = 0.01
res = c()

for(trec_only in c(TRUE,FALSE)){
for(neg_binom in c(TRUE,FALSE)){
for(beta_binom in c(TRUE,FALSE)){

  cat(sprintf("%s: trec_only = %s, neg_binom = %s, beta_binom = %s ...\n",
    date(),trec_only,neg_binom,beta_binom))

  PHASE = sim$dat$phased * ifelse(trec_only,0,1)
  upPHI = ifelse(neg_binom,1,0)
  upPSI = ifelse(beta_binom,1,0)

  sout = CSeQTL_smart(TREC = sim$dat$total,hap2 = sim$dat$hap2,
    ASREC = sim$dat$total_phased,PHASE = PHASE,
    SNP = sim$true_SNP,RHO = mRHO,XX = sim$XX,upPHI = upPHI,
    upKAPPA = 1,upETA = 1,upPSI = upPSI,upALPHA = 1,
    iFullModel = FALSE,trim = FALSE,thres_TRIM = 10,
    hypotest = TRUE,swap = FALSE,numAS = 5,numASn = 5,
    numAS_het = 5,cistrans_thres = cistrans_thres)
  # sout$HYPO

  res = rbind(res,smart_df(Model = ifelse(trec_only,"TReC-only","TReCASE"),
    TReC_Dist = ifelse(neg_binom,"Negative Binomial","Poisson"),
    ASReC_Dist = ifelse(beta_binom,"Beta-Binomial","Binomial"),
    sout$res))
  rm(sout)

}}}

num_vars = which(unlist(lapply(res,function(xx) class(xx))) == "numeric")
res[,num_vars] = apply(res[,num_vars],2,function(xx) smart_SN(x = xx,digits = 2))
res$Interpret = apply(res[,c("cistrans","PVAL_eQTL")],1,function(xx){
  ct = xx[1]
  pval = as.numeric(xx[2])
  ifelse(pval < 0.05,sprintf("%s eQTL",ct),"no eQTL")
})
res$Correct_Model = apply(res[,c("TReC_Dist","ASReC_Dist")],1,function(xx){
  chk_trec = (( true_PHI > 0 & xx[1] == "Negative Binomial" )
    | ( true_PHI == 0 & xx[1] == "Poisson" ))
  chk_trec

  chk_asrec = (( true_PSI > 0 & xx[2] == "Beta-Binomial" )
    | ( true_PSI == 0 & xx[2] == "Binomial" ))

  chk_trec = ifelse(chk_trec,"TReC-Yes","TReC-No")
  chk_asrec = ifelse(chk_asrec,"ASReC-Yes","ASReC-No")
  sprintf("%s;%s",chk_trec,chk_asrec)
})

# Simplified output
smart_rmcols(res,c("LRT_trec","LRT_trecase","LRT_cistrans","cistrans"))

Thus a bulk approach to eQTL testing without accounting for cell type variation leads to a false positive association. Model misspecification can also lead to inflated type 1 error.

Cell type-specific Testing

The code below adjusts for cell type proportions and tests for cell type-specific eQTLs.

cistrans_thres = 0.01
res = c()

for(trec_only in c(TRUE,FALSE)){
for(neg_binom in c(TRUE,FALSE)){
for(beta_binom in c(TRUE,FALSE)){

  cat(sprintf("%s: trec_only = %s, neg_binom = %s, beta_binom = %s ...\n",
    date(),trec_only,neg_binom,beta_binom))

  PHASE = sim$dat$phased * ifelse(trec_only,0,1)
  upPHI = ifelse(neg_binom,1,0)
  upPSI = ifelse(beta_binom,1,0)
  upKAPPA = rep(1,QQ)
  upETA = upKAPPA
  upALPHA = upETA

  sout = CSeQTL_smart(TREC = sim$dat$total,hap2 = sim$dat$hap2,
    ASREC = sim$dat$total_phased,PHASE = PHASE,
    SNP = sim$true_SNP,RHO = sim$true_RHO,XX = sim$XX,upPHI = upPHI,
    upKAPPA = upKAPPA,upETA = upETA,upPSI = upPSI,upALPHA = upALPHA,
    iFullModel = FALSE,trim = FALSE,thres_TRIM = 10,
    hypotest = TRUE,swap = FALSE,numAS = 5,numASn = 5,
    numAS_het = 5,cistrans_thres = cistrans_thres)
  # sout$HYPO

  res = rbind(res,smart_df(Model = ifelse(trec_only,"TReC-only","TReCASE"),
    TReC_Dist = ifelse(neg_binom,"Negative Binomial","Poisson"),
    ASReC_Dist = ifelse(beta_binom,"Beta-Binomial","Binomial"),
    sout$res))
  rm(sout)

}}}

num_vars = which(unlist(lapply(res,function(xx) class(xx))) == "numeric")
res[,num_vars] = apply(res[,num_vars],2,function(xx) smart_SN(x = xx,digits = 2))
res$Interpret = apply(res[,c("cistrans","PVAL_eQTL")],1,function(xx){
  ct = xx[1]
  pval = as.numeric(xx[2])
  ifelse(pval < 0.05,sprintf("%s eQTL",ct),"no eQTL")
})
res$Correct_Model = apply(res[,c("TReC_Dist","ASReC_Dist")],1,function(xx){
  chk_trec = (( true_PHI > 0 & xx[1] == "Negative Binomial" )
    | ( true_PHI == 0 & xx[1] == "Poisson" ))
  chk_trec

  chk_asrec = (( true_PSI > 0 & xx[2] == "Beta-Binomial" )
    | ( true_PSI == 0 & xx[2] == "Binomial" ))

  chk_trec = ifelse(chk_trec,"TReC-Yes","TReC-No")
  chk_asrec = ifelse(chk_asrec,"ASReC-Yes","ASReC-No")
  sprintf("%s;%s",chk_trec,chk_asrec)
})

# Simplified output
smart_rmcols(res,c("LRT_trec","LRT_trecase","LRT_cistrans","cistrans"))

From above, we see that modeling cell types through proportion and reference allele expression and correctly specifying the distribution of TReC and ASReC leads to no association or no cell type specific eQTLs.

OLS

We benchmark CSeQTL against OLS, an ordinary least squares model. This approach corresponds to Matrix eQTL [@shabalin2012matrix].

Bulk eQTL

For a bulk eQTL, the model adjusts for genotype and confounders. The code below fits and tests for a bulk eQTL using the simulated dataset.

ols_out = CSeQTL_linearTest(input = sim$dat,XX = sim$XX,
  RHO = sim$true_RHO,SNP = sim$true_SNP,MARG = TRUE)
names(ols_out)

# Model estimates
summary(ols_out$lm_out)

# Effect sizes and hypothesis test
ols_out$out_df

Similar to CSeQTL's bulk testing, we have a false positive bulk eQTL.

Cell type-specific eQTLs

The code below will test for cell type-specific eQTLs with OLS.

ols_out = CSeQTL_linearTest(input = sim$dat,XX = sim$XX,
  RHO = sim$true_RHO,SNP = sim$true_SNP,MARG = FALSE)
names(ols_out)

# Model estimates
summary(ols_out$lm_out)

# Effect sizes and hypothesis tests
ols_out$out_df

Future Sections to create

Session Info

sessionInfo()

References



Try the CSeQTL package in your browser

Any scripts or data that you put into this service are public.

CSeQTL documentation built on April 24, 2026, 9:06 a.m.