opts_chunk$set(comment = NA, results = 'markup', tidy = F, message = F, warning = F, echo = T,
  fig.width = 3, fig.height = 3)

Settings

Parameters

cores <- 2

Packages

library(dplyr)
library(Matrix)

library(ggplot2)

library(rrBLUP)
library(qqman)
library(devtools)
load_all("~/git/variani/athaliana")
load_all("~/git/hemostat/lme4qtl")

Settings

theme_set(theme_light())

Load the data

Phenotypes

phen <- athaliana_phen(traits = "FRI", rows_order = "snp")
kable(head(phen))
phen <- mutate(phen, 
  log_FRI = log(FRI))

Pre-computed GRM

relmat <- athaliana_relmat()
image(Matrix(relmat))
image(Matrix(relmat[1:5, 1:5]))
kable(relmat[1:5, 1:5], digits = 2)

Genotypes

gdat <- athaliana_snp()

gdat_subset <- gdat[1:20]
kable(head(select(gdat, 1:5)))

Annotation

annot <- athaliana_annot()

Polygenic model

(poly <- relmatLmer(log_FRI ~ (1|id), phen, relmat = list(id = relmat)))

Association study of a small subset of snps

eigen-based model

run_GWAS <- function(phen, gdat, ...) 
{
  gmat <- t(gdat[-1])
  ids <- gdat[[1]]
  colnames(gmat) <- ids

  pheno <- phen[c("ecotype_id", "log_FRI")]
  geno <- bind_cols(data_frame(snp = names(gdat)[-1], chr = 0, pos = 0), as_data_frame(gmat))

  GWAS(as.data.frame(pheno), as.data.frame(geno), ...)
}  
assoc_mm <- as_data_frame(run_GWAS(phen, gdat_subset, P3D = FALSE, n.core = cores, K = relmat, plot = FALSE))

lmer model

assoc_lmer <- assocLmer(log_FRI ~ (1|id), phen, relmat = list(id = relmat), 
  data_snpcov = gdat_subset, method = "Wald",
  batch_size = 10, cores = cores)

Association study on all snps

dir <- "/home/andrey/git/variani/athaliana/results/gwas/FRI"
load(file.path(dir, "gwas_rrblup_emmax.RData"))
load(file.path(dir, "gwas_rrblup_mm.RData"))
load(file.path(dir, "gwas_lmer_wald.RData"))
load(file.path(dir, "gwas_lmer_lr.RData")) 
tab <- data_frame(marker = gwas_rrblup_emmax$marker, 
  chrom = gwas_rrblup_emmax$chrom, pos = gwas_rrblup_emmax$pos,
  pval_emmax = 10^(-gwas_rrblup_emmax$log_FRI),
  pval_mm = 10^(-gwas_rrblup_mm$log_FRI))

tab <- bind_cols(tab, data_frame(snp = gwas_lmer_wald$snp, pval_wald = gwas_lmer_wald$pval))
tab <- bind_cols(tab, data_frame(pval_lr = gwas_lmer_lr$pval))

tab <- filter(tab, pval_emmax < 1 | pval_mm < 1)

Manhattan plot

manhattan(tab, chr = "chrom", bp = "pos", p = "pval_wald", snp = "snp", suggestiveline = FALSE, genomewideline = -log10(0.05 / nrow(tab)))

QQ plot

qq(tab$pval_wald)

Are p-values consistent across methods?

mm vs. emmax

The figure below compares two methods mm and emmax in terms of diffence in p-values at log scale. Given that mm is an exact method and emmax is an approximation, the left panel shows that emmax underestimates the significance of effects.

The right panel points out that this is generally the case for the most significant effects (located on the right to the vertical line at -log10(pval) equal to 5). These SNPs are likely to influence the estimation of variance components, i.e. heritability.

ggplot(tab, aes(-log10(pval_emmax), -log10(pval_mm))) + geom_point() +
  geom_abline(slope = 1, intercept = 0, linetype = 3) + coord_equal()

ggplot(tab, aes(-log10(pval_mm), -log10(pval_mm/pval_emmax))) + geom_point() +
  geom_hline(yintercept = 1, linetype = 2) + geom_vline(xintercept = 5, linetype = 2)

mm vs. wald

Two methods mm and wald follows the same approach of using mixed models:

The figure shows the perfect match of the results for the two methods. That means two implementations of the method in R packages rrBLUP and lme4qtl are consistent.

ggplot(tab, aes(-log10(pval_wald), -log10(pval_mm))) + geom_point() +
  geom_abline(slope = 1, intercept = 0, linetype = 3) + coord_equal()

wald vs. lr

Two mehtods implemented in the lme4qtl package employs different tests in the mixed model framework: the Wald test and the LRT.

The figure below shows the different in p-values for these two methods.

ggplot(tab, aes(-log10(pval_lr), -log10(pval_wald))) + geom_point() +
  geom_abline(slope = 1, intercept = 0, linetype = 3) + coord_equal()

ggplot(tab, aes(-log10(pval_wald), -log10(pval_wald/pval_lr))) + geom_point() +
  geom_hline(yintercept = 1, linetype = 2) + geom_vline(xintercept = 5, linetype = 2)

The previous figure is not enough to get an idea about the distribution of differences. The next two histograms give a clue on what is happening. One histogram on the left is for all the SNPs, and another on the right is for a subset of SNPs where the difference in p-values at log scale is between -2 and 2,

ggplot(tab, aes( -log10(pval_wald/pval_lr))) + geom_histogram()

ggplot(tab, aes( -log10(pval_wald/pval_lr))) + geom_histogram() + xlim(c(-2, 2))

How many SNPs have a relatively large diffrenece in p-values at log scale? r with(tab, sum(abs(-log10(pval_wald / pval_lr)) > 1)) SNPs have the absolute value more than 1, and r with(tab, sum(abs(-log10(pval_wald / pval_lr)) > 2)) SNPs have the absolute value more than 2. The total number of SNPs is r nrow(tab).

Overall, one can not say that one or another method underestimates the significance of effects, because the histrograms are symmetric in relation to zero. However, it is clear from the scatter plot that the lr method underestimates the effects for those top SNPs which show the most significant association.



variani/athaliana documentation built on May 3, 2019, 4:34 p.m.