opts_chunk$set(comment = NA, results = 'markup', tidy = F, message = F, warning = F, echo = T, fig.width = 3, fig.height = 3)
cores <- 2
library(dplyr) library(Matrix) library(ggplot2) library(rrBLUP) library(qqman)
library(devtools) load_all("~/git/variani/athaliana") load_all("~/git/hemostat/lme4qtl")
theme_set(theme_light())
phen <- athaliana_phen(traits = "FRI", rows_order = "snp")
kable(head(phen))
phen <- mutate(phen, log_FRI = log(FRI))
relmat <- athaliana_relmat()
image(Matrix(relmat)) image(Matrix(relmat[1:5, 1:5]))
kable(relmat[1:5, 1:5], digits = 2)
gdat <- athaliana_snp() gdat_subset <- gdat[1:20]
kable(head(select(gdat, 1:5)))
annot <- athaliana_annot()
(poly <- relmatLmer(log_FRI ~ (1|id), phen, relmat = list(id = relmat)))
run_GWAS <- function(phen, gdat, ...) { gmat <- t(gdat[-1]) ids <- gdat[[1]] colnames(gmat) <- ids pheno <- phen[c("ecotype_id", "log_FRI")] geno <- bind_cols(data_frame(snp = names(gdat)[-1], chr = 0, pos = 0), as_data_frame(gmat)) GWAS(as.data.frame(pheno), as.data.frame(geno), ...) }
assoc_mm <- as_data_frame(run_GWAS(phen, gdat_subset, P3D = FALSE, n.core = cores, K = relmat, plot = FALSE))
assoc_lmer <- assocLmer(log_FRI ~ (1|id), phen, relmat = list(id = relmat), data_snpcov = gdat_subset, method = "Wald", batch_size = 10, cores = cores)
dir <- "/home/andrey/git/variani/athaliana/results/gwas/FRI" load(file.path(dir, "gwas_rrblup_emmax.RData")) load(file.path(dir, "gwas_rrblup_mm.RData")) load(file.path(dir, "gwas_lmer_wald.RData")) load(file.path(dir, "gwas_lmer_lr.RData"))
tab <- data_frame(marker = gwas_rrblup_emmax$marker, chrom = gwas_rrblup_emmax$chrom, pos = gwas_rrblup_emmax$pos, pval_emmax = 10^(-gwas_rrblup_emmax$log_FRI), pval_mm = 10^(-gwas_rrblup_mm$log_FRI)) tab <- bind_cols(tab, data_frame(snp = gwas_lmer_wald$snp, pval_wald = gwas_lmer_wald$pval)) tab <- bind_cols(tab, data_frame(pval_lr = gwas_lmer_lr$pval)) tab <- filter(tab, pval_emmax < 1 | pval_mm < 1)
manhattan(tab, chr = "chrom", bp = "pos", p = "pval_wald", snp = "snp", suggestiveline = FALSE, genomewideline = -log10(0.05 / nrow(tab)))
qq(tab$pval_wald)
mm
vs. emmax
The figure below compares two methods mm
and emmax
in terms of diffence in p-values at log scale.
Given that mm
is an exact method and emmax
is an approximation,
the left panel shows that emmax
underestimates the significance of effects.
The right panel points out that this is generally the case for the most significant effects
(located on the right to the vertical line at -log10(pval)
equal to 5
).
These SNPs are likely to influence the estimation of variance components, i.e. heritability.
ggplot(tab, aes(-log10(pval_emmax), -log10(pval_mm))) + geom_point() + geom_abline(slope = 1, intercept = 0, linetype = 3) + coord_equal() ggplot(tab, aes(-log10(pval_mm), -log10(pval_mm/pval_emmax))) + geom_point() + geom_hline(yintercept = 1, linetype = 2) + geom_vline(xintercept = 5, linetype = 2)
mm
vs. wald
Two methods mm
and wald
follows the same approach of using mixed models:
The figure shows the perfect match of the results for the two methods.
That means two implementations of the method in R packages rrBLUP
and lme4qtl
are consistent.
ggplot(tab, aes(-log10(pval_wald), -log10(pval_mm))) + geom_point() + geom_abline(slope = 1, intercept = 0, linetype = 3) + coord_equal()
wald
vs. lr
Two mehtods implemented in the lme4qtl
package employs different tests in the mixed model framework:
the Wald test and the LRT.
The figure below shows the different in p-values for these two methods.
ggplot(tab, aes(-log10(pval_lr), -log10(pval_wald))) + geom_point() + geom_abline(slope = 1, intercept = 0, linetype = 3) + coord_equal() ggplot(tab, aes(-log10(pval_wald), -log10(pval_wald/pval_lr))) + geom_point() + geom_hline(yintercept = 1, linetype = 2) + geom_vline(xintercept = 5, linetype = 2)
The previous figure is not enough to get an idea about the distribution of differences.
The next two histograms give a clue on what is happening.
One histogram on the left is for all the SNPs,
and another on the right is for a subset of SNPs
where the difference in p-values at log scale is between -2
and 2
,
ggplot(tab, aes( -log10(pval_wald/pval_lr))) + geom_histogram() ggplot(tab, aes( -log10(pval_wald/pval_lr))) + geom_histogram() + xlim(c(-2, 2))
How many SNPs have a relatively large diffrenece in p-values at log scale?
r with(tab, sum(abs(-log10(pval_wald / pval_lr)) > 1))
SNPs have the absolute value more than 1
,
and
r with(tab, sum(abs(-log10(pval_wald / pval_lr)) > 2))
SNPs have the absolute value more than 2
.
The total number of SNPs is r nrow(tab)
.
Overall, one can not say that one or another method underestimates the significance of effects,
because the histrograms are symmetric in relation to zero.
However, it is clear from the scatter plot that the lr
method underestimates the effects
for those top SNPs which show the most significant association.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.