knitr::opts_chunk$set( collapse = TRUE, comment = "#>", eval=FALSE )
library(simulateGP) library(mvtnorm) library(tidyverse)
If a set of SNPs have some effect on y, and those SNPs are correlated, can we calculate the expected marginal effect sizes?
$$ \textbf{y} = \textbf{Xb} + \textbf{e} $$
Intercept ignored for simplicity, so the genotyped matrix $\textbf{X}$ needs to be centered by subtracting the mean genotype value ($2p_j$) from each genotype.
Linear regression is solved by:
$$ \hat{\textbf{b}} = \textbf{(X'X)}^{-1}\textbf{X'y} $$
where
$$ var(\hat{\textbf{b}}) = \sigma^2_j (\textbf{X'X})^{-1} $$
and $\sigma^2_j$ is the residual variance given by
$$ (1-R^2)var(y) $$
and
$$ R^2 = \frac{\hat{\boldsymbol{b}}' \boldsymbol{X}'\boldsymbol{y} \boldsymbol{\hat{\beta}}}{var(y)} $$
We can obtain the causal effects from the marginal estimates using:
$$ \boldsymbol{\hat{b}} = \boldsymbol{(X'X)^{-1}D\hat{\beta}} $$
Where $\boldsymbol{D}$ is the diagonal matrix of $\boldsymbol{X'X}$ (i.e. the sums of squares for each of the SNPs).
Therefore, to generate a set of marginal estimates from a given set of causal effects and a set of correlated SNPs:
$$ \boldsymbol{\hat{\beta}} = \boldsymbol{\hat{b}X'XD^{-1}} $$
We would like to generate this just from a correlation matrix, rather than $\boldsymbol{X'X}$ which is the matrix of the sums of squares of X. Given that
$$ \rho_{jk} = \frac{\sum^n{ x_j x_k}}{\sqrt{\sum^n{x_j^2}\sum^n{x_k^2}}} $$
We can generate the $\boldsymbol{X'X}$ matrix by first calculating the sums of squares of each of the SNPs such that, which is represented in the diagonal matrix $D$
$$ D_{j,j} = \sum^n{x_j} = 2p_j(1-p_j)n $$
and re-writing X'X as
$$ \boldsymbol{X'X} = \boldsymbol{\rho} \sqrt{\boldsymbol{D}} $$
Finally
$$ \boldsymbol{\beta} = \boldsymbol{D^{-1/2}} \boldsymbol{\rho} \boldsymbol{D^{1/2}} \boldsymbol{b} $$
To generate the standard errors we need to allow for the sampling variance of each neighboring SNP to be correlated, such that
$$ \boldsymbol{Cov(\hat{\beta})} = \boldsymbol{S} \boldsymbol{\rho} \boldsymbol{S} $$
Where S is a diagonal matrix of expected marginal standard errors for each SNP.
calcs <- function(x, y, b) { xpx <- t(x) %*% x D <- matrix(0, ncol(x), ncol(x)) diag(D) <- diag(xpx) betahat <- gwas(y, x)$bhat bhat <- drop(solve(xpx) %*% D %*% betahat) betahatc <- b %*% xpx %*% solve(D) %>% drop rho <- cor(x) betahatrho <- b %*% rho %>% drop tibble(b, bhat, betahat, betahatc, betahatrho) } n <- 10000 nsnp <- 20 sigma <- matrix(0.7, nsnp, nsnp) diag(sigma) <- 1 x <- rmvnorm(n, rep(0, nsnp), sigma) b <- rnorm(nsnp) * 100 y <- x %*% b + rnorm(n) res <- calcs(x, y, b) plot(res$b ~ res$bhat) plot(res$betahat ~ res$betahatc) plot(res$betahat ~ res$betahatrho)
This works well. Now try with two correlated binomial variables
nsnp <- 2 x <- simulateGP:::correlated_binomial(n, 0.3, 0.3, 0.7) b <- rnorm(nsnp) y <- x %*% b + rnorm(n) res <- calcs(x, y, b) res
Not working so well now. Try scaling
x <- simulateGP:::correlated_binomial(n, 0.3, 0.3, 0.7) %>% scale b <- rnorm(nsnp) y <- x %*% b + rnorm(n) res <- calcs(x, y, b) res
This works - it's because there is no intercept term in the model!
Now try with LD reference panel data
read_x <- function(variants, bfile, plink_bin=genetics.binaRies::get_plink_binary()) { # Make textfile shell <- ifelse(Sys.info()['sysname'] == "Windows", "cmd", "sh") fn <- tempfile() write.table(data.frame(variants), file=fn, row.names=F, col.names=F, quote=F) fun1 <- paste0( shQuote(plink_bin, type=shell), " --bfile ", shQuote(bfile, type=shell), " --extract ", shQuote(fn, type=shell), " --recode A ", " --out ", shQuote(fn, type=shell) ) system(fun1) x <- data.table::fread(paste0(fn, ".raw")) %>% {.[,-c(1:6)]} %>% as.matrix() unlink(paste0(fn, ".raw")) return(x) } greedy_remove <- function(r) { diag(r) <- 0 flag <- 1 rem <- c() nom <- colnames(r) while(flag == 1) { message("iteration") count <- apply(r, 2, function(x) sum(x >= 0.99)) if(any(count > 0)) { worst <- which.max(count)[1] rem <- c(rem, names(worst)) r <- r[-worst,-worst] } else { flag <- 0 } } return(which(nom %in% rem)) } pop <- "EUR" ldref <- paste0("/Users/gh13047/repo/mr-base-api/app/ld_files/", pop) bim <- data.table::fread(paste0(ldref, ".bim")) regionfile <- paste0("/Users/gh13047/repo/gwasglue/inst/extdata/ldetect/", pop, ".bed") regions <- data.table::fread(regionfile, header=TRUE) %>% dplyr::mutate( chr=as.numeric(gsub("chr", "", chr)), start=as.numeric(start), stop=as.numeric(stop) ) %>% dplyr::as_tibble() region1 <- subset(bim, V1 == regions$chr[1] & V4 > regions$start[1] & V4 < regions$stop[1])$V2 x <- read_x(region1, ldref)[,1008:1030]
When calculating causal effects from marginal effects need to invert the correlation matrix. To avoid matrix being singular, remove any correlations that are >= 0.99
r <- cor(x) rem <- greedy_remove(r) x <- x[,-rem] # x <- scale(x) normalise_x <- function(x) { p <- colMeans(x) t(t(x) - p) } xn <- normalise_x(x) xs <- scale(x) colMeans(xn) colMeans(xs) colMeans(x) apply(x, 2, var) apply(xn, 2, var) apply(xs, 2, var) xo <- x x <- xn x <- xs x <- xo
Simulate
n <- nrow(x) nsnp <- ncol(x) b <- rnorm(nsnp) * 100 y <- x %*% b + rnorm(n) res <- calcs(x, y, b) res plot(res$b ~ res$bhat)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.