# Rmd setup: echo code in the rendered document; fix the RNG seed so the
# random SNP subsamples and simulated phenotypes below are reproducible.
knitr::opts_chunk$set(echo = TRUE)
set.seed(1)
## Load packages
library(devtools)
library(dplyr)
library(ggplot2)
library(cowplot)  # plot_grid() for composing figure panels
library(xtable)   # table export (only used in commented-out code below)

# Load the package in the working directory in development mode; it provides
# the helpers used below (meanvarcent(), ggdotscolor(), addggregression(),
# relative()) and the `genomes` dataset.
load_all('.')
# library('gws')
library(moiR)
library(bigsnpr)
library(bigstatsr)

The problem and the model

Given a genotyped population of $N$ individuals and $M$ SNPs, we have the genotype matrix $X$. From this matrix we can obtain the covariance matrix $V$ between all SNPs as $\frac{1}{N} X^T X$, i.e. the linkage disequilibrium matrix. The diagonal of this matrix is $D$.

For this population of genotypes, we have a Normally distributed phenotype $Y$. Thus, a regular multivariate linear model can be written as $Y = X\beta + \epsilon$, and the $\beta$ is typically solved in GWA studies marginally as $\beta_m = (X^TX)^{-1} X^T Y$. The problem comes when the different SNPs are not independent variables. In such a case, marginal effects will not reflect true effects. Biologically, this is because of a linkage disequilibrium confounder.

A trivial model extension can be done in the framework of multiple regression. The $\beta$ effects can be estimated accounting for the $V$ covariance matrix as: $\beta = (X^TX)^{-1} V^{-1} X^T Y$. This can also be estimated directly from the marginal effects as: $\beta=V^{-1}D\beta_m$.

The above development requires an invertible $V$. This is not possible when there are SNPs in complete linkage or when the number of SNPs is larger than the number of individuals, because $V$ is rank deficient. In such a case, an approximation of the inverse can be achieved in several ways. Higham's algorithm for finding the nearest positive definite matrix would yield, by definition, the closest invertible matrix. Cholesky factorization can help, but not always. Finally, we can use the Moore-Penrose generalized inverse of the matrix.


Application with simulated genotype and phenotype data

Get the genome data

### Real data
# genomes<-readRDS("genome.rda")
data(genomes)  # dataset shipped with the package loaded via load_all('.')

G <- genomes$genotypes
# Subsample 250 random SNP columns so the dense matrix algebra below stays fast
X. = G[,sample(1:ncol(G),size = 250)]
N=nrow(X.)  # number of individuals
M=ncol(X.)  # number of SNPs

### Dummy data
# Replace every entry with a random genotype dosage in {0,1,2} (no LD).
# BUG FIX: the original `Xdummy[is.numeric(Xdummy)] <- ...` indexed with a
# single scalar TRUE/FALSE (is.numeric() is not elementwise) and only worked
# by accidental recycling; `Xdummy[] <- ...` states the intent directly and
# preserves the matrix dimensions.
Xdummy<-X.
Xdummy[]<-sample(c(0,1,2),size = length(Xdummy),replace = TRUE)

### Decide real vs dummy
# Use the dummy (unlinked) genotypes for this first demonstration
X.<-Xdummy
hist(X., xlab="allele dosage",main="")

Mean center and variance scale

# Center and scale genome matrix
# Recode -1 entries to 1 (presumably an alternative heterozygote coding —
# TODO confirm against the genotype format) and mean-impute missing calls
# with the heterozygote dosage 1.
X. = apply(X., 2, function(x) { x[ x== (-1)] <- 1 ; x})
X. = apply(X., 2, function(x) { x[ is.na(x)] <- 1 ; x})
# X. = apply(X., 2, function(x) { x - 1 }) ## this centers the matrix in 0
# Column-wise standardization to mean 0 / sd 1; the 1e-10 guards against
# division by zero for monomorphic SNPs (sd == 0).
X = apply(X.,2, function(x) {
    mu=mean(x)
    sig=sd(x)+1e-10
    (x-mu)/sig
  } )

# Check it worked
# apply(X,2,mean)
# apply(X,2,var)

Calculation of LD based on raw covariation

# Calculate the SNP-by-SNP covariance (LD) matrix V = (1/N) X'X from the
# scaled genotypes. The original computed t(X) %*% X and then immediately
# overwrote it with the 1/N-scaled version; the dead first assignment has
# been removed.
V= 1/N * t(X) %*% X
D = diag(V)  # per-SNP variances (approximately 1 after scaling)

# Check it works, compared to base function in R
plot(var(X),V)

Generalized inverse matrix

# Moore-Penrose generalized inverse of the LD matrix: unlike solve(), this
# also works when V is rank deficient (SNPs in complete LD, or M > N), which
# is exactly the situation motivating this document. The original followed
# this line with `Vinv = solve(V)`, silently overwriting the generalized
# inverse and erroring whenever V is singular — that line has been removed.
Vinv=MASS::ginv(V)

Association with a Gaussian trait

set.seed(0)
# Simulate a purely environmental Gaussian phenotype (no genetic signal),
# then standardize it with the package helper meanvarcent().
Y = rnorm(N,mean=1,sd=1)
Y = meanvarcent(Y)
qplot(Y,geom="histogram",main="Gaussian phenotype")
Calculate coefficients
# Joint (multiple-regression OLS) coefficients
b= solve(t(X)%*%X) %*% t(X) %*% Y 
# Equivalent form via the LD matrix, since t(X) %*% X == N * V
b2=1/N * Vinv %*% t(X) %*% Y 
# b == b2
# round(b,digits = 2) == round(b2,digits = 2) # doublecheck

# First attempt at marginal effects — kept for the record but flagged as
# wrong in the author's own trailing comment (it mixes the multiple-
# regression X'X with D^{-1}); it is overwritten just below.
bgwa=  solve(t(X)%*%X) %*% solve(diag(D)) %*% t(X) %*% Y # the diagonal matrix not necessary, just visualization # Incorrect. This is multiple regression, so it generates partial regression coefficients.

# Marginal (single-SNP) GWA effects: one simple regression per SNP column.
bgwa=sapply(1:M,function(m) 
                  solve(t(X[,m])%*%X[,m]) %*% t(X[,m]) %*% Y
            )

# Per-SNP sampling variance of the marginal effects.
# NOTE(review): the trailing ^2 squares the residual vector before the cross
# product, which does not match the usual RSS / (N * x'x) estimator; the
# commented-out line below is the standard form — confirm which was intended.
var_bgwa= sapply(1:M,
                 function(m) 
                   solve(N*t(X[,m])%*%X[,m]) %*% t(Y - (X[,m]*bgwa[m])) %*% (Y - (X[,m]*bgwa[m]))^2
                   # solve(N*t(X[,m])%*%X[,m]) %*% t(Y - (X[,m]*bgwa[m])) %*% (Y - (X[,m]*bgwa[m]))
                 )

# Same per-SNP variance formula but using the joint coefficients b.
# NOTE(review): carries the same suspicious ^2 as above.
sd_b=sapply(1:M,
                 function(m) 
                   solve(N*t(X[,m])%*%X[,m]) %*% t(Y - (X[,m]*b[m])) %*% (Y - (X[,m]*b[m])) ^2
                   # N*Vinv %*% t(Y - (X[,m]*b[m])) %*% (Y - (X[,m]*b[m]))
                 )
# var_b= sd_b %*% Vinv


# NOTE(review): stray token — this just prints the body of stats::lm;
# almost certainly a leftover and safe to delete.
lm
# NOTE(review): var_b is not assigned until the "Calculate the residual
# variance" section further down (only a commented-out assignment exists
# above), so this plot errors if the document is run top to bottom — verify
# intended chunk order.
plot(var_b,b)
plot(var_bgwa,bgwa)




# Disabled per-SNP lm() doublechecks of the marginal effects and variances.
# bgwa.lm=sapply(1:M,function(m)  # Double check
#                    lm(Y~X[,m])$coefficients[2]
#             )
# 
# var_bgwa.lm= sapply(1:M,
#                  function(m) 
#                     sqrt(diag(vcov( lm(Y~X[,m]) )))[2] ^ 2
#                  )
# Full multiple regression via lm(); [-1] drops the intercept term so the
# coefficients align with the manual estimates above.
bg.lm=lm(Y~X)$coefficients[-1] 
var_b.lm=diag(vcov( lm(Y~X) ))[-1] 
# Compare lm()'s coefficient variances against the manual estimate
plot(
  var_b.lm,
   sd_b %*% Vinv
)

# Pairwise comparisons of manual vs lm() estimates
plot(bg.lm,b)
plot(var_b.lm,bg.lm)
plot(sd_b,var_b.lm)
plot(var_b,var_b.lm)




# Mean squared error of the marginal-effect predictions
MSE_bgwa= 1/N * sum( (Y - (X %*% bgwa))^2 )


plot(bgwa,var_bgwa)
# BUG FIX: bgwa.lm and var_bgwa.lm are only defined in commented-out code
# above, so this plot fails with "object 'bgwa.lm' not found". Disabled
# until those doublecheck objects are re-enabled.
# plot(bgwa.lm,var_bgwa.lm)


# Mean squared error of the joint (multiple-regression) predictions
MSE_b= 1/N * sum( (Y - (X %*% b))^2 )





# Marginal effects written with the diagonal explicitly: (1/N) D^{-1} X'Y
bgwa2=1/N*  solve(diag(D)) %*% t(X) %*% Y # the diagonal matrix not necessary, just visualization

# NOTE(review): repeats the formula flagged as incorrect earlier (mixes the
# multiple-regression X'X with D^{-1}); kept only for comparison plots.
bgwa3= solve(t(X)%*%X) %*% solve(diag(D)) %*% t(X) %*% Y # the diagonal matrix not necessary, just visualization

plot(bgwa,bgwa2)
plot(bgwa,bgwa3)
# NOTE(review): exact floating-point comparison; prefer all.equal().
bgwa==bgwa2
bgwa==bgwa3

# Spot-check SNP 1 against lm()
lm(Y~X[,1])$coefficients[2]
bgwa[1]
bgwa2[1]
bgwa3[1]
b[1]

plot(b,bgwa3)

plot(lm(Y~X[,1])$coefficients[2], bgwa[1])

# the diagonal matrix not necessary, just visualization
boxplot(Y~X.[,1])
lm(Y~X[,1])
bgwa[1]
b[1]

# Recover joint effects from the marginal ones: b = V^{-1} bgwa
# (cf. the model section's beta = V^{-1} D beta_m; D is ~ I after scaling)
b2= Vinv %*% bgwa 
b[1]

# NOTE(review): exact floating-point comparison; prefer all.equal().
b==b2

plot(b,b2)

# Scatter of joint vs marginal effects with a regression line, plus
# histograms of each set of coefficients underneath.
p0<-ggdotscolor(x=b,y=bgwa,
            ylab =  expression(beta[gwa]),
            xlab =  expression(beta)
            ) %>% addggregression(se=FALSE)
p0
p1<-ggplot(data.frame(bgwa)) +geom_histogram(aes(x=bgwa)) +xlab(expression(beta[gwa]))
# BUG FIX: the original built this histogram from data.frame(bgwa) while
# mapping aes(x=b); ggplot silently fell back to the global `b`. Pass
# data.frame(b) so the mapped column is the data actually supplied.
p2<-ggplot(data.frame(b)) +geom_histogram(aes(x=b)) +xlab(expression(beta))

plot_grid(p0,plot_grid(p1,p2,ncol=2),
          ncol=1)

Calculate the residual variance

# Residual variance (MSE) of the joint and marginal models, and the implied
# coefficient (co)variances.
MSE= 1/N * sum( (Y - (X %*% b))^2 )
var_b= rep(MSE,M) %*% solve(t(X) %*% X)

MSEgwa= 1/N * t(Y - (X %*% bgwa)) %*% (Y - (X %*% bgwa))
var_bgwa= solve(N*t(X)%*%X) %*% t(Y - (X %*% bgwa)) %*% (Y - (X %*% bgwa))


# BUG FIX: print() takes a single object, so print("msg", MSEgwa) never
# displayed the value (the second argument is consumed as a print option).
# cat() concatenates and prints both.
cat("The MSE of marginal analysis is:", MSEgwa, "\n")
cat("The MSE of conditional analysis is:", MSE, "\n")

# out=matrix(c(MSEgwa,MSE))
# rownames(out)<-c("bgwa","b")
# colnames(out)<-"MSE"
# tab <- xtable(out,digits=c(2,2))
# print(tab, type="html")

Association with a Gaussian trait generated from different architectures

Gaussian effects

# Gaussian architecture: one N(0,1) effect per SNP; the phenotype is the
# pure genetic value X. %*% eff (no environmental noise), standardized.
eff=rnorm(ncol(X),mean = 0,sd = 1)
Y=X. %*% eff # this is right!
Y = meanvarcent(Y)
qplot(Y,geom="histogram",main="Phenotype from Gaussian SNP effects")
Y.gauss=Y

Exponential effects

# Heavy-tailed architecture: unit-sd gamma effects with random signs.
eff=rgamma(ncol(X),0.1,0.1)
eff =eff /sd(eff)
eff=eff* sample(c(-1,1),ncol(X),replace = TRUE)
# NOTE(review): this line OVERWRITES the gamma effects above with a sparse
# -1/0/+1 architecture (~5% causal SNPs); the "exponential" labels used in
# the plots below therefore no longer match — confirm which was intended.
eff=sample(c(-1,0,1),replace = TRUE, prob = c(0.025,0.95,0.025),size = ncol(X))

Y=X. %*% eff # this is right!
Y = meanvarcent(Y)
qplot(Y,geom="histogram",main="Phenotype from exponential SNP effects")
Y.skewed=Y
Calculate coefficients
# Joint (V^{-1}-corrected) and marginal estimates for the Gaussian-
# architecture phenotype, following the formulas in the model section.
b= solve(t(X)%*%X) %*% Vinv %*% t(X) %*% Y.gauss # check if dividing is needed by individual
bgwa=  solve(t(X)%*%X) %*% t(X) %*% Y.gauss # the diagonal matrix not necessary, just visualization


# Estimated vs true effects: marginal (black) and corrected (red), each with
# a regression line and its correlation printed in the top-left corner.
p1<-ggplot(data.frame(eff,b,bgwa)) +
  geom_point(aes(x=eff,y=bgwa),shape=1) +
  stat_smooth(aes(x=eff,y=bgwa),method="glm",se=FALSE,col="black")+
  geom_text(aes(x=-Inf,y=+Inf,hjust=-1,vjust=3,label=format(cor.test(bgwa,eff)$estimate,digits=3)))+

  geom_point(aes(x=eff,y=b),shape=1,col="red") +
  stat_smooth(aes(x=eff,y=b),method="glm",se=FALSE,col="red")+
  geom_text(aes(x=-Inf,y=+Inf,hjust=-1,vjust=5,label=format(cor.test(b,eff)$estimate,digits=3),color="red"))+

  ylab(expression(paste("estimated ",beta)))+
  xlab("True effect")+
  guides(color=FALSE)
# p1

# Same comparison for the skewed/sparse-architecture phenotype.
b= solve(t(X)%*%X) %*% Vinv %*% t(X) %*% Y.skewed
bgwa=  solve(t(X)%*%X) %*% t(X) %*% Y.skewed

# Estimated vs true effects: marginal (black) and corrected (red).
p2<-ggplot(data.frame(eff,b,bgwa)) +
  geom_point(aes(x=eff,y=bgwa),shape=1) +
  stat_smooth(aes(x=eff,y=bgwa),method="glm",se=FALSE,col="black")+
  geom_text(aes(x=-Inf,y=+Inf,hjust=-1,vjust=3,label=format(cor.test(bgwa,eff)$estimate,digits=3)))+

  geom_point(aes(x=eff,y=b),shape=1,col="red") +
  stat_smooth(aes(x=eff,y=b),method="glm",se=FALSE,col="red")+
  geom_text(aes(x=-Inf,y=+Inf,hjust=-1,vjust=5,label=format(cor.test(b,eff)$estimate,digits=3),color="red"))+
  ylab(expression(paste("estimated ",beta)))+
  xlab("True effect")+
  guides(color=FALSE)
# p2

# Side-by-side panels for the two architectures
plot_grid(p1,p2,ncol=2,labels=c("Gaussian SNP effects","Exponential SNP effects"))

Application with real genotype data

### Real data
# Repeat of the preprocessing pipeline, now on the real genotypes: a fresh
# random draw of 250 SNPs (X., N and M are reassigned).
X. = G[,sample(1:ncol(G),size = 250)]
N=nrow(X.)
M=ncol(X.)

Mean center and variance scale

# Center and scale genome matrix
# Recode -1 to 1, mean-impute NAs with 1, then standardize each column;
# 1e-10 guards against sd == 0 for monomorphic SNPs.
X. = apply(X., 2, function(x) { x[ x== (-1)] <- 1 ; x})
X. = apply(X., 2, function(x) { x[ is.na(x)] <- 1 ; x})
# X. = apply(X., 2, function(x) { x - 1 }) ## this centers the matrix in 0
X = apply(X.,2, function(x) {
    mu=mean(x)
    sig=sd(x)+1e-10
    (x-mu)/sig
  } )

Calculation of LD based on raw covariation

# Calculate
V = 1/N * t(X) %*% X
D = diag(V)

# Check it works, compared to base function in R
plot(var(X),V)

Generalized inverse matrix

# Moore-Penrose inverse: real LD matrices are typically rank deficient,
# so solve() would fail here.
Vinv=MASS::ginv(V)

Association with a Gaussian trait generated from different architectures

Gaussian effects

# Gaussian architecture on the real genotypes: phenotype is X. %*% eff.
eff=rnorm(ncol(X),mean = 0,sd = 1)
Y=X. %*% eff # this is right!
Y = meanvarcent(Y)
qplot(Y,geom="histogram",main="Gaussian SNP effects")
Y.gauss=Y

Exponential effects

# Heavy-tailed architecture: unit-sd gamma effects with random signs.
eff=rgamma(ncol(X),0.1,0.1)
eff =eff /sd(eff)
eff=eff* sample(c(-1,1),ncol(X),replace = TRUE)
# NOTE(review): as in the simulated-data section, this OVERWRITES the gamma
# effects with a sparse -1/0/+1 architecture — confirm which was intended.
eff=sample(c(-1,0,1),replace = TRUE, prob = c(0.025,0.95,0.025),size = ncol(X))

Y=X. %*% eff # this is right!
Y = meanvarcent(Y)
qplot(Y,geom="histogram",main="Skewed SNP effects")
Y.skewed=Y
Calculate coefficients
# Joint (corrected) and marginal estimates for the Gaussian-architecture
# phenotype on real genotypes.
b= solve(t(X)%*%X) %*% Vinv %*% t(X) %*% Y.gauss # check if dividing is needed by individual
bgwa=  solve(t(X)%*%X) %*% t(X) %*% Y.gauss # the diagonal matrix not necessary, just visualization


# Estimated vs true effects: marginal (black) and corrected (red).
p1<-ggplot(data.frame(eff,b,bgwa)) +
  geom_point(aes(x=eff,y=bgwa),shape=1) +
  stat_smooth(aes(x=eff,y=bgwa),method="glm",se=FALSE,col="black")+
  geom_text(aes(x=-Inf,y=+Inf,hjust=-1,vjust=3,label=format(cor.test(bgwa,eff)$estimate,digits=3))) +

  geom_point(aes(x=eff,y=b),shape=1,col="red") +
  stat_smooth(aes(x=eff,y=b),method="glm",se=FALSE,col="red")+
  geom_text(aes(x=-Inf,y=+Inf,hjust=-1,vjust=5,label=format(cor.test(b,eff)$estimate,digits=3),color="red"))+

  ylab(expression(paste("estimated ",beta)))+
  xlab("True effect")+
  guides(color=FALSE)
# p1

# Same comparison for the skewed/sparse-architecture phenotype.
b= solve(t(X)%*%X) %*% Vinv %*% t(X) %*% Y.skewed
bgwa=  solve(t(X)%*%X) %*% t(X) %*% Y.skewed

# Estimated vs true effects: marginal (black) and corrected (red).
p2<-ggplot(data.frame(eff,b,bgwa)) +
  geom_point(aes(x=eff,y=bgwa),shape=1) +
  stat_smooth(aes(x=eff,y=bgwa),method="glm",se=FALSE,col="black")+
  geom_text(aes(x=-Inf,y=+Inf,hjust=-1,vjust=3,label=format(cor.test(bgwa,eff)$estimate,digits=3)))+

  geom_point(aes(x=eff,y=b),shape=1,col="red") +
  stat_smooth(aes(x=eff,y=b),method="glm",se=FALSE,col="red")+
  geom_text(aes(x=-Inf,y=+Inf,hjust=-1,vjust=5,label=format(cor.test(b,eff)$estimate,digits=3),color="red"))+
  ylab(expression(paste("estimated ",beta)))+
  xlab("True effect")+
  guides(color=FALSE)
# p2

# Side-by-side panels for the two architectures
plot_grid(p1,p2,ncol=2,labels=c("Gaussian SNP effects","Exponential SNP effects"))

Application with real genotype data and real fitness data

### Real data
# Third repeat of the preprocessing pipeline, this time for the real-fitness
# analysis (another fresh random draw of 250 SNPs).
X. = G[,sample(1:ncol(G),size = 250)]
N=nrow(X.)
M=ncol(X.)

Mean center and variance scale

# Center and scale genome matrix
# Recode -1 to 1, mean-impute NAs with 1, then standardize each column;
# 1e-10 guards against sd == 0 for monomorphic SNPs.
X. = apply(X., 2, function(x) { x[ x== (-1)] <- 1 ; x})
X. = apply(X., 2, function(x) { x[ is.na(x)] <- 1 ; x})
# X. = apply(X., 2, function(x) { x - 1 }) ## this centers the matrix in 0
X = apply(X.,2, function(x) {
    mu=mean(x)
    sig=sd(x)+1e-10
    (x-mu)/sig
  } )

Calculation of LD based on raw covariation

# Calculate
V = 1/N * t(X) %*% X
D = diag(V)

# Check it works, compared to base function in R
plot(var(X),V)

Generalized inverse matrix

# Moore-Penrose inverse (V is generally rank deficient for real data)
Vinv=MASS::ginv(V)

Association

Get phenotypes

# Join the survival/fitness phenotypes onto the genotyped samples by ID,
# keeping all genotyped individuals (all.x=T).
# NOTE(review): `dry` is not defined anywhere in this document — presumably
# a dataset provided by the package loaded with load_all('.'); confirm.
# NOTE(review): prefer TRUE over T (T is reassignable).
Y=merge(genomes$fam,by.x="sample.ID", dry[,c("id","Survival_fruit_mli")], by.y="id",all.x=T)$Survival_fruit_mli
Y=relative(Y)       # relative fitness (package helper)
Y = meanvarcent(Y)  # standardize to mean 0 / variance 1
qplot(Y,geom="histogram",main="Survival to reproduction (Madrid+drought)")
Y.bad=Y
Y.bad[is.na(Y.bad)]<-mean(Y.bad,na.rm=TRUE) # Mean impute

# Same for the benign environment (Tübingen + well-watered)
Y=merge(genomes$fam,by.x="sample.ID", dry[,c("id","Survival_fruit_thi")], by.y="id",all.x=T)$Survival_fruit_thi
Y=relative(Y)
Y = meanvarcent(Y)
qplot(Y,geom="histogram",main="Survival to reproduction (Tübingen+water)")
Y.good=Y
Y.good[is.na(Y.good)]<-mean(Y.good,na.rm=TRUE) # Mean impute
Calculate coefficients
# Corrected and marginal effects for survival under stress (Madrid+drought)
b= solve(t(X)%*%X) %*% Vinv %*% t(X) %*% Y.bad # check if dividing is needed by individual
bgwa=  solve(t(X)%*%X) %*% t(X) %*% Y.bad # the diagonal matrix not necessary, just visualization

# Scatter of joint vs marginal effects with zero reference lines
p1=ggdotscolor(b,bgwa,
            ylab =  expression(beta[gwa]),
            xlab =  expression(beta)
            ) %>% addggregression(se=FALSE)
p1=p1 + geom_hline(yintercept = 0,lty="dashed",size=0.2)+ geom_vline(xintercept = 0,lty="dashed",size=0.2)+ ggtitle("Madrid+Drought")
p2<-ggplot(data.frame(bgwa)) +geom_histogram(aes(x=bgwa)) +xlab(expression(beta[gwa]))
# NOTE(review): aes(x=b) is not a column of data.frame(bgwa); ggplot falls
# back to the global `b` — works by accident, prefer data.frame(b).
p3<-ggplot(data.frame(bgwa)) +geom_histogram(aes(x=b)) +xlab(expression(beta))
plot_grid(p1,plot_grid(p2,p3,ncol=2),
          ncol=1)


# Same analysis for the benign environment (Tübingen+water)
b= solve(t(X)%*%X) %*% Vinv %*% t(X) %*% Y.good
bgwa=  solve(t(X)%*%X) %*% t(X) %*% Y.good

p2=ggdotscolor(b,bgwa,
            ylab =  expression(beta[gwa]),
            xlab =  expression(beta)
            ) %>% addggregression(se=FALSE)
p2=p2 + geom_hline(yintercept = 0,lty="dashed",size=0.2)+ geom_vline(xintercept = 0,lty="dashed",size=0.2) + ggtitle("Tübingen+Water")

p4<-ggplot(data.frame(bgwa)) +geom_histogram(aes(x=bgwa)) +xlab(expression(beta[gwa])) 
# NOTE(review): same aes(x=b) / data.frame(bgwa) mismatch as above.
p5<-ggplot(data.frame(bgwa)) +geom_histogram(aes(x=b)) +xlab(expression(beta))
plot_grid(p2,plot_grid(p4,p5,ncol=2),rel_heights =  2,rel_widths = 1,
          ncol=1)


MoisesExpositoAlonso/popgensim documentation built on May 7, 2019, 8:57 p.m.