knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
GridLMM_GWAS
implement an (approximate) linear mixed effect model GWAS that allows for multiple random effects.
A typical GWAS includes a single random effect for Kinship - accounting for non-uniform relatedness among the observations.
GxEMMA is useful in situations when there are additional sources of covariation among the observations. Examples could be:
weights
parameter
can be added to GxEMMA with weights = num_observations
.diag(env) %*% K %*% diag(env)
Install the package, then load:
library(GridLMM)
Assuming n
individuals, and p
markers, the data inputs are:
X
. An n x p
matrix of marker genotypesmap
. A SNP map describing chromosome positions of each markerdata
. A data.frame that includes the phenotype, and columns for all covariates and IDs necessary for the modelrelmats
. The Kernel matrices needed for the model. The typical Genomic Relationship Matrix (GRM) Kernel can be calculated directly in the model,
or can be loaded directly.Here is a simulated data set for running below. Each genotype is grown in a single plot. Plots are arranged in a grid in the field.
n_geno = 300 n_chr = 5 n_markers_per_chr = 1000 # layout of Plots Plots = matrix(1:n_geno,nc = 10) # rectangular array of plots Plot_Row = row(Plots) Plot_Col = col(Plots) # create genotypes and randomize in field data = data.frame(Geno = paste0('Geno',1:n_geno), Plot = sample(Plots)) data$Row = Plot_Row[data$Plot] data$Col = Plot_Col[data$Plot] # create marker map and random genotypes map = expand.grid(pos = 1:n_markers_per_chr,Chr = 1:n_chr)[,2:1] map = data.frame(snp = paste0('snp_',1:nrow(map)),map) X = matrix(sample(c(0,2),size = n_geno*nrow(map),replace=T,prob = c(.1,.9)),ncol = nrow(map),dimnames = list(data$Geno,map$snp)) #rep(runif(nrow(map),.05,.5),each = n_geno) # simulate 1 effect per chromosome beta = rep(0,nrow(map)) for(i in 1:1) beta[(1:n_chr-1)*n_markers_per_chr + sample(1:n_markers_per_chr,n_chr)] = 1 # add background variation: modeled as small effects across all the markers beta_background = rnorm(length(beta),0,0.1) # create simulated data background_h2 = 0.6 # relative contribution of the background data$micro_env = rnorm(nrow(data)); data$micro_env = data$micro_env / sd(data$micro_env) data$background = X %*% beta_background; data$background = data$background / sd(data$background) data$SNPs = X %*% beta;data$SNPs = data$SNPs/sd(data$SNPs) data$y = data$SNPs + sqrt(background_h2) * data$background + sqrt(1-background_h2) * data$micro_env data$y = data$y/sd(data$y)
View the key data structures before proceeding
Our model will have two random effects:
Each random effect is modeled by including an appropriate Kernel matrix K
.
This matrix must be positive-semidefinite, and have rownames corresponding to a column of data
The kinship can be calculated directly by GxEMMA_GWAS
. It will be calculated as:
X_centered = sweep(X,2,colMeans(X),'-') # center marker genotypes K = tcrossprod(X_centered) / ncol(X_centered) rownames(K) = colnames(K) = rownames(X)
The spatial Kernel can be calculated using the squared distance, and a tuning parameter h
.
This parameter must be chosen. A "default" choice is given below.
library(KRLS) field = data[,c('Row','Col')] dists = as.matrix(dist(field)) h = median(dists)/2 K_plot = gausskernel(field,h^2/2); diag(K_plot)=1 # rownames(K_plot) = colnames(K_plot) = data$Plot
Simulate some spatial variation in the field
h2_spatial = 0.99 s2_spatial = 1 chol_K_plot = chol(h2_spatial*K_plot + diag(1-h2_spatial,nrow(data))) data$spatial_error = t(chol_K_plot) %*% rnorm(nrow(data),0,sqrt(s2_spatial)) field_error = 0*Plots field_error[data$Plot] = data$spatial_error image(field_error) data$y = .5*data$y + data$spatial_error #
The main function for running a GWAS is GridLMM_GWAS
gwas = GridLMM_GWAS( formula = y~1 + (1|Geno) + (1|Plot), # the same error model is used for each marker. It is specified similarly to lmer test_formula = ~1, # this is the model for each marker. ~1 means an intercept for the marker. ~1 + cov means an intercept plus a slope on `cov` for each marker reduced_formula = ~0, # This is the null model for each test. ~0 means just the error model. ~1 means the null includes the intercept of the marker, but not anything additional data = data, # The dataframe to look for terms from the 3 models weights = NULL, # optional observation-specific weights X = X, # The matrix of markers. Note: This can be of dimension n_g x p, where n_g is the number of unique genotypes. X_ID = 'Geno', # The column of the data that identifies each genotype. Each level of data$Geno should match a rowname of X h2_start = NULL, # An optional vector of h2s to use as starting values for each random effect in the model. If NULL, will be calculated from the error model using GridLMM_ML h2_step = 0.01, # step size per random effect for testing alternate values of h2 max_steps = 100, # maximum number of steps of size h2_step to take from h2_start X_map = map, # Optional. The marker positions. relmat = list(Plot = K_plot), # A list of Kernel matrices for the random effects. If X_ID (here Geno) is not included in this list, then it is calculated as tcrossprod(Xc)/ncol(Xc) where Xc is the centered (and optionally scaled) X. If any random effects are described in `error_model` but not provided here, the Kernel is assumed to be the identity matrix centerX = TRUE, # Should the markers be centered when calculating the GRM (only will be done if needed for calculating the GRM), scaleX = FALSE, # Should the markers be scaled to have constant variance when calculating the GRM? fillNAX = FALSE, # Should missing marker data be filled in with the mean allele frequency? method = 'REML', # REML = Wald test, ML = LRT, BF = calculate Bayes factors mc.cores = my_detectCores(), # How many cores should be used for parallel processing. Unless X is large, tends to actually be faster with mc.cores = 1 verbose = T # Should progress be printed to the screen? )
The output is a list with two elements
names(gwas)
results
contains a table of the gwas results
head(gwas$results)
You can use the qqman
package to make a manhattan plot and qqplot:
library(qqman) # manhattan plot with true SNPs highlighted manhattan(gwas$results,'Chr','pos','p_value_REML','snp',highlight = colnames(X)[beta != 0]) # qqplot qq(gwas$results$p_value_REML)
Here's the same data run with only the GRM as a random effect
gwas1 = GridLMM_GWAS( formula = y~1 + (1|Geno), # the same error model is used for each marker. It is specified similarly to lmer test_formula = ~1, # this is the model for each marker. ~1 means an intercept for the marker. ~1 + cov means an intercept plus a slope on `cov` for each marker reduced_formula = ~0, # This is the null model for each test. ~0 means just the error model. ~1 means the null includes the intercept of the marker, but not anything additional data = data, # The dataframe to look for terms from the 3 models weights = NULL, # optional observation-specific weights X = X, # The matrix of markers. Note: This can be of dimension n_g x p, where n_g is the number of unique genotypes. X_ID = 'Geno', # The column of the data that identifies each genotype. Each level of data$Geno should match a rowname of X h2_start = NULL, # An optional vector of h2s to use as starting values for each random effect in the model. If NULL, will be calculated from the error model using GridLMM_ML h2_step = 0.01, # step size per random effect for testing alternate values of h2 max_steps = 100, # maximum number of steps of size h2_step to take from h2_start X_map = map, # Optional. The marker positions. relmat = list(Plot = K_plot), # A list of Kernel matrices for the random effects. If X_ID (here Geno) is not included in this list, then it is calculated as tcrossprod(Xc)/ncol(Xc) where Xc is the centered (and optionally scaled) X. If any random effects are described in `error_model` but not provided here, the Kernel is assumed to be the identity matrix centerX = TRUE, # Should the markers be centered when calculating the GRM (only will be done if needed for calculating the GRM), scaleX = FALSE, # Should the markers be scaled to have constant variance when calculating the GRM? fillNAX = FALSE, # Should missing marker data be filled in with the mean allele frequency? method = 'REML', # Should the best model be selected by REML (if False, will be selected by ML) mc.cores = my_detectCores(), # How many cores should be used for parallel processing. Unless X is large, tends to actually be faster with mc.cores = 1 verbose = FALSE # Should progress be printed to the screen? )
Plot comparing the p-values between the methods. Note the higher p-values and better qq distribution when modeling the spatial correlation
plot(-log10(gwas$results$p_value_REML),-log10(gwas1$results$p_value_REML),col = (beta != 0)+1);abline(0,1) u = sort(-log10(runif(nrow(gwas$results)))) plot(sort(-log10(gwas$results$p_value_REML))~u,pch=21,cex=.5);abline(0,1) points(sort(-log10(gwas1$results$p_value_REML))~u,pch=21,cex=.5,bg=2,col=NULL) manhattan(gwas$results,'Chr','pos','p_value_REML','snp',highlight = colnames(X)[beta != 0],ylim = 1.1*c(0,max(-log10(gwas$results$p_value_REML)))) manhattan(gwas1$results,'Chr','pos','p_value_REML','snp',highlight = colnames(X)[beta != 0],ylim = 1.1*c(0,max(-log10(gwas$results$p_value_REML))))
This is following the idea from here: http://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1005849
that suggested that just as you correct for genetic background with K = X %*% t(X) / ncol(X)
, you can also correct for GxE background
# add a random environmental vector to data data$env = rnorm(nrow(data)) gxe_gwas = GridLMM_GWAS( formula = y~1 + env + (1 + env|Geno), # the same error model is used for each marker. It is specified similarly to lmer test_formula = ~1 + env, # this is the model for each marker. ~1 means an intercept for the marker. ~1 + cov means an intercept plus a slope on `cov` for each marker reduced_formula = ~1, # This is the null model for each test. ~0 means just the error model. ~1 means the null includes the intercept of the marker, but not anything additional data = data, # The dataframe to look for terms from the 3 models weights = NULL, # optional observation-specific weights X = X, # The matrix of markers. Note: This can be of dimension n_g x p, where n_g is the number of unique genotypes. X_ID = 'Geno', # The column of the data that identifies each genotype. Each level of data$Geno should match a rowname of X h2_start = NULL, # An optional vector of h2s to use as starting values for each random effect in the model. If NULL, will be calculated from the error model using GridLMM_ML h2_step = 0.01, # step size per random effect for testing alternate values of h2 max_steps = 100, # maximum number of steps of size h2_step to take from h2_start X_map = map, # Optional. The marker positions. relmat = NULL, # A list of Kernel matrices for the random effects. If X_ID (here Geno) is not included in this list, then it is calculated as tcrossprod(Xc)/ncol(Xc) where Xc is the centered (and optionally scaled) X. If any random effects are described in `error_model` but not provided here, the Kernel is assumed to be the identity matrix centerX = TRUE, # Should the markers be centered when calculating the GRM (only will be done if needed for calculating the GRM), scaleX = FALSE, # Should the markers be scaled to have constant variance when calculating the GRM? fillNAX = FALSE, # Should missing marker data be filled in with the mean allele frequency? method = 'REML', # Should the best model be selected by REML (if False, will be selected by ML) mc.cores = my_detectCores(), # How many cores should be used for parallel processing. Unless X is large, tends to actually be faster with mc.cores = 1 verbose = FALSE # Should progress be printed to the screen? )
The GxE p-values are in the slot p_value_ML
, or p_value_REML.2
. The slot p_value_REML.1
contains the p-values for the genotype main effect.
qq-plots
u = sort(-log10(runif(nrow(gxe_gwas$results)))) plot(sort(-log10(gxe_gwas$results$p_value_REML.1))~u,pch=21,cex=.5,main = 'SNP main effect');abline(0,1) plot(sort(-log10(gxe_gwas$results$p_value_REML.2))~u,pch=21,cex=.5,main = 'SNP gxe effect');abline(0,1)
The above codes work for fairly small-scall GWAS applications. Once you get to thousands of individuals or 1 million + markers, holding the whole
marker matrix X
in memory can be limiting.
The best strategy currently is probably to break the analysis into chunks and analyze each chunk separately.
We can still re-use the pre-calculate V decompositions across all chunks to save time.
Commonly, data for GWAS is in PLINK format. We show here how to load that in chunks in R.
The steps are:
library(snpStats) write.plink(file.base = 'test_data',snps = as(X,'SnpMatrix'),id = data$Geno,phenotype = data$y, chromosome = map$Chr,position = map$pos)
This serves as a starting value for GridLMM_GWAS.
We use GridLMM_ML
here. Other programs like GCTA
could be used and would be much faster for this step.
If the marker-based kinship matrices are not prec-calculated, this needs to be done (ex. with GCTA
).
We do it here in R. It is possible to do this calculation in chunks as well.
K_G = tcrossprod(X)/ncol(X)
The key here is to specify a folder with save_V_folder
. This way the pre-calculated matrices are saved and can be re-used.
null_model = GridLMM_ML(formula = y~1 + (1|Geno) + (1|Plot),data = data,relmat = list(Geno = K_G, Plot = K_plot),REML = T,save_V_folder = 'V_folder',tolerance = 1e-3)
From the output, we want to extract two objects: The estimated variance componenets, and an object that stores paths to the pre-calculated matrices
The variance components can be accessed as:
null_model$results[,c('Geno.REML','Plot.REML')]
The setup object is:
V_setup = null_model$setup
markers = read.delim('test_data.bim',header = F,row.names = NULL) nrow(markers)
Divide into groups
chunks = split(1:nrow(markers),gl(10,nrow(markers)/10))
library(doParallel) library(foreach) full_results = foreach(chunk = chunks,i = 1:length(chunks),.combine = 'rbind') %do% { print(sprintf('Running chunk %d',i)) X_chunk = read.plink('test_data',select.snps = chunk) X_chunk = as(X_chunk$genotypes,'numeric') # convert to numeric matrix # Run the GWAS for the chunk results_chunk = GridLMM_GWAS(formula = y~1 + (1|Geno) + (1|Plot), test_formula = ~1, reduced_formula = ~0, data = data, X = X_chunk, X_ID = 'Geno', relmat = list(Geno = K_G, Plot = K_plot), V_setup = V_setup, # This tells the function to reuse existing matrices h2_start = null_model$results[,c('Geno.REML','Plot.REML')], method = 'REML', verbose = F) results_chunk$results # return just the results }
Compare results - should be the same as if we had run all at once.
plot(-log10(full_results$p_value_REML),-log10(gwas$results$p_value_REML));abline(0,1)
# build genetic kinship matrix K = tcrossprod(sweep(X,2,colMeans(X),'-')) K = K/mean(diag(K)) posterior = GridLMM_posterior(formula = y~1 + (1|Geno) + (1|Plot), data = data, relmat = list(Plot = K_plot,Geno = K), h2_divisions = 20)
library(ggplot2) ggplot(posterior$h2s_results,aes(x=Geno,y=Plot)) + geom_point(aes(size = posterior)) + xlim(c(0,1)) + ylim(c(0,1))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.