knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

GridLMM_GWAS implement an (approximate) linear mixed effect model GWAS that allows for multiple random effects.

A typical GWAS includes a single random effect for Kinship - accounting for non-uniform relatedness among the observations.

GxEMMA is useful in situations when there are additional sources of covariation among the observations. Examples could be:

Package

Install the package, then load:

library(GridLMM)

Load data

Assuming n individuals, and p markers, the data inputs are:

Example data

Here is a simulated data set for running below. Each genotype is grown in a single plot. Plots are arranged in a grid in the field.

n_geno = 300
n_chr = 5
n_markers_per_chr = 1000

# layout of Plots
Plots = matrix(1:n_geno,nc = 10) # rectangular array of plots
Plot_Row = row(Plots)
Plot_Col = col(Plots)

# create genotypes and randomize in field
data = data.frame(Geno = paste0('Geno',1:n_geno), Plot = sample(Plots))
data$Row = Plot_Row[data$Plot]
data$Col = Plot_Col[data$Plot]

# create marker map and random genotypes
map = expand.grid(pos = 1:n_markers_per_chr,Chr = 1:n_chr)[,2:1]
map = data.frame(snp = paste0('snp_',1:nrow(map)),map)
X = matrix(sample(c(0,2),size = n_geno*nrow(map),replace=T,prob = c(.1,.9)),ncol = nrow(map),dimnames = list(data$Geno,map$snp)) #rep(runif(nrow(map),.05,.5),each = n_geno)

# simulate 1 effect per chromosome
beta = rep(0,nrow(map))
for(i in 1:1) beta[(1:n_chr-1)*n_markers_per_chr + sample(1:n_markers_per_chr,n_chr)] = 1

# add background variation: modeled as small effects across all the markers
beta_background = rnorm(length(beta),0,0.1)

# create simulated data
background_h2 = 0.6 # relative contribution of the background
data$micro_env = rnorm(nrow(data)); data$micro_env = data$micro_env / sd(data$micro_env)
data$background = X %*% beta_background; data$background = data$background / sd(data$background)
data$SNPs = X %*% beta;data$SNPs = data$SNPs/sd(data$SNPs)
data$y = data$SNPs + sqrt(background_h2) * data$background + sqrt(1-background_h2) * data$micro_env
data$y = data$y/sd(data$y)

View the key data structures before proceeding

Preparing the relmats

Our model will have two random effects:

  1. Kinship: Typical additive relationship matrix.
  2. Spatial variation.

Each random effect is modeled by including an appropriate Kernel matrix K. This matrix must be positive-semidefinite, and have rownames corresponding to a column of data

The kinship can be calculated directly by GxEMMA_GWAS. It will be calculated as:

X_centered = sweep(X,2,colMeans(X),'-') # center marker genotypes
K = tcrossprod(X_centered) / ncol(X_centered)
rownames(K) = colnames(K) = rownames(X)

The spatial Kernel can be calculated using the squared distance, and a tuning parameter h. This parameter must be chosen. A "default" choice is given below.

library(KRLS)
field = data[,c('Row','Col')]
dists = as.matrix(dist(field))
h = median(dists)/2
K_plot = gausskernel(field,h^2/2); diag(K_plot)=1 # 
rownames(K_plot) = colnames(K_plot) = data$Plot

Simulate some spatial variation in the field

h2_spatial = 0.99
s2_spatial = 1
chol_K_plot = chol(h2_spatial*K_plot + diag(1-h2_spatial,nrow(data)))
data$spatial_error = t(chol_K_plot) %*% rnorm(nrow(data),0,sqrt(s2_spatial))
field_error = 0*Plots
field_error[data$Plot] = data$spatial_error
image(field_error)
data$y = .5*data$y + data$spatial_error #

Run GWAS

The main function for running a GWAS is GridLMM_GWAS

gwas = GridLMM_GWAS(
                        formula = y~1 + (1|Geno) + (1|Plot), # the same error model is used for each marker. It is specified similarly to lmer
                        test_formula = ~1, # this is the model for each marker. ~1 means an intercept for the marker. ~1 + cov means an intercept plus a slope on `cov` for each marker
                        reduced_formula = ~0, # This is the null model for each test. ~0 means just the error model. ~1 means the null includes the intercept of the marker, but not anything additional
                        data = data, # The dataframe to look for terms from the 3 models
                        weights = NULL, # optional observation-specific weights
                        X = X, # The matrix of markers. Note: This can be of dimension n_g x p, where n_g is the number of unique genotypes.
                        X_ID = 'Geno', # The column of the data that identifies each genotype. Each level of data$Geno should match a rowname of X
                        h2_start = NULL, # An optional vector of h2s to use as starting values for each random effect in the model. If NULL, will be calculated from the error model using GridLMM_ML
                        h2_step = 0.01, # step size per random effect for testing alternate values of h2
                        max_steps = 100, # maximum number of steps of size h2_step to take from h2_start
                        X_map = map, # Optional. The marker positions.
                        relmat = list(Plot = K_plot), # A list of Kernel matrices for the random effects. If X_ID (here Geno) is not included in this list, then it is calculated as tcrossprod(Xc)/ncol(Xc) where Xc is the centered (and optionally scaled) X. If any random effects are described in `error_model` but not provided here, the Kernel is assumed to be the identity matrix
                        centerX = TRUE, # Should the markers be centered when calculating the GRM (only will be done if needed for calculating the GRM),
                        scaleX = FALSE, # Should the markers be scaled to have constant variance when calculating the GRM?
                        fillNAX = FALSE, # Should missing marker data be filled in with the mean allele frequency?
                        method = 'REML', # REML = Wald test, ML = LRT, BF = calculate Bayes factors
                        mc.cores = my_detectCores(), # How many cores should be used for parallel processing. Unless X is large, tends to actually be faster with mc.cores = 1
                        verbose = T # Should progress be printed to the screen?
)

The output is a list with two elements

names(gwas)

results contains a table of the gwas results

head(gwas$results)

You can use the qqman package to make a manhattan plot and qqplot:

library(qqman)
# manhattan plot with true SNPs highlighted
manhattan(gwas$results,'Chr','pos','p_value_REML','snp',highlight = colnames(X)[beta != 0])

# qqplot 
qq(gwas$results$p_value_REML)

Here's the same data run with only the GRM as a random effect

gwas1 = GridLMM_GWAS(
                        formula = y~1 + (1|Geno), # the same error model is used for each marker. It is specified similarly to lmer
                        test_formula = ~1, # this is the model for each marker. ~1 means an intercept for the marker. ~1 + cov means an intercept plus a slope on `cov` for each marker
                        reduced_formula = ~0, # This is the null model for each test. ~0 means just the error model. ~1 means the null includes the intercept of the marker, but not anything additional
                        data = data, # The dataframe to look for terms from the 3 models
                        weights = NULL, # optional observation-specific weights
                        X = X, # The matrix of markers. Note: This can be of dimension n_g x p, where n_g is the number of unique genotypes.
                        X_ID = 'Geno', # The column of the data that identifies each genotype. Each level of data$Geno should match a rowname of X
                        h2_start = NULL, # An optional vector of h2s to use as starting values for each random effect in the model. If NULL, will be calculated from the error model using GridLMM_ML
                        h2_step = 0.01, # step size per random effect for testing alternate values of h2
                        max_steps = 100, # maximum number of steps of size h2_step to take from h2_start
                        X_map = map, # Optional. The marker positions.
                        relmat = list(Plot = K_plot), # A list of Kernel matrices for the random effects. If X_ID (here Geno) is not included in this list, then it is calculated as tcrossprod(Xc)/ncol(Xc) where Xc is the centered (and optionally scaled) X. If any random effects are described in `error_model` but not provided here, the Kernel is assumed to be the identity matrix
                        centerX = TRUE, # Should the markers be centered when calculating the GRM (only will be done if needed for calculating the GRM),
                        scaleX = FALSE, # Should the markers be scaled to have constant variance when calculating the GRM?
                        fillNAX = FALSE, # Should missing marker data be filled in with the mean allele frequency?
                        method = 'REML', # Should the best model be selected by REML (if False, will be selected by ML)
                        mc.cores = my_detectCores(), # How many cores should be used for parallel processing. Unless X is large, tends to actually be faster with mc.cores = 1
                        verbose = FALSE # Should progress be printed to the screen?
)

Plot comparing the p-values between the methods. Note the higher p-values and better qq distribution when modeling the spatial correlation

plot(-log10(gwas$results$p_value_REML),-log10(gwas1$results$p_value_REML),col = (beta != 0)+1);abline(0,1)

u = sort(-log10(runif(nrow(gwas$results))))
plot(sort(-log10(gwas$results$p_value_REML))~u,pch=21,cex=.5);abline(0,1)
points(sort(-log10(gwas1$results$p_value_REML))~u,pch=21,cex=.5,bg=2,col=NULL)


manhattan(gwas$results,'Chr','pos','p_value_REML','snp',highlight = colnames(X)[beta != 0],ylim = 1.1*c(0,max(-log10(gwas$results$p_value_REML))))
manhattan(gwas1$results,'Chr','pos','p_value_REML','snp',highlight = colnames(X)[beta != 0],ylim = 1.1*c(0,max(-log10(gwas$results$p_value_REML))))

Run GWAS for GxE using K_g and K_gxe

This is following the idea from here: http://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1005849 that suggested that just as you correct for genetic background with K = X %*% t(X) / ncol(X), you can also correct for GxE background

# add a random environmental vector to data
data$env = rnorm(nrow(data))

gxe_gwas = GridLMM_GWAS(
                        formula = y~1 + env + (1 + env|Geno), # the same error model is used for each marker. It is specified similarly to lmer
                        test_formula = ~1 + env, # this is the model for each marker. ~1 means an intercept for the marker. ~1 + cov means an intercept plus a slope on `cov` for each marker
                        reduced_formula = ~1, # This is the null model for each test. ~0 means just the error model. ~1 means the null includes the intercept of the marker, but not anything additional
                        data = data, # The dataframe to look for terms from the 3 models
                        weights = NULL, # optional observation-specific weights
                        X = X, # The matrix of markers. Note: This can be of dimension n_g x p, where n_g is the number of unique genotypes.
                        X_ID = 'Geno', # The column of the data that identifies each genotype. Each level of data$Geno should match a rowname of X
                        h2_start = NULL, # An optional vector of h2s to use as starting values for each random effect in the model. If NULL, will be calculated from the error model using GridLMM_ML
                        h2_step = 0.01, # step size per random effect for testing alternate values of h2
                        max_steps = 100, # maximum number of steps of size h2_step to take from h2_start
                        X_map = map, # Optional. The marker positions.
                        relmat = NULL, # A list of Kernel matrices for the random effects. If X_ID (here Geno) is not included in this list, then it is calculated as tcrossprod(Xc)/ncol(Xc) where Xc is the centered (and optionally scaled) X. If any random effects are described in `error_model` but not provided here, the Kernel is assumed to be the identity matrix
                        centerX = TRUE, # Should the markers be centered when calculating the GRM (only will be done if needed for calculating the GRM),
                        scaleX = FALSE, # Should the markers be scaled to have constant variance when calculating the GRM?
                        fillNAX = FALSE, # Should missing marker data be filled in with the mean allele frequency?
                        method = 'REML', # Should the best model be selected by REML (if False, will be selected by ML)
                        mc.cores = my_detectCores(), # How many cores should be used for parallel processing. Unless X is large, tends to actually be faster with mc.cores = 1
                        verbose = FALSE # Should progress be printed to the screen?
)

The GxE p-values are in the slot p_value_ML, or p_value_REML.2. The slot p_value_REML.1 contains the p-values for the genotype main effect.

qq-plots

u = sort(-log10(runif(nrow(gxe_gwas$results))))
plot(sort(-log10(gxe_gwas$results$p_value_REML.1))~u,pch=21,cex=.5,main = 'SNP main effect');abline(0,1)
plot(sort(-log10(gxe_gwas$results$p_value_REML.2))~u,pch=21,cex=.5,main = 'SNP gxe effect');abline(0,1)

Running a bigger GWAS

The above codes work for fairly small-scall GWAS applications. Once you get to thousands of individuals or 1 million + markers, holding the whole marker matrix X in memory can be limiting. The best strategy currently is probably to break the analysis into chunks and analyze each chunk separately. We can still re-use the pre-calculate V decompositions across all chunks to save time. Commonly, data for GWAS is in PLINK format. We show here how to load that in chunks in R.

The steps are:

  1. Estimate variance componenents using the null model. Save the pre-calculated decompositions to a folder on disk.
  2. Load the marker information data, select chunks of markers to process
  3. Load each chunk of markers separately, run GWAS
  4. Combine results together

Save example data in PLINK format

library(snpStats)

write.plink(file.base = 'test_data',snps = as(X,'SnpMatrix'),id = data$Geno,phenotype = data$y,
            chromosome = map$Chr,position = map$pos)

Estimate variance componenents using null model

This serves as a starting value for GridLMM_GWAS. We use GridLMM_ML here. Other programs like GCTA could be used and would be much faster for this step. If the marker-based kinship matrices are not prec-calculated, this needs to be done (ex. with GCTA). We do it here in R. It is possible to do this calculation in chunks as well.

K_G = tcrossprod(X)/ncol(X)

The key here is to specify a folder with save_V_folder. This way the pre-calculated matrices are saved and can be re-used.

null_model = GridLMM_ML(formula = y~1 + (1|Geno) + (1|Plot),data = data,relmat = list(Geno = K_G, Plot = K_plot),REML = T,save_V_folder = 'V_folder',tolerance = 1e-3)

From the output, we want to extract two objects: The estimated variance componenets, and an object that stores paths to the pre-calculated matrices

The variance components can be accessed as:

null_model$results[,c('Geno.REML','Plot.REML')]

The setup object is:

V_setup = null_model$setup

Load marker information matrix, divide markers into chunks

markers = read.delim('test_data.bim',header = F,row.names = NULL)
nrow(markers)

Divide into groups

chunks = split(1:nrow(markers),gl(10,nrow(markers)/10))

Run GWAS for each chunk

library(doParallel)
library(foreach)
full_results = foreach(chunk = chunks,i = 1:length(chunks),.combine = 'rbind') %do% {
  print(sprintf('Running chunk %d',i))
  X_chunk = read.plink('test_data',select.snps = chunk)
  X_chunk = as(X_chunk$genotypes,'numeric') # convert to numeric matrix
  # Run the GWAS for the chunk
  results_chunk = GridLMM_GWAS(formula = y~1 + (1|Geno) + (1|Plot),
                                    test_formula = ~1,
                                    reduced_formula = ~0,
                                    data = data,
                                    X = X_chunk,
                                    X_ID = 'Geno',
                                    relmat = list(Geno = K_G, Plot = K_plot),
                                    V_setup = V_setup,  # This tells the function to reuse existing matrices
                                    h2_start = null_model$results[,c('Geno.REML','Plot.REML')],
                                    method = 'REML',
                                    verbose = F)
  results_chunk$results # return just the results
}

Compare results - should be the same as if we had run all at once.

plot(-log10(full_results$p_value_REML),-log10(gwas$results$p_value_REML));abline(0,1)

Run GridLMM_posterior

# build genetic kinship matrix
K = tcrossprod(sweep(X,2,colMeans(X),'-'))
K = K/mean(diag(K))
posterior = GridLMM_posterior(formula = y~1 + (1|Geno) + (1|Plot),
                              data = data,
                              relmat = list(Plot = K_plot,Geno = K),
                              h2_divisions = 20)
library(ggplot2)
ggplot(posterior$h2s_results,aes(x=Geno,y=Plot)) + geom_point(aes(size = posterior)) + xlim(c(0,1)) + ylim(c(0,1))


deruncie/GridLMM documentation built on May 2, 2023, 7:18 p.m.