In wcrump/GWASP: GWAS with PCA

knitr::opts_chunk$set(echo = TRUE)
library(devtools)
library(ape)
library(multtest)
library(gplots)
library(LDheatmap)
library(genetics)
library(EMMREML)
library(compiler) #this library is already installed in R
library(scatterplot3d)
library(ggplot2)

source("http://zzlab.net/GAPIT/gapit_functions.txt")

devtools::install_github("wcrump/GWASP")
library(GWASP)

GWASP Package

The GWASP Package is available for download on Github at: https://github.com/wcrump/GWASP

The GWASP Manual is available at: https://github.com/wcrump/GWASP/blob/master/Reference_Manual.pdf

Problem 4: Perform GWAS on the data provided.

# Download Data
covariate.data <- read.table("Data/CROP545_Covariates.txt", header = T)
phenotype.data <- read.table("Data/CROP545_Phenotype.txt", header = T)
genotype.data <- read.table("Data/mdp_numeric.txt", header = T)
marker.map <- read.table("Data/mdp_SNP_information.txt", header = T)

## Perform GWAS using GWASP function ----
p.values <- GLM.func(X = genotype.data, y = phenotype.data, C = covariate.data, PCs = 3, r = 0.3)

## Create Manhattan and QQ Plots ----
p.vec <- as.vector(p.values) #transforms data type of p-values to run in manhattan plot function
GWASP_manhattan <- manhattan_plot(marker_map = marker.map, pvals = p.vec) #creates manhattan plot
GWASP_manhattan

qq_plot(marker_map = marker.map, pvals = p.vec) #creates a QQ plot
#we can see our p-values start to deviate sharply from what we'd expect to see with a uniform distribution

From our unmarked Manhattan plot, we can see that we likely have quite a few significantly associated SNPs. We can visualize this even better in a QQ plot. When compared to the red line that signifies what we'd expect to see in a uniform distribution of p-values, our actual p-values deviate significantly from our expected outcome.

## Genome-wide Threshold and Associated SNP's ----
type1= c(0.01, 0.05, 0.1, 0.2)
cutoff= quantile(p.vec, type1 ,na.rm=T) #calculation of cutoff using quantiles
genome.thresh <- cutoff[1] #with a type 1 error rate of 1%, we can excpect SNPs with p-values higher than this to be associated SNPs

associated.SNP.index <- p.values <= genome.thresh #creates index of associated SNPs
associated.SNPs <- which(associated.SNP.index) #let's us know which of our 3000 markers are our significant ones

snp.names <- colnames(genotype.data[-1]) #get names of all snps
associated.SNP.names <- snp.names[associated.SNPs] #get names of associated snps
print(paste("The associated SNPs are ", paste(associated.SNP.names, collapse = ", "), sep = "")) #output the names of associated snps

GWASP_manhattan <- manhattan_plot(marker_map = marker.map, pvals = p.vec, QTN_index = associated.SNPs) #create manhattan plot
GWASP_manhattan <- GWASP_manhattan + geom_hline(yintercept = -log10(genome.thresh), linetype = "dashed", color = "black") #add significance threshold
GWASP_manhattan #view manhattan plot

Now that we've calculated our genome-wide threshold for a Type I error rate of 1%, we see that we have 31 significantly associated SNPs. Just because they're significantly associated with our phenotype of interest, doesn't mean they're all equally informative. We can delve into this more by examining the minor allele frequencies of our SNPs of interest.

sig.snps.geno <- genotype.data[,associated.SNPs] #subset genotype data of only the significant SNPs

minor_allele_freq <- apply(sig.snps.geno, 2, function(x) #we're applying the function(x) across all columns, designated by the 2, in our sig SNP genotype data to calculate MAF
{
    allele_freq0 <- (sum(x == 0)*2 + sum(x == 1)) / (sum(!is.na(x)) * 2) #frequency of allele 0 is calculated, includes half of heterozygote frequency
    allele_freq2 <- (sum(x == 2)*2 + sum(x == 1)) / (sum(!is.na(x)) * 2) #frequency of allele 2 is calculated, includes half of heterozygote frequency
    return(min(allele_freq0, allele_freq2)) #now we compare allele freq 0 to allele freq 2 and return the lesser of the two
})

ggplot(data = as.data.frame(minor_allele_freq), aes(x= seq_along(minor_allele_freq), y = minor_allele_freq)) + geom_point() +
    labs(title = "Distribution of Minor Allele Frequency of Significant SNPs", x = "SNP Index", y = "Minor Allele Frequency")

We can see that the actually allele frequency varies quite a bit between our 31 SNPs of interest. One marker even has a minor allele frequency (MAF) of 0, indicating that the alternate allele of the marker is fixed in the population. For markers that are fixed, or that have very low MAF, they represent rarer allele variants that may be less informative to our study than markers that have more common allele variants.

Problem 5: GWASP vs GWASbyCor

Using the GAPIT tool GAPIT.FDR.TypeI, we calculated 30 replicates of the false discovery rate and power of the GWASP function performing a GWAS calculation and then repeated the process with the GWASbyCor function provided by Zhiwu Zhang in CS 545 (2020). In our first comparison, we utilized the given data sets, including the phenotype data, to perform GWAS, before comparing the power and mean FDR of both methods over 30 replicates.

#we're going to use the GAPIT function GAPIT.FDR.Type1 to compare power and FDR between the two methods
nrep=30 #number of replicates
set.seed(89760)
GWASP.Rep=replicate(nrep, { #we're using the replicate function to perform GWAS and then calculate power and FDR multiple times
    GWASP.sim <- GLM.func(X = genotype.data, y = phenotype.data, C = covariate.data) #perform GWAS
    GWASP.SNP.index <- GWASP.sim <= genome.thresh #creates index of associated SNPs
    GWASP.SNPs <- which(associated.SNP.index) #let's us know which of our 3000 markers are our significant ones
    myGWAS <- cbind(marker.map,t(GWASP.sim),NA) #appends our GWAS results to our marker map into a single data frame
    #calculates power and FDR using the GAPIT.FDR.TypeI function, most parameters kept to default
    myStat <- GAPIT.FDR.TypeI(WS=c(1e0,1e3,1e4,1e5), GM = marker.map, seqQTN = GWASP.SNPs, GWAS = myGWAS, maxOut = 100,MaxBP=1e10)
})

set.seed(89761)
GbyCor.Rep <- replicate(nrep, { #we can use the same replicate function structure to calculate FDR and power for GWASbyCor
    GbyCor.sim <- GWASbyCor(X = genotype.data[,-1], y = phenotype.data[,-1])
    GbyCor.SNP.index <- GbyCor.sim <= genome.thresh #creates index of associated SNPs
    GbyCor.SNPs <- which(associated.SNP.index) #let's us know which of our 3000 markers are our significant ones
    myGWAS <- cbind(marker.map, t(GbyCor.sim), NA)
    mySTAT <- GAPIT.FDR.TypeI(WS = c(1e0, 1e3, 1e4, 1e5), GM = marker.map, seqQTN = GbyCor.SNPs, GWAS = myGWAS, maxOut = 100, MaxBP = 1e10)
})

power.GWASP <- GWASP.Rep[[2]] #pulling out the power of the function for use in plotting
power.GbyCor <- GbyCor.Rep[[2]] #pulling out the power of the function for use in plotting

#FDR
gwasp.fdr.seq <- seq(3,length(GWASP.Rep),7) #creates index of where FDR values are stored of every replicate
gwasp.fdr <- GWASP.Rep[gwasp.fdr.seq] #pulls all FDR outputs into a new list of just FDR data
gwasp.fdr.mean <- Reduce ("+", gwasp.fdr) / length(gwasp.fdr) #calculates the FDR mean for each individual replicate

gbycor.fdr.seq <- seq(3, length(GbyCor.Rep), 7)
gbycor.fdr <- GbyCor.Rep[gbycor.fdr.seq]
gbycor.fdr.mean <- Reduce ("+", gbycor.fdr) / length(gbycor.fdr)

theColor=rainbow(4) #creates color vector

plot(gwasp.fdr.mean[,1],power.GWASP , main = "GWASP Power vs. FDR", type="b", col=theColor [1],xlim=c(0,1)) #plots GWASP FDR means against power
for(i in 2:ncol(gwasp.fdr.mean)){ #shows different 'resolutions' as different colors
    lines(gwasp.fdr.mean[,i], power.GWASP , type="b", col= theColor [i])
}

plot(gbycor.fdr.mean[,1],power.GbyCor , main = "GWASbyCor Power vs FDR", type="b", col=theColor [1],xlim=c(0,1)) #plots GbyCor FDR means against power
for(i in 2:ncol(gbycor.fdr.mean)){ #shows different 'resolutions' as different colors
    lines(gbycor.fdr.mean[,i], power.GbyCor , type="b", col= theColor [i])
}

When comparing the two plots, it's clear that the GWASP package has a lower false discovery rate as the power of the test increases when compared to the GWASbyCor function. This indicates that GWASP is superior in that it has a lower likelihood of a Type I error, even as the test power increases.

Use Simulated Phenotype Data to compare GWASP with GWASbyCor

We also chose to compare how simulated phenotype data would change the comparison between the two functions. Using simulated phenotype data from the G2P function, we again performed GWAS with both functions, and then compared the FDR of both functions with different "bin" sizes, which represent genotype sequencing resolution.

nrep=30
set.seed(89763)
GWASP.Rep=replicate(nrep, { #we're using the replicate function to perform GWAS and then calculate power and FDR multiple times
    phenotype <- G2P(X=genotype.data[,-1], alpha=1, NQTN=10)
    GWASP.sim <- GLM.func(X = genotype.data, y = phenotype$y, C = covariate.data) #perform GWAS
    myGWAS <- cbind(marker.map,t(GWASP.sim),NA) #appends our GWAS results to our marker map into a single data frame
    #calculates power and FDR using the GAPIT.FDR.TypeI function, most parameters kept to default
    myStat.gwasp <- GAPIT.FDR.TypeI(WS=c(1e0,1e3,1e4,1e5), GM = marker.map, seqQTN = phenotype$QTN.position, GWAS = myGWAS, maxOut = 100, MaxBP = 1e10)
})


set.seed(89763)
GbyCor.Rep <- replicate(nrep, { #we can use the same replicate function structure to calculate FDR and power for GWASbyCor
    phenotype <- G2P(X=genotype.data[,-1], alpha=1, NQTN=10)
    GbyCor.sim <- GWASbyCor(X = genotype.data[,-1], y = phenotype$y)
    myGWAS <- cbind(marker.map, t(GbyCor.sim), NA)
    mySTAT <- GAPIT.FDR.TypeI(WS = c(1e0, 1e3, 1e4, 1e5), GM = marker.map, seqQTN = phenotype$QTN.position, GWAS = myGWAS, maxOut = 100, MaxBP = 1e10)
})


power.GWASP <- GWASP.Rep[[2]] #pulling out the power of the function for use in plotting
power.GbyCor <- GbyCor.Rep[[2]] #pulling out the power of the function for use in plotting

#FDR
gwasp.fdr.seq <- seq(3,length(GWASP.Rep),7) #creates index of where FDR values are stored of every replicate
gwasp.fdr <- GWASP.Rep[gwasp.fdr.seq] #pulls all FDR outputs into a new list of just FDR data
gwasp.fdr.mean <- Reduce ("+", gwasp.fdr) / length(gwasp.fdr) #calculates the FDR mean for each individual replicate

gbycor.fdr.seq <- seq(3, length(GbyCor.Rep), 7)
gbycor.fdr <- GbyCor.Rep[gbycor.fdr.seq]
gbycor.fdr.mean <- Reduce ("+", gbycor.fdr) / length(gbycor.fdr)

#power x fdr curves for gwasp and gbycor for each bin size
bin.size <- c(1e0, 1e3, 1e4, 1e5)
for (i in 1:ncol(gwasp.fdr.mean)){
    plot(gwasp.fdr.mean[,i], power.GWASP, type="b", col="blue", xlim=c(0,1), xlab="FDR", ylab="Power", main=paste("Power vs FDR for Bin Size ", bin.size[i]))
    lines(gbycor.fdr.mean[,i], power.GbyCor, type="b", col="red")
    # Add a legend
legend("topleft", 
  legend = c("GWASP", "GWASbyCor"), 
  text.col = c("blue", "Red"))
}