knitr::opts_chunk$set(echo = TRUE) library(devtools) library(ape) library(multtest) library(gplots) library(LDheatmap) library(genetics) library(EMMREML) library(compiler) #this library is already installed in R library(scatterplot3d) library(ggplot2) source("http://zzlab.net/GAPIT/gapit_functions.txt") devtools::install_github("wcrump/GWASP") library(GWASP)
The GWASP Package is available for download on Github at: https://github.com/wcrump/GWASP
The GWASP Manual is available at: https://github.com/wcrump/GWASP/blob/master/Reference_Manual.pdf
# Download Data covariate.data <- read.table("Data/CROP545_Covariates.txt", header = T) phenotype.data <- read.table("Data/CROP545_Phenotype.txt", header = T) genotype.data <- read.table("Data/mdp_numeric.txt", header = T) marker.map <- read.table("Data/mdp_SNP_information.txt", header = T)
## Perform GWAS using GWASP function ---- p.values <- GLM.func(X = genotype.data, y = phenotype.data, C = covariate.data, PCs = 3, r = 0.3) ## Create Manhattan and QQ Plots ---- p.vec <- as.vector(p.values) #transforms data type of p-values to run in manhattan plot function GWASP_manhattan <- manhattan_plot(marker_map = marker.map, pvals = p.vec) #creates manhattan plot GWASP_manhattan qq_plot(marker_map = marker.map, pvals = p.vec) #creates a QQ plot #we can see our p-values start to deviate sharply from what we'd expect to see with a uniform distribution
From our unmarked Manhattan plot, we can see that we likely have quite a few significantly associated SNPs. We can visualize this even better in a QQ plot. When compared to the red line that signifies what we'd expect to see in a uniform distribution of p-values, our actual p-values deviate significantly from our expected outcome.
## Genome-wide Threshold and Associated SNP's ---- type1= c(0.01, 0.05, 0.1, 0.2) cutoff= quantile(p.vec, type1 ,na.rm=T) #calculation of cutoff using quantiles genome.thresh <- cutoff[1] #with a type 1 error rate of 1%, we can excpect SNPs with p-values higher than this to be associated SNPs associated.SNP.index <- p.values <= genome.thresh #creates index of associated SNPs associated.SNPs <- which(associated.SNP.index) #let's us know which of our 3000 markers are our significant ones snp.names <- colnames(genotype.data[-1]) #get names of all snps associated.SNP.names <- snp.names[associated.SNPs] #get names of associated snps print(paste("The associated SNPs are ", paste(associated.SNP.names, collapse = ", "), sep = "")) #output the names of associated snps GWASP_manhattan <- manhattan_plot(marker_map = marker.map, pvals = p.vec, QTN_index = associated.SNPs) #create manhattan plot GWASP_manhattan <- GWASP_manhattan + geom_hline(yintercept = -log10(genome.thresh), linetype = "dashed", color = "black") #add significance threshold GWASP_manhattan #view manhattan plot
Now that we've calculated our genome-wide threshold for a Type I error rate of 1%, we see that we have 31 significantly associated SNPs. Just because they're significantly associated with our phenotype of interest, doesn't mean they're all equally informative. We can delve into this more by examining the minor allele frequencies of our SNPs of interest.
sig.snps.geno <- genotype.data[,associated.SNPs] #subset genotype data of only the significant SNPs minor_allele_freq <- apply(sig.snps.geno, 2, function(x) #we're applying the function(x) across all columns, designated by the 2, in our sig SNP genotype data to calculate MAF { allele_freq0 <- (sum(x == 0)*2 + sum(x == 1)) / (sum(!is.na(x)) * 2) #frequency of allele 0 is calculated, includes half of heterozygote frequency allele_freq2 <- (sum(x == 2)*2 + sum(x == 1)) / (sum(!is.na(x)) * 2) #frequency of allele 2 is calculated, includes half of heterozygote frequency return(min(allele_freq0, allele_freq2)) #now we compare allele freq 0 to allele freq 2 and return the lesser of the two }) ggplot(data = as.data.frame(minor_allele_freq), aes(x= seq_along(minor_allele_freq), y = minor_allele_freq)) + geom_point() + labs(title = "Distribution of Minor Allele Frequency of Significant SNPs", x = "SNP Index", y = "Minor Allele Frequency")
We can see that the actually allele frequency varies quite a bit between our 31 SNPs of interest. One marker even has a minor allele frequency (MAF) of 0, indicating that the alternate allele of the marker is fixed in the population. For markers that are fixed, or that have very low MAF, they represent rarer allele variants that may be less informative to our study than markers that have more common allele variants.
Using the GAPIT tool GAPIT.FDR.TypeI, we calculated 30 replicates of the false discovery rate and power of the GWASP function performing a GWAS calculation and then repeated the process with the GWASbyCor function provided by Zhiwu Zhang in CS 545 (2020). In our first comparison, we utilized the given data sets, including the phenotype data, to perform GWAS, before comparing the power and mean FDR of both methods over 30 replicates.
#we're going to use the GAPIT function GAPIT.FDR.Type1 to compare power and FDR between the two methods nrep=30 #number of replicates set.seed(89760) GWASP.Rep=replicate(nrep, { #we're using the replicate function to perform GWAS and then calculate power and FDR multiple times GWASP.sim <- GLM.func(X = genotype.data, y = phenotype.data, C = covariate.data) #perform GWAS GWASP.SNP.index <- GWASP.sim <= genome.thresh #creates index of associated SNPs GWASP.SNPs <- which(associated.SNP.index) #let's us know which of our 3000 markers are our significant ones myGWAS <- cbind(marker.map,t(GWASP.sim),NA) #appends our GWAS results to our marker map into a single data frame #calculates power and FDR using the GAPIT.FDR.TypeI function, most parameters kept to default myStat <- GAPIT.FDR.TypeI(WS=c(1e0,1e3,1e4,1e5), GM = marker.map, seqQTN = GWASP.SNPs, GWAS = myGWAS, maxOut = 100,MaxBP=1e10) }) set.seed(89761) GbyCor.Rep <- replicate(nrep, { #we can use the same replicate function structure to calculate FDR and power for GWASbyCor GbyCor.sim <- GWASbyCor(X = genotype.data[,-1], y = phenotype.data[,-1]) GbyCor.SNP.index <- GbyCor.sim <= genome.thresh #creates index of associated SNPs GbyCor.SNPs <- which(associated.SNP.index) #let's us know which of our 3000 markers are our significant ones myGWAS <- cbind(marker.map, t(GbyCor.sim), NA) mySTAT <- GAPIT.FDR.TypeI(WS = c(1e0, 1e3, 1e4, 1e5), GM = marker.map, seqQTN = GbyCor.SNPs, GWAS = myGWAS, maxOut = 100, MaxBP = 1e10) }) power.GWASP <- GWASP.Rep[[2]] #pulling out the power of the function for use in plotting power.GbyCor <- GbyCor.Rep[[2]] #pulling out the power of the function for use in plotting #FDR gwasp.fdr.seq <- seq(3,length(GWASP.Rep),7) #creates index of where FDR values are stored of every replicate gwasp.fdr <- GWASP.Rep[gwasp.fdr.seq] #pulls all FDR outputs into a new list of just FDR data gwasp.fdr.mean <- Reduce ("+", gwasp.fdr) / length(gwasp.fdr) #calculates the FDR mean for each individual replicate gbycor.fdr.seq <- seq(3, length(GbyCor.Rep), 7) gbycor.fdr <- GbyCor.Rep[gbycor.fdr.seq] gbycor.fdr.mean <- Reduce ("+", gbycor.fdr) / length(gbycor.fdr) theColor=rainbow(4) #creates color vector plot(gwasp.fdr.mean[,1],power.GWASP , main = "GWASP Power vs. FDR", type="b", col=theColor [1],xlim=c(0,1)) #plots GWASP FDR means against power for(i in 2:ncol(gwasp.fdr.mean)){ #shows different 'resolutions' as different colors lines(gwasp.fdr.mean[,i], power.GWASP , type="b", col= theColor [i]) } plot(gbycor.fdr.mean[,1],power.GbyCor , main = "GWASbyCor Power vs FDR", type="b", col=theColor [1],xlim=c(0,1)) #plots GbyCor FDR means against power for(i in 2:ncol(gbycor.fdr.mean)){ #shows different 'resolutions' as different colors lines(gbycor.fdr.mean[,i], power.GbyCor , type="b", col= theColor [i]) }
When comparing the two plots, it's clear that the GWASP package has a lower false discovery rate as the power of the test increases when compared to the GWASbyCor function. This indicates that GWASP is superior in that it has a lower likelihood of a Type I error, even as the test power increases.
We also chose to compare how simulated phenotype data would change the comparison between the two functions. Using simulated phenotype data from the G2P function, we again performed GWAS with both functions, and then compared the FDR of both functions with different "bin" sizes, which represent genotype sequencing resolution.
nrep=30 set.seed(89763) GWASP.Rep=replicate(nrep, { #we're using the replicate function to perform GWAS and then calculate power and FDR multiple times phenotype <- G2P(X=genotype.data[,-1], alpha=1, NQTN=10) GWASP.sim <- GLM.func(X = genotype.data, y = phenotype$y, C = covariate.data) #perform GWAS myGWAS <- cbind(marker.map,t(GWASP.sim),NA) #appends our GWAS results to our marker map into a single data frame #calculates power and FDR using the GAPIT.FDR.TypeI function, most parameters kept to default myStat.gwasp <- GAPIT.FDR.TypeI(WS=c(1e0,1e3,1e4,1e5), GM = marker.map, seqQTN = phenotype$QTN.position, GWAS = myGWAS, maxOut = 100, MaxBP = 1e10) }) set.seed(89763) GbyCor.Rep <- replicate(nrep, { #we can use the same replicate function structure to calculate FDR and power for GWASbyCor phenotype <- G2P(X=genotype.data[,-1], alpha=1, NQTN=10) GbyCor.sim <- GWASbyCor(X = genotype.data[,-1], y = phenotype$y) myGWAS <- cbind(marker.map, t(GbyCor.sim), NA) mySTAT <- GAPIT.FDR.TypeI(WS = c(1e0, 1e3, 1e4, 1e5), GM = marker.map, seqQTN = phenotype$QTN.position, GWAS = myGWAS, maxOut = 100, MaxBP = 1e10) }) power.GWASP <- GWASP.Rep[[2]] #pulling out the power of the function for use in plotting power.GbyCor <- GbyCor.Rep[[2]] #pulling out the power of the function for use in plotting #FDR gwasp.fdr.seq <- seq(3,length(GWASP.Rep),7) #creates index of where FDR values are stored of every replicate gwasp.fdr <- GWASP.Rep[gwasp.fdr.seq] #pulls all FDR outputs into a new list of just FDR data gwasp.fdr.mean <- Reduce ("+", gwasp.fdr) / length(gwasp.fdr) #calculates the FDR mean for each individual replicate gbycor.fdr.seq <- seq(3, length(GbyCor.Rep), 7) gbycor.fdr <- GbyCor.Rep[gbycor.fdr.seq] gbycor.fdr.mean <- Reduce ("+", gbycor.fdr) / length(gbycor.fdr) #power x fdr curves for gwasp and gbycor for each bin size bin.size <- c(1e0, 1e3, 1e4, 1e5) for (i in 1:ncol(gwasp.fdr.mean)){ plot(gwasp.fdr.mean[,i], power.GWASP, type="b", col="blue", xlim=c(0,1), xlab="FDR", ylab="Power", main=paste("Power vs FDR for Bin Size ", bin.size[i])) lines(gbycor.fdr.mean[,i], power.GbyCor, type="b", col="red") # Add a legend legend("topleft", legend = c("GWASP", "GWASbyCor"), text.col = c("blue", "Red")) }
At most given FDRs, GWASP has more power than GWAS by correlation.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.