if (!file.exists("GAPIT_Tutorial_Data.zip"))
{
  download.file("http://zzlab.net/GAPIT/GAPIT_Tutorial_Data.zip", destfile = "GAPIT_Tutorial_Data.zip")
  unzip("GAPIT_Tutorial_Data.zip")
}
download.file("http://zzlab.net/GAPIT/data/CROP545_Covariates.txt", destfile = "CROPS545_Covariates.txt")
download.file("http://zzlab.net/GAPIT/data/CROP545_Phenotype.txt", destfile = "CROPS545_Phenotype.txt")
# Import the GAPIT demo data genotypes
gt_scan <- data.frame(read.table("GAPIT_Tutorial_Data/mdp_numeric.txt", header = T, stringsAsFactors = F, sep = "\t", nrows = 1))
classes <- sapply(gt_scan, class)
genotypes <- data.frame(read.table("GAPIT_Tutorial_Data/mdp_numeric.txt", header = T, row.names = 1, colClasses = classes, stringsAsFactors = F, sep = "\t"))

GM <- read.table("GAPIT_Tutorial_Data/mdp_SNP_information.txt", header = T, stringsAsFactors = F, sep = "\t")
CV <- read.table("CROPS545_Covariates.txt", header = T, stringsAsFactors = F, sep = "\t")
phenotypes <- read.table("CROPS545_Phenotype.txt", header = T, stringsAsFactors = F, sep = "\t")


#Load dependencies
if (!require("pacman")) install.packages("pacman")
pacman::p_load(ggplot2,knitr,gridExtra,kableExtra,data.table)


#source("R/Function_qqplot.R")
#source("R/Function_manhattan_QTN.R")
#source("R/Function_manhattan.R")
#source("R/FUNCTION_powerFDRtype1error.R")

Question 3

Demo with QTN

#Create QTNs
n=nrow(genotypes)
m=ncol(genotypes)
NQTN=10
#Show the positions of the QTN
QTN.position=sample(m,NQTN,replace=F)
phenotypes_n_1=phenotypes[,2]
test=GWASapply(phenotypes_n_1,genotypes,CV,GM,PCA.M = 3,QTN.position,plots=TRUE,messages=TRUE)
hist(test$power.fdr.type1error.res$power)

Demo without QTN

phenotypes_n_1=phenotypes[,2]
loop_start <- proc.time()
test=GWASapply_rapid(phenotypes_n_1,genotypes,CV,GM,PCA.M = 3)
loop_end <- proc.time()
loop_elapsed <- loop_end[3] - loop_start[3]

timet=c()
garep=replicate(30,{
loop_start <- proc.time()
test=GWASapply_rapid(phenotypes_n_1,genotypes,CV,GM,PCA.M = 3)
loop_end <- proc.time()
loop_elapsed <- loop_end[3] - loop_start[3]
timet=c(time,loop_elapsed)
  })
mean(garep)
manhattan_plot(GM,test$P.value.res,cutoff=NULL, trait = "unknown",messages=TRUE)

Demo no plot

phenotypes_n_1=phenotypes[,2]
test=GWASapply(phenotypes_n_1,genotypes,CV,GM,PCA.M = 3,plots=FALSE,messages=TRUE)

Demo no plot nor CV

phenotypes_n_1=phenotypes[,2]
test=GWASapply(phenotypes_n_1,genotypes,GM=GM,PCA.M = 3,plot=FALSE,messages=TRUE)

Demo no plot nor CV

phenotypes_n_1=phenotypes[,2]
test=GWASapply(phenotypes_n_1,genotypes,GM=GM,PCA.M = 3,plot=FALSE,messages=TRUE)

Demo with all parameters, plots and messages and returns a csv, specified trait name

phenotypes_n_1=phenotypes[,2]
test=GWASapply(mySim$y,genotypes,CV,GM,PCA.M = 3,mySim$QTN.position,plot=TRUE,messages=TRUE,print=TRUE,trait="height")
test$power.fdr.type1error.res

Demo with all parameters, plots and messages and returns a csv, with a specified cutoff

phenotypes_n_1=phenotypes[,2]
mySim=G2P(genotypes, h2, alpha, NQTN, distribution)
test=GWASapply(mySim$y,genotypes,CV,GM,PCA.M = 3,mySim$QTN.position,plot=TRUE,messages=TRUE,print=FALSE,cutoff = NULL)
test$power.fdr.type1error.res$FP.fdr.power
test$power.fdr.type1error.res$TP.fdr.power
hist(test$power.fdr.type1error.res$power)

Question 5

Simulated Data

Manhattan Plots

source("http://www.zzlab.net/StaGen/2020/R/G2P.R")
source("http://www.zzlab.net/StaGen/2020/R/GWASbyCor.R")
NQTN = 10 #specify number of QTN
h2 = 0.75 #heritability
alpha = 0.6 #alpha value
distribution = "normal" #specify distribution
#set.seed(1337)
#plotting
  mySim=G2P(genotypes, h2, alpha, NQTN, distribution) #phenotype simulation
  p= GWASbyCor(X=genotypes,y=mySim$y)#GWAS by using the correlation methods
  p=data.frame(p)
  pres=power.fdr(p, QTN.position=mySim$QTN.position,cutoff=NULL)
  gcm=manhattan_plot(GM,p,cutoff=NULL,mySim$QTN.position,FP=pres$FP.fdr.power,TP=pres$TP.fdr.power, trait = "unknown",messages=TRUE)
  gcq=qq_plot(GM,p,mySim$QTN.position, trait = "unknown")
  test=GWASapply(mySim$y,genotypes,CV,GM,PCA.M = 3,QTN.position=mySim$QTN.position)
  gtm=manhattan_plot(GM,test$P.value.res,cutoff=NULL,mySim$QTN.position,FP=test$False.Positive,TP=test$True.Positive, trait = "unknown",messages=TRUE)
  gtq=qq_plot(GM,test$P.value.res,mySim$QTN.position, trait = "unknown")
  grid.arrange(gcm, gtm, nrow = 2, ncol = 1, top = "Variance of Principal Components")
  grid.arrange(gcq,gtq, nrow = 2, ncol = 1, top = "Variance of Principal Components")

Mean True and False Positives

gcrep=replicate (30,{
  mySim=G2P(genotypes, h2, alpha, NQTN, distribution) #phenotype simulation
  p= GWASbyCor(X=genotypes,y=mySim$y)#GWAS by using the correlation methods
  p=data.frame(p)
  pres=power.fdr(p, QTN.position=mySim$QTN.position,cutoff=NULL)
  FalP=length(pres$FP.fdr.power)
  TruP=length(pres$TP.fdr.power)
  num_output=c()
  num_output=c(FalP,TruP)
  })
View(gcrep)
print(rowMeans(gcrep))
# 5.16 (# QTN in top 10 p-value SNPs)  211.59 (# QTN with p-value lower than 7th highest)
print(apply(gcrep,1,sd))
garep=replicate(30,{
  mySim=G2P(genotypes, h2, alpha, NQTN, distribution)
  test_rep=GWASapply(mySim$y,genotypes,CV,GM,PCA.M = 3,QTN.position=mySim$QTN.position)
  FalP=length(test_rep$False.Positive)
  TruP=length(test_rep$True.Positive)
  num_output=c()
  num_output=c(FalP,TruP)
  })
print(rowMeans(garep))
# 5.16 (# QTN in top 10 p-value SNPs)  211.59 (# QTN with p-value lower than 7th highest)
print(apply(garep,1,sd))
h2 <- c(0.25, 0.5, 0.75, 1)
NQTN <- c(2, 10, 100)
rept <- 100 #replicate time
nsnp <- length(genotypes[1,])
for(i in 1:length(h2)){
  for(j in 1:length(NQTN)){
    assign(paste("h2.", i, ".NQTN.", j, ".GLM.power", sep=""), matrix(NA, nrow=nsnp, ncol=0))
    assign(paste("h2.", i, ".NQTN.", j, ".GLM.fdr", sep=""), matrix(NA, nrow=nsnp, ncol=0))

    assign(paste("h2.", i, ".NQTN.", j, ".Cor.power", sep=""), matrix(NA, nrow=nsnp, ncol=0))
    assign(paste("h2.", i, ".NQTN.", j, ".Cor.fdr", sep=""), matrix(NA, nrow=nsnp, ncol=0))

    for(r in 1:rept){
      mySim <- G2P(X=X1to5, h2=h2[i], alpha=1, NQTN=NQTN[j], distribution="normal")

      ###GWAS by GLM
      myGLM <- GWASapply(mySim$y,genotypes,CV,GM,PCA.M = 3,QTN.position=mySim$QTN.position)
      GLM.power <- get(paste("h2.", i, ".NQTN.", j, ".GLM.power", sep=""))
      GLM.power <- cbind(GLM.power, as.vector(myGLM$power.fdr.type1error$power))

      GLM.fdr <- get(paste("h2.", i, ".NQTN.", j, ".GLM.fdr", sep=""))
      GLM.fdr <- cbind(GLM.fdr, myGLM$power.fdr.type1error$fdr)

      assign(paste("h2.", i, ".NQTN.", j, ".GLM.power", sep=""), GLM.power)
      assign(paste("h2.", i, ".NQTN.", j, ".GLM.fdr", sep=""), GLM.fdr)

      ###GWASbyCor
      myP <- GWASbyCor(genotypes, mySim$y)
      order.SNP <- order(myP)
      Cor.pwrfdr <- power.fdr(order.SNP, mySim$QTN.position)

      Cor.power <- get(paste("h2.", i, ".NQTN.", j, ".Cor.power", sep=""))
      Cor.power <- cbind(Cor.power, Cor.pwrfdr$power)

      Cor.fdr <- get(paste("h2.", i, ".NQTN.", j, ".Cor.fdr", sep=""))
      Cor.fdr <- cbind(Cor.fdr, Cor.pwrfdr$fdr)

      assign(paste("h2.", i, ".NQTN.", j, ".Cor.power", sep=""), Cor.power)
      assign(paste("h2.", i, ".NQTN.", j, ".Cor.fdr", sep=""), Cor.fdr)
    }

  }
}  
par(mfrow=c(4, 3))
for(i in 1:length(h2)){
  for(j in 1:length(NQTN)){

    GLM.power <- get(paste("h2.", i, ".NQTN.", j, ".GLM.power", sep=""))
    GLM.fdr <- get(paste("h2.", i, ".NQTN.", j, ".GLM.fdr", sep=""))
    GLM.Y <- rowMeans(GLM.power)
    GLM.X <- rowMeans(GLM.fdr)

    Cor.power <- get(paste("h2.", i, ".NQTN.", j, ".Cor.power", sep=""))
    Cor.fdr <- get(paste("h2.", i, ".NQTN.", j, ".Cor.fdr", sep=""))
    Cor.Y <- rowMeans(Cor.power)
    Cor.X <- rowMeans(Cor.fdr)

    plot(GLM.X, GLM.Y, type="b", pch=1, col="red", xlim=c(0,1), ylim=c(0,1), xlab="fdr", ylab="power", main=paste("h2=", h2[i], "  NQTN=", NQTN[j], sep=""))
    lines(Cor.Y~Cor.X, type="b", pch=2, col="blue")
    legend("topleft", c("GLM", "Cor"), col=c("red", "blue"), lty=1, pch=c(1, 2), cex=0.8, bty="n")
  }
}
source("http://www.zzlab.net/StaGen/2020/R/G2P.R")
source("http://www.zzlab.net/StaGen/2020/R/GWASbyCor.R")
NQTN = 10 #specify number of QTN
h2 = 0.75 #heritability
alpha = 0.6 #alpha value
distribution = "normal" #specify distribution
#set.seed(1337)
#plotting
  mySim=G2P(genotypes, h2, alpha, NQTN, distribution) #phenotype simulation
  p= as.vector(GWASbyCor(X=genotypes,y=phenotypes_n_1))
  p= GWASbyCor(X=genotypes,y=phenotypes_n_1)#GWAS by using the correlation methods
  p=data.frame(p)
  pres=power.fdr(p, QTN.position=QTN.position,cutoff=NULL)
  manhattan_plot(GM,p,cutoff=NULL,QTN.position,FP=pres$FP.fdr.power,TP=pres$TP.fdr.power, trait = "unknown",messages=TRUE)
  qq_plot(GM,p,QTN.position, trait = "unknown")
#Replicating  
gcrep=replicate (30,{
  mySim=G2P(genotypes, h2, alpha, NQTN, distribution) #phenotype simulation
  p= as.vector(GWASbyCor(X=genotypes,y=phenotypes_n_1))
  p= GWASbyCor(X=genotypes,y=phenotypes_n_1)#GWAS by using the correlation methods
  p=data.frame(p)
  pres=power.fdr(p, QTN.position=QTN.position,cutoff=NULL)
  FalP=length(pres$FP.fdr.power)
  TruP=length(pres$TP.fdr.power)
  num_output=c()
  num_output=c(FalP,TruP)
  })
num_output=c()
for(i in 1:30){
  mySim=G2P(genotypes, h2, alpha, NQTN, distribution) #phenotype simulation
  p= as.vector(GWASbyCor(X=genotypes,y=phenotypes_n_1))
  p= GWASbyCor(X=genotypes,y=phenotypes_n_1)#GWAS by using the correlation methods
  p=data.frame(p)
  pres=power.fdr(p, QTN.position=QTN.position,cutoff=NULL)
  FalP=length(pres$FP.fdr.power)
  TruP=length(pres$TP.fdr.power)
  num_output=rbind(num_output,FalP,TruP)

}

View(gcrep)
print(rowMeans(gcrep))
# 5.16 (# QTN in top 10 p-value SNPs)  211.59 (# QTN with p-value lower than 7th highest)
print(apply(gcrep,1,sd))
num_output
View(num_output)
View(gcrep)
means=mean(gcrep)
stdevs=sd(gcrep)
out4 <- rbind(means,stdevs)
out4 <- as.data.frame(out4)
colnames(out4) <- c("QTNs in the Top 10")
rownames(out4)<-c("Averages","Standard Deviations")
kable(out4) 
NQTN = 10 #specify number of QTN
h2 = 0.75 #heritability
alpha = 0.6 #alpha value
distribution = "normal" #specify distribution
set.seed(1337)
n=nrow(genotypes)
m=ncol(genotypes)
NQTN=10
#Show the positions of the QTN
QTN.position=sample(m,NQTN,replace=F)

garep=replicate(30,{
  mySim=G2P(genotypes, h2, alpha, NQTN, distribution)
  test_rep=GWASapply(phenotypes_n_1,genotypes,CV,GM,PCA.M = 3,QTN.position=QTN.position)
  FalP=length(test_rep$False.Positive)
  TruP=length(test_rep$True.Positive)
  num_output=c()
  num_output=c(FalP,TruP)
  })
print(rowMeans(garep))
# 5.16 (# QTN in top 10 p-value SNPs)  211.59 (# QTN with p-value lower than 7th highest)
print(apply(garep,1,sd))


means=mean(garep)
stdevs=sd(garep)
out5 <- rbind(means,stdevs)
out5 <- as.data.frame(out5)
colnames(out5) <- c("True Positives")
rownames(out5)<-c("Averages","Standard Deviations")
kable(out5) 

T10r=index[1:10]
  detected=intersect(T10r,mySim$QTN.position)#find the position of top10 detected QTN
  N_detected= length(detected) # the number of QTNs among top ten associated SNPs
  N_output=c()
  N_output=c(N_detected)#combine the output into matrix
mean(test_rep$power.fdr.type1error.res$power)  
mean(test_rep$power.fdr.type1error.res$fdr) 
mean(test_rep$power.fdr.type1error.res$type1error) 
str(test_rep$P.value.res)
test_c=GWASapply(phenotypes_n_1,genotypes,CV,GM,PCA.M = 3,QTN.position=QTN.position, plot=FALSE,cutoff = 0.05)
test_c$True.Positive
test_c$False.Positive
test_rep$False.Positive
pdr=power.fdr(test_rep$order.SNP.res,QTN.position)
tes=fdr(test$P.value.res)
tes
mean(test$power.fdr.type1error.res$fdr)
fal.neg(test_rep$P.value.res,QTN.position,test_rep$cutoff.final.res)

This code can be used for The bonus question

Time Difference between GWAS with apply vs loop

#Time BLINK
loop_start <- proc.time()
test=GWAStest(phenotypes_n_1,genotypes,CV,GM,PCA.M = 3)
loop_end <- proc.time()
loop_elapsed <- loop_end[3] - loop_start[3]

#Time our function
apply_start <- proc.time()
test=GWASapply(phenotypes_n_1,genotypes,CV,GM,PCA.M = 3,QTN.position,plot=FALSE,messages=FALSE,print=FALSE,trait="height")
apply_end <- proc.time()
apply_elapsed <- apply_end[3] - apply_start[3]
apply_elapsed
# Consolidate times into a neat table
time_table <- data.frame(c("loop", "apply"), c(loop_elapsed, apply_elapsed))
names(time_table) <- c("Method", "Time(s)")
kable(time_table) %>%
  kable_styling(bootstrap_options = "striped", full_width = F)

# Calculate the percent improvement
perc_diff <- round((loop_elapsed - apply_elapsed) / loop_elapsed * 100,2)

print(paste("Apply is ", perc_diff , "% faster!"))


stp4freedom/HW4_GLM_GWAS documentation built on March 29, 2020, 6:49 p.m.